From 129986eb34d26c9322e7bbfc6475e009ed42bd81 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 29 Dec 2020 15:54:54 +0000 Subject: [PATCH 01/51] first try --- cookiecutter-template-BART/bart.rst | 80 + cookiecutter-template-BART/to_replace_bart.py | 212 +++ .../tokenization_bart.py | 51 + .../tokenization_fast_bart.py | 53 + src/transformers/commands/add_new_model.py | 30 +- src/transformers/models/bart/__init__.py | 12 +- .../models/bart/configuration_bart.py | 174 +- src/transformers/models/bart/modeling_bart.py | 355 ++-- src/transformers/models/old_bart/__init__.py | 39 + .../models/old_bart/configuration_bart.py | 230 +++ ..._original_pytorch_checkpoint_to_pytorch.py | 143 ++ .../models/old_bart/modeling_bart.py | 1520 +++++++++++++++++ .../models/old_bart/modeling_tf_bart.py | 1326 ++++++++++++++ .../models/old_bart/tokenization_bart.py | 99 ++ .../models/old_bart/tokenization_bart_fast.py | 92 + tests/test_modeling_bart.py | 540 +----- 16 files changed, 4105 insertions(+), 851 deletions(-) create mode 100644 cookiecutter-template-BART/bart.rst create mode 100644 cookiecutter-template-BART/to_replace_bart.py create mode 100644 cookiecutter-template-BART/tokenization_bart.py create mode 100644 cookiecutter-template-BART/tokenization_fast_bart.py mode change 100644 => 100755 src/transformers/models/bart/modeling_bart.py create mode 100644 src/transformers/models/old_bart/__init__.py create mode 100644 src/transformers/models/old_bart/configuration_bart.py create mode 100644 src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/old_bart/modeling_bart.py create mode 100644 src/transformers/models/old_bart/modeling_tf_bart.py create mode 100644 src/transformers/models/old_bart/tokenization_bart.py create mode 100644 src/transformers/models/old_bart/tokenization_bart_fast.py diff --git a/cookiecutter-template-BART/bart.rst b/cookiecutter-template-BART/bart.rst new file mode 100644 index 00000000000000..d8a4841a7db474 --- /dev/null +++ b/cookiecutter-template-BART/bart.rst @@ -0,0 +1,80 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +BART +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The BART model was proposed in ` +<>`__ by . + +The abstract from the paper is the following: + +** + +Tips: + + + +BartConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartConfig + :members: + + +BartTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.BartTokenizer + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +BartTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartTokenizerFast + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +BartModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartModel + :members: forward + + +BartForConditionalGeneration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartForConditionalGeneration + :members: forward + + +BartForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartForSequenceClassification + :members: forward + + +BartForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BartForQuestionAnswering + :members: forward + + diff --git a/cookiecutter-template-BART/to_replace_bart.py b/cookiecutter-template-BART/to_replace_bart.py new file mode 100644 index 00000000000000..437d98f6566875 --- /dev/null +++ b/cookiecutter-template-BART/to_replace_bart.py @@ -0,0 +1,212 @@ +## Copyright 2020 The HuggingFace Team. All rights reserved. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. + +## This file is made so that specific statements may be copied inside existing files. This is useful to copy +## import statements in __init__.py, or to complete model lists in the AUTO files. +## +## It is to be used as such: +## Put '# To replace in: "FILE_PATH"' in order to indicate the contents will be copied in the file at path FILE_PATH +## Put '# Below: "STATEMENT"' in order to copy the contents below **the first occurence** of that line in the file at FILE_PATH +## Put '# Replace with:' followed by the lines containing the content to define the content +## End a statement with '# End.'. If starting a new statement without redefining the FILE_PATH, it will continue pasting +## content in that file. +## +## Put '## COMMENT' to comment on the file. + + +# To replace in: "src/transformers/__init__.py" +# Below: "if is_torch_available():" if generating PyTorch +# Replace with: + + from .models.bart import ( + BART_PRETRAINED_MODEL_ARCHIVE_LIST, + BartForConditionalGeneration, + BartForQuestionAnswering, + BartForSequenceClassification, + BartModel, + ) +# End. + +# Below: "if is_tf_available():" if generating TensorFlow +# Replace with: + + from .models.bart import ( + TFBartForConditionalGeneration, + TFBartModel, + TFBartPreTrainedModel, + ) +# End. 
+ +# Below: "if is_tokenizers_available():" +# Replace with: + from .models.bart import BartTokenizerFast +# End. + +# Below: "from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig" +# Replace with: +from .models.bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig, BartTokenizer +# End. + + + +# To replace in: "src/transformers/models/auto/configuration_auto.py" +# Below: "# Add configs here" +# Replace with: + ("bart", BartConfig), +# End. + +# Below: "# Add archive maps here" +# Replace with: + BART_PRETRAINED_CONFIG_ARCHIVE_MAP, +# End. + +# Below: "from ..albert.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig", +# Replace with: +from ..bart.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig +# End. + +# Below: "# Add full (and cased) model names here" +# Replace with: + ("bart", "Bart"), +# End. + + + +# To replace in: "src/transformers/models/auto/modeling_auto.py" if generating PyTorch +# Below: "from .configuration_auto import (" +# Replace with: + BartConfig, +# End. + +# Below: "# Add modeling imports here" +# Replace with: +from ..bart.modeling_bart import ( + BartForConditionalGeneration, + BartForQuestionAnswering, + BartForSequenceClassification, + BartModel, +) +# End. + +# Below: "# Base model mapping" +# Replace with: + (BartConfig, BartModel), +# End. + +# Below: "# Model with LM heads mapping" +# Replace with: + + (BartConfig, BartForConditionalGeneration), +# End. + +# Below: "# Model for Causal LM mapping" +# Replace with: +# End. + +# Below: "# Model for Masked LM mapping" +# Replace with: +# End. + +# Below: "# Model for Sequence Classification mapping" +# Replace with: + (BartConfig, BartForSequenceClassification), +# End. + +# Below: "# Model for Question Answering mapping" +# Replace with: + (BartConfig, BartForQuestionAnswering), +# End. + +# Below: "# Model for Token Classification mapping" +# Replace with: +# End. + +# Below: "# Model for Multiple Choice mapping" +# Replace with: +# End. + +# Below: "# Model for Seq2Seq Causal LM mapping" +# Replace with: + + (BartConfig, BartForConditionalGeneration), +# End. + +# To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow +# Below: "from .configuration_auto import (" +# Replace with: + BartConfig, +# End. + +# Below: "# Add modeling imports here" +# Replace with: +from ..bart.modeling_tf_bart import ( + TFBartForConditionalGeneration, + TFBartModel, +) +# End. + +# Below: "# Base model mapping" +# Replace with: + (BartConfig, TFBartModel), +# End. + +# Below: "# Model with LM heads mapping" +# Replace with: + + (BartConfig, TFBartForConditionalGeneration), +# End. + +# Below: "# Model for Causal LM mapping" +# Replace with: +# End. + +# Below: "# Model for Masked LM mapping" +# Replace with: +# End. + +# Below: "# Model for Sequence Classification mapping" +# Replace with: +# End. + +# Below: "# Model for Question Answering mapping" +# Replace with: +# End. + +# Below: "# Model for Token Classification mapping" +# Replace with: +# End. + +# Below: "# Model for Multiple Choice mapping" +# Replace with: +# End. + +# Below: "# Model for Seq2Seq Causal LM mapping" +# Replace with: + + (BartConfig, TFBartForConditionalGeneration), +# End. + +# To replace in: "utils/check_repo.py" if generating PyTorch + +# Below: "models to ignore for model xxx mapping" +# Replace with: +"BartEncoder", + "BartDecoder", +# End. 
+ +# Below: "models to ignore for not tested" +# Replace with: +"BartEncoder", # Building part of bigger (tested) model. + "BartDecoder", # Building part of bigger (tested) model. +# End. diff --git a/cookiecutter-template-BART/tokenization_bart.py b/cookiecutter-template-BART/tokenization_bart.py new file mode 100644 index 00000000000000..a032d8b8b7ad56 --- /dev/null +++ b/cookiecutter-template-BART/tokenization_bart.py @@ -0,0 +1,51 @@ +# coding=utf-8 +# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for BART.""" +from ...utils import logging +from ..bart.tokenization_bart import BartTokenizer + + +logger = logging.get_logger(__name__) + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", + }, + "merges_file": { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", + }, + "tokenizer_file": { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/bart-large": 1024, +} + + +class BartTokenizer(BartTokenizer): + """ + Construct a BART tokenizer. + + :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end + tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning + parameters. + """ + + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/cookiecutter-template-BART/tokenization_fast_bart.py b/cookiecutter-template-BART/tokenization_fast_bart.py new file mode 100644 index 00000000000000..c25e7e91353259 --- /dev/null +++ b/cookiecutter-template-BART/tokenization_fast_bart.py @@ -0,0 +1,53 @@ +# coding=utf-8 +# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for BART.""" +from ...utils import logging +from ..bart.tokenization_bart_fast import BartTokenizerFast +from .tokenization_bart import BartTokenizer + + +logger = logging.get_logger(__name__) + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", + }, + "merges_file": { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", + }, + "tokenizer_file": { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/bart-large": 1024, +} + + +class BartTokenizerFast(BartTokenizerFast): + r""" + Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = BartTokenizer diff --git a/src/transformers/commands/add_new_model.py b/src/transformers/commands/add_new_model.py index 6b27e0b24c3fc7..d1b29e0af3ca8d 100644 --- a/src/transformers/commands/add_new_model.py +++ b/src/transformers/commands/add_new_model.py @@ -154,20 +154,20 @@ def remove_copy_lines(path): os.remove(f"{directory}/modeling_tf_{lowercase_model_name}.py") os.remove(f"{directory}/test_modeling_tf_{lowercase_model_name}.py") - shutil.move( - f"{directory}/{lowercase_model_name}.rst", - f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst", - ) - - shutil.move( - f"{directory}/tokenization_{lowercase_model_name}.py", - f"{model_dir}/tokenization_{lowercase_model_name}.py", - ) - - shutil.move( - f"{directory}/tokenization_fast_{lowercase_model_name}.py", - f"{model_dir}/tokenization_{lowercase_model_name}_fast.py", - ) +# shutil.move( +# f"{directory}/{lowercase_model_name}.rst", +# f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst", +# ) +# +# shutil.move( +# f"{directory}/tokenization_{lowercase_model_name}.py", +# f"{model_dir}/tokenization_{lowercase_model_name}.py", +# ) +# +# shutil.move( +# f"{directory}/tokenization_fast_{lowercase_model_name}.py", +# f"{model_dir}/tokenization_{lowercase_model_name}_fast.py", +# ) from os import fdopen, remove from shutil import copymode, move @@ -225,5 +225,5 @@ def replace_in_files(path_to_datafile): remove(path_to_datafile) - replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py") +# replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py") os.rmdir(directory) diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index 22acfebc2fbd77..28b59ce9688966 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -15,12 +15,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available -from .configuration_bart import BartConfig +from ...file_utils import is_torch_available, is_tokenizers_available +from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig from .tokenization_bart import BartTokenizer - if is_tokenizers_available(): from .tokenization_bart_fast import BartTokenizerFast @@ -31,9 +29,7 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartPretrainedModel, - PretrainedBartModel, + BartPreTrainedModel, ) -if is_tf_available(): - from .modeling_tf_bart import TFBartForConditionalGeneration, TFBartModel, TFBartPretrainedModel + diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 90a1ea780ad06d..59235db6eaf123 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. +# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BART configuration """ +""" BART model configuration """ from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -21,34 +21,34 @@ logger = logging.get_logger(__name__) BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/config.json", "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json", - "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/config.json", - "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/config.json", - "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/config.json", - "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json", - "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/config.json", + # See all BART models at https://huggingface.co/models?filter=bart } class BartConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to - instantiate a BART model according to the specified arguments, defining the model architecture. + This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. + It is used to instantiate an BART model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the BART `facebook/bart-large `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 
Args: vocab_size (:obj:`int`, `optional`, defaults to 50265): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel`. + Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel` or + :class:`~transformers.TFBartModel`. d_model (:obj:`int`, `optional`, defaults to 1024): Dimensionality of the layers and the pooler layer. encoder_layers (:obj:`int`, `optional`, defaults to 12): - Number of encoder layers, 6 are used for the `bart-base` model. + Number of encoder layers. decoder_layers (:obj:`int`, `optional`, defaults to 12): - Number of decoder layers, 6 are used for the `bart-base` model. + Number of decoder layers. encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): @@ -73,146 +73,90 @@ class BartConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm after embeddings. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): - Why not add another layernorm? - do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`False`): - Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder. - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`. - num_labels: (:obj:`int`, `optional`, defaults to 3): - The number of labels to use in :class:`~transformers.BartForSequenceClassification`. - is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model. - force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only - :obj:`True` for `bart-large-cnn`. 
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). + Example:: + + >>> from transformers import BartModel, BartConfig + + >>> # Initializing a BART facebook/bart-large style configuration + >>> configuration = BartConfig() + + >>> # Initializing a model from the facebook/bart-large style configuration + >>> model = BartModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config """ model_type = "bart" - keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, - activation_dropout=0.0, - extra_pos_embeddings=2, - activation_function="gelu", vocab_size=50265, - d_model=1024, - encoder_ffn_dim=4096, + max_position_embeddings=1024, encoder_layers=12, + encoder_ffn_dim=4096, encoder_attention_heads=16, - decoder_ffn_dim=4096, decoder_layers=12, + decoder_ffn_dim=4096, decoder_attention_heads=16, encoder_layerdrop=0.0, decoder_layerdrop=0.0, - attention_dropout=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, dropout=0.1, - max_position_embeddings=1024, + attention_dropout=0.0, + activation_dropout=0.0, init_std=0.02, + decoder_start_token_id=2, classifier_dropout=0.0, - num_labels=3, - is_encoder_decoder=True, - normalize_before=False, - add_final_layer_norm=False, - do_blenderbot_90_layernorm=False, scale_embedding=False, - normalize_embedding=True, - static_position_embeddings=False, - add_bias_logits=False, - force_bos_token_to_be_generated=False, - use_cache=True, + gradient_checkpointing=False, pad_token_id=1, bos_token_id=0, eos_token_id=2, - **common_kwargs + **kwargs ): - r""" - :class:`~transformers.BartConfig` is the configuration class for `BartModel`. 
- - Examples:: - - >>> from transformers import BartConfig, BartModel - - >>> config = BartConfig.from_pretrained('facebook/bart-large') - >>> model = BartModel(config) - - """ - if "hidden_size" in common_kwargs: - raise ValueError("hidden size is called d_model") super().__init__( - num_labels=num_labels, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, - **common_kwargs, + decoder_start_token_id=decoder_start_token_id, + **kwargs ) + self.vocab_size = vocab_size - self.d_model = d_model # encoder_embed_dim and decoder_embed_dim + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = self.num_hidden_layers = encoder_layers + self.encoder_layers = encoder_layers self.encoder_attention_heads = encoder_attention_heads - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop self.decoder_ffn_dim = decoder_ffn_dim self.decoder_layers = decoder_layers self.decoder_attention_heads = decoder_attention_heads - self.max_position_embeddings = max_position_embeddings - self.init_std = init_std # Normal(0, this parameter) - self.activation_function = activation_function - - # Params introduced for Mbart - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.normalize_embedding = normalize_embedding # True for mbart, False otherwise - self.normalize_before = normalize_before # combo of fairseq's encoder_ and decoder_normalize_before - self.add_final_layer_norm = add_final_layer_norm - - # Params introduced for Marian - self.add_bias_logits = add_bias_logits - self.static_position_embeddings = static_position_embeddings - - # 3 Types of Dropout + self.dropout = dropout self.attention_dropout = attention_dropout self.activation_dropout = activation_dropout - self.dropout = dropout - - # Classifier stuff + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop self.classifier_dropout = classifier_dropout - - # pos embedding offset - self.extra_pos_embeddings = extra_pos_embeddings - # bart has a hack that offsets positional embeddings by 2, other models don't do this - - self.force_bos_token_to_be_generated = force_bos_token_to_be_generated - - self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm - self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.gradient_checkpointing = gradient_checkpointing + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -220,11 +164,3 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model - - def is_valid_mbart(self) -> bool: - """Is the configuration aligned with the MBART paper.""" - if self.normalize_before and self.add_final_layer_norm and self.scale_embedding: - return True - if self.normalize_before or self.add_final_layer_norm or self.scale_embedding: - logger.info("This configuration is a mixture of MBART and BART settings") - return False diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py old mode 100644 new mode 100755 index f631736cc4380b..975812a3eecca8 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -1,5 +1,5 @@ # 
coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch BART model, ported from the fairseq repo.""" -import math +""" PyTorch BART model. """ + + import random -import warnings from typing import Optional, Tuple -import numpy as np import torch import torch.nn.functional as F from torch import nn @@ -52,32 +51,24 @@ BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/bart-base", "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "facebook/mbart-large-en-ro", + # See all BART models at https://huggingface.co/models?filter=bart ] -# This list is incomplete. See all BART models at https://huggingface.co/models?filter=bart -def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int): +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): """ - Shift input ids one token to the right, and wrap the last non pad token (usually ). + Shift input ids one token to the right. """ - prev_output_tokens = input_ids.clone() + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." # replace possible -100 values in labels by `pad_token_id` - prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id) - - index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) - decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze() - prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone() - prev_output_tokens[:, 0] = decoder_start_tokens + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - return prev_output_tokens + return shifted_input_ids def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): @@ -95,7 +86,9 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): +def _expand_mask( + mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None +): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ @@ -122,53 +115,14 @@ def BartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_a class BartLearnedPositionalEmbedding(nn.Embedding): """ - This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting - based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to - the forward function. + This module learns positional embeddings up to a fixed maximum size. 
""" - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset: int): - # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 - # and adjust num_embeddings appropriately. Other models dont have this hack - self.offset = offset + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings += offset + num_embeddings super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] - positions = torch.arange( - past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) - return super().forward(positions + self.offset) - - -class BartSinusoidalPositionalEmbedding(nn.Embedding): - """This module produces sinusoidal positional embeddings of any length.""" - - def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): - super().__init__(num_positions, embedding_dim) - self.weight = self._init_weight(self.weight) - - @staticmethod - def _init_weight(out: nn.Parameter): - """ - Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in - the 2nd half of the vector. [dim // 2:] - """ - n_pos, dim = out.shape - position_enc = np.array( - [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] - ) - out.requires_grad = False # set early to avoid an error in pytorch-1.8+ - sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 - out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) - out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) - out.detach_() - return out - - @torch.no_grad() def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): """`input_ids_shape` is expected to be [bsz x seqlen].""" bsz, seq_len = input_ids_shape[:2] @@ -321,7 +275,6 @@ def __init__(self, config: BartConfig): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.normalize_before = config.normalize_before self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -336,33 +289,35 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
""" residual = hidden_states - if self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) hidden_states, attn_weights, _ = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions ) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states - if self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): clamp_value = torch.finfo(hidden_states.dtype).max - 1000 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - return hidden_states, attn_weights + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs class BartDecoderLayer(nn.Module): @@ -379,7 +334,6 @@ def __init__(self, config: BartConfig): self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.normalize_before = config.normalize_before self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) self.encoder_attn = BartAttention( @@ -400,7 +354,8 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[torch.Tensor] = False, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, ): """ Args: @@ -411,11 +366,10 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
""" residual = hidden_states - if self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) # Self Attention # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 @@ -429,16 +383,13 @@ def forward( ) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn_layer_norm(hidden_states) # Cross-Attention Block cross_attn_present_key_value = None cross_attn_weights = None if encoder_hidden_states is not None: residual = hidden_states - if self.normalize_before: - hidden_states = self.encoder_attn_layer_norm(hidden_states) # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None @@ -451,30 +402,29 @@ def forward( ) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.encoder_attn_layer_norm(hidden_states) + hidden_states = self.encoder_attn_layer_norm(hidden_states) # add cross-attn to positions 3,4 of present_key_value tuple present_key_value = present_key_value + cross_attn_present_key_value # Fully Connected residual = hidden_states - if self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - - return ( - hidden_states, - self_attn_weights, - present_key_value, - cross_attn_weights, - ) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs class BartClassificationHead(nn.Module): @@ -501,7 +451,7 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -class BartPretrainedModel(PreTrainedModel): +class BartPreTrainedModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" @@ -511,8 +461,6 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, BartSinusoidalPositionalEmbedding): - pass elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: @@ -529,14 +477,6 @@ def dummy_inputs(self): return dummy_inputs -class PretrainedBartModel(BartPretrainedModel): - def __init_subclass__(self): - warnings.warn( - "The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.", - FutureWarning, - ) - - BART_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, @@ -547,7 +487,8 @@ def __init_subclass__(self): general usage and behavior. 
Parameters: - config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.BartConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. @@ -558,9 +499,8 @@ def __init_subclass__(self): >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig - >>> # see ``examples/summarization/bart/run_eval.py`` for a longer example - >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') - >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') @@ -635,7 +575,7 @@ def __init_subclass__(self): """ -class BartEncoder(BartPretrainedModel): +class BartEncoder(BartPreTrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a :class:`BartEncoderLayer`. @@ -652,30 +592,22 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No self.layerdrop = config.encoder_layerdrop embed_dim = config.d_model - self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens else: self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) - if config.static_position_embeddings: - self.embed_positions = BartSinusoidalPositionalEmbedding( - config.max_position_embeddings, embed_dim, self.padding_idx - ) - else: - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - self.padding_idx, - config.extra_pos_embeddings, - ) + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layernorm_embedding = BartLayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() - # mbart has one extra layer_norm - self.layer_norm = BartLayerNorm(config.d_model) if config.add_final_layer_norm else None + self.layernorm_embedding = BartLayerNorm(embed_dim) self.init_weights() @@ -758,15 +690,28 @@ def forward( # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = random.uniform(0, 1) if self.training and (dropout_probability < self.layerdrop): # skip the layer - attn = None + layer_outputs = (None, None) else: - hidden_states, attn = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + if getattr(self.config, "gradient_checkpointing", False): - if output_attentions: - all_attentions = all_attentions + (attn,) + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) - if self.layer_norm: - hidden_states = self.layer_norm(hidden_states) + return custom_forward + + 
layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: encoder_states = encoder_states + (hidden_states,) @@ -778,7 +723,7 @@ def forward( ) -class BartDecoder(BartPretrainedModel): +class BartDecoder(BartPreTrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`BartDecoderLayer` @@ -791,7 +736,6 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No super().__init__(config) self.dropout = config.dropout self.layerdrop = config.decoder_layerdrop - self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm # layernorm variant self.padding_idx = config.pad_token_id self.max_target_positions = config.max_position_embeddings self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 @@ -801,20 +745,13 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No else: self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) - if config.static_position_embeddings: - self.embed_positions = BartSinusoidalPositionalEmbedding( - config.max_position_embeddings, config.d_model, config.pad_token_id - ) - else: - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - self.padding_idx, - config.extra_pos_embeddings, - ) + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + ) self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layernorm_embedding = BartLayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() - self.layer_norm = BartLayerNorm(config.d_model) if config.add_final_layer_norm else None + self.layernorm_embedding = BartLayerNorm(config.d_model) self.init_weights() @@ -913,33 +850,6 @@ def forward( input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length ).to(self.device) - # create decoder_padding_mask if not provided and needed - # 4.12.20 (PVP): Not a fan of this "magical" function that - # automatically creates attention_mask for padded tokens - # => this is inconsistent with other models - # => Pegasus uses the pad_token as decoder_start_token_id, so that this could - # pose some problems. 
- if ( - attention_mask is None - and input_ids is not None - and input_shape[-1] > 1 - and self.config.pad_token_id in input_ids - ): - # should be kept for backwards compatibility - attention_mask = input_ids.ne(self.config.pad_token_id).to(torch.long) - # never mask leading token, even if it is pad - attention_mask[:, 0] = attention_mask[:, 1] - if past_key_values_length > 0: - attention_mask = torch.cat( - [ - torch.ones( - (input_shape[0], past_key_values_length), dtype=torch.long, device=input_ids.device - ), - attention_mask, - ], - dim=-1, - ) - if attention_mask is not None and combined_attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] combined_attention_mask = combined_attention_mask + _expand_mask( @@ -954,12 +864,8 @@ def forward( # embed positions positions = self.embed_positions(input_shape, past_key_values_length) - if self.do_blenderbot_90_layernorm: - hidden_states = self.layernorm_embedding(inputs_embeds) - hidden_states += positions - else: - hidden_states = inputs_embeds + positions - hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) @@ -978,30 +884,51 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None - hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer( - hidden_states, - attention_mask=combined_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_value=past_key_value, - output_attentions=output_attentions, - ) + if getattr(self.config, "gradient_checkpointing", False): + if use_cache: + raise ValueError( + "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`." 
+ ) + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (present_key_value,) + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) if output_attentions: - all_self_attns += (layer_self_attn,) - all_cross_attentions += (layer_cross_attn,) + all_self_attns += (layer_outputs[1],) + all_cross_attentions += (layer_outputs[2],) # add hidden states from the last decoder layer if output_hidden_states: all_hidden_states += (hidden_states,) - # if config.add_final_layer_norm (mBART) - if self.layer_norm: - hidden_states = self.layer_norm(hidden_states) - next_cache = next_decoder_cache if use_cache else None if not return_dict: return tuple( @@ -1022,7 +949,7 @@ def forward( "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING, ) -class BartModel(BartPretrainedModel): +class BartModel(BartPreTrainedModel): def __init__(self, config: BartConfig): super().__init__(config) @@ -1070,14 +997,6 @@ def forward( output_hidden_states=None, return_dict=None, ): - - # 4.12.20 (PVP): Not a fan of this "magical" function and - # also wonder how often it's actually used ... keep now - # for backward compatibility - # -> is this used for backward compatibility - if decoder_input_ids is None and decoder_inputs_embeds is None: - decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id) - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1134,7 +1053,7 @@ def forward( @add_start_docstrings( "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING ) -class BartForConditionalGeneration(BartPretrainedModel): +class BartForConditionalGeneration(BartPreTrainedModel): base_model_prefix = "model" _keys_to_ignore_on_load_missing = [ r"final_logits_bias", @@ -1206,7 +1125,6 @@ def forward( Conditional generation example:: - >>> # Mask filling only works for bart-large >>> from transformers import BartTokenizer, BartForConditionalGeneration >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') >>> TXT = "My friends are but they eat too many carbs." 
@@ -1220,14 +1138,12 @@ def forward( >>> values, predictions = probs.topk(5) >>> tokenizer.decode(predictions).split() - >>> # ['good', 'great', 'all', 'really', 'very'] """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: - use_cache = False if decoder_input_ids is None: - decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id) + decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) outputs = self.model( input_ids, @@ -1248,7 +1164,6 @@ def forward( masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() - # TODO(SS): do we need to ignore pad tokens in labels? masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: @@ -1283,18 +1198,6 @@ def prepare_inputs_for_generation( "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } - def adjust_logits_during_generation(self, logits, cur_len, max_length): - if cur_len == 1 and self.config.force_bos_token_to_be_generated: - self._force_token_id_to_be_generated(logits, self.config.bos_token_id) - elif cur_len == max_length - 1 and self.config.eos_token_id is not None: - self._force_token_id_to_be_generated(logits, self.config.eos_token_id) - return logits - - @staticmethod - def _force_token_id_to_be_generated(scores, token_id) -> None: - """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" - scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") - @staticmethod def _reorder_cache(past, beam_idx): reordered_past = () @@ -1310,7 +1213,7 @@ def _reorder_cache(past, beam_idx): """, BART_START_DOCSTRING, ) -class BartForSequenceClassification(BartPretrainedModel): +class BartForSequenceClassification(BartPreTrainedModel): def __init__(self, config: BartConfig, **kwargs): super().__init__(config, **kwargs) self.model = BartModel(config) @@ -1412,7 +1315,7 @@ def forward( """, BART_START_DOCSTRING, ) -class BartForQuestionAnswering(BartPretrainedModel): +class BartForQuestionAnswering(BartPreTrainedModel): def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/old_bart/__init__.py b/src/transformers/models/old_bart/__init__.py new file mode 100644 index 00000000000000..22acfebc2fbd77 --- /dev/null +++ b/src/transformers/models/old_bart/__init__.py @@ -0,0 +1,39 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available +from .configuration_bart import BartConfig +from .tokenization_bart import BartTokenizer + + +if is_tokenizers_available(): + from .tokenization_bart_fast import BartTokenizerFast + +if is_torch_available(): + from .modeling_bart import ( + BART_PRETRAINED_MODEL_ARCHIVE_LIST, + BartForConditionalGeneration, + BartForQuestionAnswering, + BartForSequenceClassification, + BartModel, + BartPretrainedModel, + PretrainedBartModel, + ) + +if is_tf_available(): + from .modeling_tf_bart import TFBartForConditionalGeneration, TFBartModel, TFBartPretrainedModel diff --git a/src/transformers/models/old_bart/configuration_bart.py b/src/transformers/models/old_bart/configuration_bart.py new file mode 100644 index 00000000000000..90a1ea780ad06d --- /dev/null +++ b/src/transformers/models/old_bart/configuration_bart.py @@ -0,0 +1,230 @@ +# coding=utf-8 +# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BART configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/config.json", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/config.json", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/config.json", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/config.json", + "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/config.json", +} + + +class BartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to + instantiate a BART model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel`. + d_model (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of encoder layers, 6 are used for the `bart-base` model. 
+        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of decoder layers, 6 are used for the `bart-base` model.
+        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the classifier.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            This should be completed, specific to marian.
+        normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Call layernorm before attention ops.
+        normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Call layernorm after embeddings.
+        static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Don't learn positional embeddings, use sinusoidal.
+        add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to add a final layernorm after the last encoder/decoder layer (used by mBART).
+        do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder.
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Scale embeddings by dividing by sqrt(d_model).
+        eos_token_id (:obj:`int`, `optional`, defaults to 2):
+            End of stream token id.
+        pad_token_id (:obj:`int`, `optional`, defaults to 1):
+            Padding token id.
+        bos_token_id (:obj:`int`, `optional`, defaults to 0):
+            Beginning of stream token id.
+        encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the `LayerDrop paper <https://arxiv.org/abs/1909.11556>`__ for more details.
+        decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the `LayerDrop paper <https://arxiv.org/abs/1909.11556>`__ for more details.
+        extra_pos_embeddings (:obj:`int`, `optional`, defaults to 2):
+            How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`.
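To make the ``extra_pos_embeddings`` offset concrete, a toy illustration (not library code) of how a learned positional embedding table with an offset of 2 resolves position ids, mirroring the ``BartLearnedPositionalEmbedding`` logic that appears later in this patch::

    import torch
    from torch import nn

    offset, max_positions, d_model = 2, 1024, 16  # toy d_model for readability
    table = nn.Embedding(max_positions + offset, d_model)  # reserves `offset` extra rows

    seq_len = 5
    positions = torch.arange(seq_len) + offset  # tensor([2, 3, 4, 5, 6])
    pos_embeds = table(positions)               # shape: (5, 16)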
+ num_labels: (:obj:`int`, `optional`, defaults to 3): + The number of labels to use in :class:`~transformers.BartForSequenceClassification`. + is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether this is an encoder/decoder model. + force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only + :obj:`True` for `bart-large-cnn`. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). + """ + model_type = "bart" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + activation_dropout=0.0, + extra_pos_embeddings=2, + activation_function="gelu", + vocab_size=50265, + d_model=1024, + encoder_ffn_dim=4096, + encoder_layers=12, + encoder_attention_heads=16, + decoder_ffn_dim=4096, + decoder_layers=12, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + attention_dropout=0.0, + dropout=0.1, + max_position_embeddings=1024, + init_std=0.02, + classifier_dropout=0.0, + num_labels=3, + is_encoder_decoder=True, + normalize_before=False, + add_final_layer_norm=False, + do_blenderbot_90_layernorm=False, + scale_embedding=False, + normalize_embedding=True, + static_position_embeddings=False, + add_bias_logits=False, + force_bos_token_to_be_generated=False, + use_cache=True, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **common_kwargs + ): + r""" + :class:`~transformers.BartConfig` is the configuration class for `BartModel`. + + Examples:: + + >>> from transformers import BartConfig, BartModel + + >>> config = BartConfig.from_pretrained('facebook/bart-large') + >>> model = BartModel(config) + + """ + if "hidden_size" in common_kwargs: + raise ValueError("hidden size is called d_model") + super().__init__( + num_labels=num_labels, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + **common_kwargs, + ) + self.vocab_size = vocab_size + self.d_model = d_model # encoder_embed_dim and decoder_embed_dim + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = self.num_hidden_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.max_position_embeddings = max_position_embeddings + self.init_std = init_std # Normal(0, this parameter) + self.activation_function = activation_function + + # Params introduced for Mbart + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.normalize_embedding = normalize_embedding # True for mbart, False otherwise + self.normalize_before = normalize_before # combo of fairseq's encoder_ and decoder_normalize_before + self.add_final_layer_norm = add_final_layer_norm + + # Params introduced for Marian + self.add_bias_logits = add_bias_logits + self.static_position_embeddings = static_position_embeddings + + # 3 Types of Dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.dropout = dropout + + # Classifier stuff + self.classifier_dropout = classifier_dropout + + # pos embedding offset + self.extra_pos_embeddings = extra_pos_embeddings + 
# bart has a hack that offsets positional embeddings by 2, other models don't do this + + self.force_bos_token_to_be_generated = force_bos_token_to_be_generated + + self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm + + self.use_cache = use_cache + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + + def is_valid_mbart(self) -> bool: + """Is the configuration aligned with the MBART paper.""" + if self.normalize_before and self.add_final_layer_norm and self.scale_embedding: + return True + if self.normalize_before or self.add_final_layer_norm or self.scale_embedding: + logger.info("This configuration is a mixture of MBART and BART settings") + return False diff --git a/src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 00000000000000..8978b8b2e57f45 --- /dev/null +++ b/src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BART checkpoint.""" + + +import argparse +import os +from pathlib import Path + +import fairseq +import torch +from packaging import version + +from transformers import ( + BartConfig, + BartForConditionalGeneration, + BartForSequenceClassification, + BartModel, + BartTokenizer, +) +from transformers.models.bart.modeling_bart import _make_linear_from_emb +from transformers.utils import logging + + +FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] +extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} +if version.parse(fairseq.__version__) < version.parse("0.9.0"): + raise Exception("requires fairseq >= 0.9.0") + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +SAMPLE_TEXT = " Hello world! 
cécé herlolip" + +mnli_rename_keys = [ + ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), + ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), + ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), + ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), +] + + +def remove_ignore_keys_(state_dict): + ignore_keys = [ + "encoder.version", + "decoder.version", + "model.encoder.version", + "model.decoder.version", + "_float_tensor", + ] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def load_xsum_checkpoint(checkpoint_path): + """Checkpoint path should end in model.pt""" + sd = torch.load(checkpoint_path, map_location="cpu") + hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() + hub_interface.model.load_state_dict(sd["model"]) + return hub_interface + + +@torch.no_grad() +def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): + """ + Copy/paste/tweak model's weights to our BERT structure. + """ + if not os.path.exists(checkpoint_path): + bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() + else: + bart = load_xsum_checkpoint(checkpoint_path) + + bart.model.upgrade_state_dict(bart.model.state_dict()) + if hf_checkpoint_name is None: + hf_checkpoint_name = checkpoint_path.replace(".", "-") + config = BartConfig.from_pretrained(hf_checkpoint_name) + tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) + tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) + assert torch.eq(tokens, tokens2).all() + + if checkpoint_path == "bart.large.mnli": + state_dict = bart.state_dict() + remove_ignore_keys_(state_dict) + state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] + for src, dest in mnli_rename_keys: + rename_key(state_dict, src, dest) + model = BartForSequenceClassification(config).eval() + model.load_state_dict(state_dict) + fairseq_output = bart.predict("mnli", tokens, return_logits=True) + new_model_outputs = model(tokens)[0] # logits + else: # no classification heads to worry about + state_dict = bart.model.state_dict() + remove_ignore_keys_(state_dict) + state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] + fairseq_output = bart.extract_features(tokens) + if hf_checkpoint_name == "facebook/bart-large": + model = BartModel(config).eval() + model.load_state_dict(state_dict) + new_model_outputs = model(tokens).model[0] + else: + model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt + model.model.load_state_dict(state_dict) + if hasattr(model, "lm_head"): + model.lm_head = _make_linear_from_emb(model.model.shared) + new_model_outputs = model.model(tokens)[0] + + # Check results + assert fairseq_output.shape == new_model_outputs.shape + assert (fairseq_output == new_model_outputs).all().item() + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." 
+ ) + parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument( + "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" + ) + args = parser.parse_args() + convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) diff --git a/src/transformers/models/old_bart/modeling_bart.py b/src/transformers/models/old_bart/modeling_bart.py new file mode 100644 index 00000000000000..f631736cc4380b --- /dev/null +++ b/src/transformers/models/old_bart/modeling_bart.py @@ -0,0 +1,1520 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BART model, ported from the fairseq repo.""" +import math +import random +import warnings +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BartConfig" +_TOKENIZER_FOR_DOC = "BartTokenizer" + + +BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/bart-base", + "facebook/bart-large", + "facebook/bart-large-mnli", + "facebook/bart-large-cnn", + "facebook/bart-large-xsum", + "facebook/mbart-large-en-ro", +] +# This list is incomplete. See all BART models at https://huggingface.co/models?filter=bart + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int): + """ + Shift input ids one token to the right, and wrap the last non pad token (usually ). + """ + prev_output_tokens = input_ids.clone() + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id) + + index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) + decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze() + prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone() + prev_output_tokens[:, 0] = decoder_start_tokens + + return prev_output_tokens + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. 
+ """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +def BartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): + if torch.cuda.is_available(): + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) + + +class BartLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting + based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to + the forward function. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset: int): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models dont have this hack + self.offset = offset + assert padding_idx is not None, "`padding_idx` should not be None, but of type int" + num_embeddings += offset + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions + self.offset) + + +class BartSinusoidalPositionalEmbedding(nn.Embedding): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__(num_positions, embedding_dim) + self.weight = self._init_weight(self.weight) + + @staticmethod + def _init_weight(out: nn.Parameter): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. 
[dim // 2:] + """ + n_pos, dim = out.shape + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + out.requires_grad = False # set early to avoid an error in pytorch-1.8+ + sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 + out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + return out + + @torch.no_grad() + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +class BartAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
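+            # Each cached tensor has shape (bsz, num_heads, seq_len, head_dim); new key/value
+            # states are concatenated onto the cache along dim=2, the sequence axis.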
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class BartEncoderLayer(nn.Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.normalize_before = config.normalize_before + self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = BartLayerNorm(self.embed_dim) + + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, 
src_len)` where padding elements are indicated by very large negative values. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function. + """ + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + if self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + return hidden_states, attn_weights + + +class BartDecoderLayer(nn.Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.normalize_before = config.normalize_before + + self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) + self.encoder_attn = BartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = BartLayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = BartLayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[torch.Tensor] = False, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`): Whether the base model outputs attentions. 
This requires the attentions tensor to be reshaped in this function. + """ + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + if self.normalize_before: + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + if self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + + return ( + hidden_states, + self_attn_weights, + present_key_value, + cross_attn_weights, + ) + + +class BartClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class BartPretrainedModel(PreTrainedModel): + config_class = BartConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + 
if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, BartSinusoidalPositionalEmbedding): + pass + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +class PretrainedBartModel(BartPretrainedModel): + def __init_subclass__(self): + warnings.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.", + FutureWarning, + ) + + +BART_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +BART_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig + + >>> # see ``examples/summarization/bart/run_eval.py`` for a longer example + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) +""" + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. 
By default, the model will create this tensor by + shifting the :obj:`input_ids` to the right, following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_bart._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class BartEncoder(BartPretrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. 
Each layer is a + :class:`BartEncoderLayer`. + + Args: + config: BartConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + if config.static_position_embeddings: + self.embed_positions = BartSinusoidalPositionalEmbedding( + config.max_position_embeddings, embed_dim, self.padding_idx + ) + else: + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + config.extra_pos_embeddings, + ) + self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = BartLayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() + # mbart has one extra layer_norm + self.layer_norm = BartLayerNorm(config.d_model) if config.add_final_layer_norm else None + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + attn = None + else: + hidden_states, attn = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + + if output_attentions: + all_attentions = all_attentions + (attn,) + + if self.layer_norm: + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BartDecoder(BartPretrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`BartDecoderLayer` + + Args: + config: BartConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm # layernorm variant + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + if config.static_position_embeddings: + self.embed_positions = BartSinusoidalPositionalEmbedding( + config.max_position_embeddings, config.d_model, config.pad_token_id + ) + else: + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + config.extra_pos_embeddings, + ) + self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = BartLayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() + self.layer_norm = BartLayerNorm(config.d_model) if config.add_final_layer_norm else None + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. 
+ + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + # create decoder_padding_mask if not provided and needed + # 4.12.20 (PVP): Not a fan of this "magical" function that + # automatically creates attention_mask for padded tokens + # => this is inconsistent with other models + # => Pegasus uses the pad_token as decoder_start_token_id, so that this could + # pose some problems. 
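+        # Concretely: for decoder_input_ids = [[1, 5, 6, 1, 1]] with pad_token_id = 1, the mask
+        # built below starts as [[0, 1, 1, 0, 0]] and then becomes [[1, 1, 1, 0, 0]] once the
+        # first position is copied from its neighbour.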
+ if ( + attention_mask is None + and input_ids is not None + and input_shape[-1] > 1 + and self.config.pad_token_id in input_ids + ): + # should be kept for backwards compatibility + attention_mask = input_ids.ne(self.config.pad_token_id).to(torch.long) + # never mask leading token, even if it is pad + attention_mask[:, 0] = attention_mask[:, 1] + if past_key_values_length > 0: + attention_mask = torch.cat( + [ + torch.ones( + (input_shape[0], past_key_values_length), dtype=torch.long, device=input_ids.device + ), + attention_mask, + ], + dim=-1, + ) + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + if self.do_blenderbot_90_layernorm: + hidden_states = self.layernorm_embedding(inputs_embeds) + hidden_states += positions + else: + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_decoder_cache = () if use_cache else None + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + + if use_cache: + next_decoder_cache += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + all_cross_attentions += (layer_cross_attn,) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # if config.add_final_layer_norm (mBART) + if self.layer_norm: + hidden_states = self.layer_norm(hidden_states) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare BART Model outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class BartModel(BartPretrainedModel): + def __init__(self, config: 
BartConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = BartEncoder(config, self.shared) + self.decoder = BartDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/bart-large", + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + # 4.12.20 (PVP): Not a fan of this "magical" function and + # also wonder how often it's actually used ... keep now + # for backward compatibility + # -> is this used for backward compatibility + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + 
encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
+)
+class BartForConditionalGeneration(BartPretrainedModel):
+    base_model_prefix = "model"
+    _keys_to_ignore_on_load_missing = [
+        r"final_logits_bias",
+        r"encoder\.version",
+        r"decoder\.version",
+        r"lm_head\.weight",
+    ]
+
+    def __init__(self, config: BartConfig):
+        super().__init__(config)
+        self.model = BartModel(config)
+        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
+        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
+
+        self.init_weights()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
+        new_embeddings = super().resize_token_embeddings(new_num_tokens)
+        self._resize_final_logits_bias(new_num_tokens)
+        return new_embeddings
+
+    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
+        old_num_tokens = self.final_logits_bias.shape[-1]
+        if new_num_tokens <= old_num_tokens:
+            new_bias = self.final_logits_bias[:, :new_num_tokens]
+        else:
+            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
+            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
+        self.register_buffer("final_logits_bias", new_bias)
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    @add_end_docstrings(BART_GENERATION_EXAMPLE)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
+            config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
+            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+
+        Returns:
+
+        Conditional generation example::
+
+            >>> # Mask filling only works for bart-large
+            >>> from transformers import BartTokenizer, BartForConditionalGeneration
+            >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
+            >>> TXT = "My friends are <mask> but they eat too many carbs."
+ + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + >>> # ['good', 'great', 'all', 'really', 'very'] + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + use_cache = False + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # TODO(SS): do we need to ignore pad tokens in labels? + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == 1 and self.config.force_bos_token_to_be_generated: + self._force_token_id_to_be_generated(logits, self.config.bos_token_id) + elif cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_id_to_be_generated(logits, self.config.eos_token_id) + return logits + + @staticmethod + def _force_token_id_to_be_generated(scores, token_id) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" + scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. + """, + BART_START_DOCSTRING, +) +class BartForSequenceClassification(BartPretrainedModel): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + self.classification_head = BartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/bart-large", + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BART_START_DOCSTRING, +) +class BartForQuestionAnswering(BartPretrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = BartModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.model._init_weights(self.qa_outputs) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/bart-large", + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) diff --git a/src/transformers/models/old_bart/modeling_tf_bart.py b/src/transformers/models/old_bart/modeling_tf_bart.py new file mode 100644 index 00000000000000..03c24c209c5a1c --- /dev/null +++ b/src/transformers/models/old_bart/modeling_tf_bart.py @@ -0,0 +1,1326 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
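+# Note on masking, used throughout this file: attention masks are converted into additive masks, where
+# positions that may be attended to contribute 0.0 and masked positions contribute LARGE_NEGATIVE (-1e8),
+# which is added to the raw attention scores right before the softmax. For example, a padding mask
+# [1, 1, 0] becomes [0.0, 0.0, -1e8] once `_expand_mask` below has broadcast it to [bsz, 1, tgt_len, src_len].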
+"""TF BART model, ported from the fairseq repo.""" + +import math +import random +import warnings +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPast, + TFSeq2SeqLMOutput, + TFSeq2SeqModelOutput, +) + +# Public API +from ...modeling_tf_utils import ( + DUMMY_INPUTS, + TFPreTrainedModel, + TFSharedEmbeddings, + TFWrappedEmbeddings, + input_processing, + keras_serializable, + shape_list, +) +from ...utils import logging +from .configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BartConfig" +_TOKENIZER_FOR_DOC = "BartTokenizer" + +LARGE_NEGATIVE = -1e8 + + +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, eos_token_id: int): + shifted_input_ids = tf.cast(input_ids, tf.int32) + shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1) + start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), eos_token_id) + shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.cast(0, tf.int32)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = tf.ones((tgt_len, tgt_len), dtype=tf.float32) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + mask = tf.cast(mask, tf.float32) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) + return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = shape_list(mask) + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + + return (1.0 - expanded_mask) * LARGE_NEGATIVE + + +class TFBartLearnedPositionalEmbedding(TFSharedEmbeddings): + """ + This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting + based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to + the forward function. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset, **kwargs): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. 
Other models dont have this hack + self.offset = offset + assert padding_idx is not None, "padding_idx cannot be None" + num_embeddings += offset + super().__init__(num_embeddings, embedding_dim, **kwargs) + + def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0): + """Input is expected to be of size [bsz x seqlen].""" + bsz, seq_len = input_shape[:2] + + positions = tf.range( + past_key_values_length, seq_len + past_key_values_length, delta=1, dtype=tf.int32, name="range" + ) + return super().call(positions + self.offset) # super object is not callable for some reason + + +class TFBartSinusoidalPositionalEmbedding(tf.keras.layers.Embedding): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, **kwargs): + + if embedding_dim % 2 != 0: + raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported") + super().__init__( + num_positions, + embedding_dim, + **kwargs, + ) + + def build(self, input_shape: tf.TensorShape): + """ + Build shared token embedding layer Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + super().build(input_shape) # Instantiates self.weight so it can be loaded + weight: np.ndarray = self._init_weight(self.input_dim, self.output_dim) + self.set_weights([weight]) # overwrite self.weight to correct value + + @staticmethod + def _init_weight(n_pos: int, dim: int): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. [dim // 2:] + """ + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + # index 0 is all zero + position_enc[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2]) + position_enc[:, dim // 2 :] = np.cos(position_enc[:, 1::2]) + # convert to tensor + table = tf.convert_to_tensor(position_enc, dtype=tf.float32) + tf.stop_gradient(table) + return table + + def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0): + """Input is expected to be of size [bsz x seqlen].""" + bsz, seq_len = input_shape[:2] + + positions = tf.range( + past_key_values_length, seq_len + past_key_values_length, delta=1, dtype=tf.int32, name="range" + ) + return super().call(positions) + + +class TFBartAttention(tf.keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = tf.keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + 
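+    # Shape walk-through with illustrative values (bsz=2, seq_len=5, embed_dim=1024, num_heads=16, head_dim=64):
+    # `_shape` turns a (2, 5, 1024) projection into (2, 16, 5, 64) so that each head attends independently;
+    # `call` below flattens that to (2 * 16, 5, 64) for a single batched matmul against the keys/values and
+    # reshapes the result back to (2, 5, 1024) before `out_proj`.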
+ def call( + self, + hidden_states: tf.Tensor, + key_value_states: Optional[tf.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[tf.Tensor]]] = None, + attention_mask: Optional[tf.Tensor] = None, + training=False, + ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {shape_list(attn_weights)}", + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {shape_list(attention_mask)}", + ) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = tf.nn.softmax(attn_weights, axis=-1) + + attn_probs = self.dropout(attn_weights, training=training) + + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {shape_list(attn_output)}", + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + +class TFBartEncoderLayer(tf.keras.layers.Layer): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBartAttention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" + ) + self.normalize_before = config.normalize_before + self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + + def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, training=False): + """ + Args: + hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
+ """ + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, self_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask + ) + tf.debugging.assert_equal( + shape_list(hidden_states), + shape_list(residual), + message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + if self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, self_attn_weights + + +class TFBartDecoderLayer(tf.keras.layers.Layer): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + name="self_attn", + is_decoder=True, + ) + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.normalize_before = config.normalize_before + + self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.encoder_attn = TFBartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + name="encoder_attn", + is_decoder=True, + ) + self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + + def call( + self, + hidden_states, + attention_mask: Optional[tf.Tensor] = None, + encoder_hidden_states: Optional[tf.Tensor] = None, + encoder_attention_mask: Optional[tf.Tensor] = None, + past_key_value: Optional[Tuple[tf.Tensor]] = None, + training=False, + ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: + """ + Args: + hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
+ past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + if self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + if encoder_hidden_states is not None: + residual = hidden_states + if self.normalize_before: + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, _, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + if not self.normalize_before: + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + if self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + if not self.normalize_before: + hidden_states = self.final_layer_norm(hidden_states) + + return ( + hidden_states, + self_attn_weights, + present_key_value, + ) + + +class TFBartPretrainedModel(TFPreTrainedModel): + config_class = BartConfig + base_model_prefix = "model" + + @property + def dummy_inputs(self): + pad_token = 1 + input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32) + decoder_input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32) + dummy_inputs = { + "decoder_input_ids": decoder_input_ids, + "attention_mask": tf.math.not_equal(input_ids, pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +class TFPretrainedBartModel(TFBartPretrainedModel): + def __init_subclass__(self): + warnings.warn( + "The class `TFPretrainedBartModel` has been deprecated, please use `TFBartPretrainedModel` instead.", + FutureWarning, + ) + + +BART_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the + generic methods the library implements for all its model (such as downloading or saving, resizing the input + embeddings, pruning heads etc.) + + This model is also a `tf.keras.Model `__ subclass. 
Use + it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage + and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all + the tensors in the first argument of the model call function: :obj:`model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in + the first positional argument : + + - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Args: + config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the + model weights. +""" + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the input_ids right, following the paper. + decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. + encoder_outputs (:obj:`tf.FloatTensor`, `optional`): + hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of + past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. 
+ use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.TFModelOutput` instead of a plain tuple. + training (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@keras_serializable +class TFBartEncoder(tf.keras.layers.Layer): + config_class = BartConfig + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`TFBartEncoderLayer`. + + Args: + config: BartConfig + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.layerdrop = config.encoder_layerdrop + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + + self.embed_tokens = embed_tokens + if config.static_position_embeddings: + self.embed_positions = TFBartSinusoidalPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + else: + self.embed_positions = TFBartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + config.extra_pos_embeddings, + name="embed_positions", + ) + self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] + self.layernorm_embedding = ( + tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + if config.normalize_embedding + else tf.keras.layers.Layer() + ) + self.layer_norm = ( + tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + if config.add_final_layer_norm + else None + ) + + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + """ + Args: + input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? 
<../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + elif inputs["inputs_embeds"] is not None: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs["inputs_embeds"] is None: + inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"]) + else: + inputs["inputs_embeds"] = inputs["inputs_embeds"] + + inputs["inputs_embeds"] = inputs["inputs_embeds"] * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + hidden_states = inputs["inputs_embeds"] + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout(hidden_states, training=inputs["training"]) + + # check attention mask and invert + if inputs["attention_mask"] is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + inputs["attention_mask"] = _expand_mask(inputs["attention_mask"]) + + encoder_states = () if inputs["output_hidden_states"] else None + all_attentions = () if inputs["output_attentions"] else None + + # encoder layers + for encoder_layer in self.layers: + + if inputs["output_hidden_states"]: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if inputs["training"] and (dropout_probability < self.layerdrop): # skip the layer + continue + + hidden_states, attn = encoder_layer(hidden_states, inputs["attention_mask"]) + + if inputs["output_attentions"]: + all_attentions += (attn,) + if self.layer_norm: + hidden_states = self.layer_norm(hidden_states) + if inputs["output_hidden_states"]: + encoder_states = encoder_states + (hidden_states,) + + if not inputs["return_dict"]: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +@keras_serializable +class TFBartDecoder(tf.keras.layers.Layer): + config_class = BartConfig + """ + Transformer decoder consisting of 
*config.decoder_layers* layers. Each layer is a :class:`TFBartDecoderLayer` + + Args: + config: BartConfig + embed_tokens: output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.padding_idx = config.pad_token_id + self.embed_tokens = embed_tokens + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.layerdrop = config.decoder_layerdrop + if config.static_position_embeddings: + self.embed_positions = TFBartSinusoidalPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + else: + self.embed_positions = TFBartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + config.extra_pos_embeddings, + name="embed_positions", + ) + self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] + self.layernorm_embedding = ( + tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + if config.normalize_embedding + else tf.keras.layers.Layer() + ) + self.layer_norm = ( + tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + if config.add_final_layer_norm + else None + ) + + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm + + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + r""" + Args: + input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. 
+ + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + elif inputs["inputs_embeds"] is not None: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = ( + inputs["past_key_values"][0][0].shape[2] if inputs["past_key_values"] is not None else 0 + ) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + if inputs["inputs_embeds"] is None: + inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"]) + + hidden_states = inputs["inputs_embeds"] * self.embed_scale + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + else: + combined_attention_mask = _expand_mask( + tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] + ) + + if inputs["attention_mask"] is None and inputs["input_ids"] is not None and input_shape[-1] > 1: + inputs["attention_mask"] = tf.cast( + tf.math.not_equal(inputs["input_ids"], self.config.pad_token_id), inputs["input_ids"].dtype + ) + inputs["attention_mask"] = tf.concat( + [ + tf.ones((input_shape[0], past_key_values_length), dtype=inputs["attention_mask"].dtype), + inputs["attention_mask"], + ], + axis=-1, + ) + else: + inputs["attention_mask"] = tf.ones( + (input_shape[0], input_shape[1] + past_key_values_length), dtype=tf.int32 + ) + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + 
inputs["attention_mask"], tgt_len=input_shape[-1] + ) + + if inputs["encoder_hidden_states"] is not None and inputs["encoder_attention_mask"] is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + inputs["encoder_attention_mask"] = _expand_mask(inputs["encoder_attention_mask"], tgt_len=input_shape[-1]) + + if self.do_blenderbot_90_layernorm: + hidden_states = self.layernorm_embedding(hidden_states) + positions + else: + hidden_states = self.layernorm_embedding(hidden_states + positions) + hidden_states = self.dropout(hidden_states, training=inputs["training"]) + + # decoder layers + all_hidden_states = () + all_self_attns = () + present_key_values = () + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if inputs["output_hidden_states"]: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + + if inputs["training"] and (dropout_probability < self.layerdrop): + continue + + past_key_value = inputs["past_key_values"][idx] if inputs["past_key_values"] is not None else None + + hidden_states, layer_self_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=inputs["encoder_hidden_states"], + encoder_attention_mask=inputs["encoder_attention_mask"], + past_key_value=past_key_value, + ) + + if inputs["use_cache"]: + present_key_values += (present_key_value,) + + if inputs["output_attentions"]: + all_self_attns += (layer_self_attn,) + + if self.layer_norm is not None: # same as if config.add_final_layer_norm + hidden_states = self.layer_norm(hidden_states) + + # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) + if inputs["output_hidden_states"]: + all_hidden_states += (hidden_states,) + else: + all_hidden_states = None + + all_self_attns = list(all_self_attns) if inputs["output_attentions"] else None + + present_key_values = (inputs["encoder_hidden_states"], present_key_values) if inputs["use_cache"] else None + + if not inputs["return_dict"]: + return hidden_states, present_key_values, all_hidden_states, all_self_attns + else: + return TFBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +@add_start_docstrings( + "The bare BART Model outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +@keras_serializable +class TFBartModel(TFBartPretrainedModel): + base_model_prefix = "model" + + def __init__(self, config: BartConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, config.pad_token_id, name="model.shared") + + with tf.compat.v1.variable_scope("model.shared") as shared_abs_scope_name: + pass + + # Wraps layer to avoid problems with weight restoring and ensuring we're in the correct TF scope. 
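+        # (The otherwise empty `variable_scope("model.shared")` above only captures an absolute scope name,
+        # so the wrapped embeddings keep their "model.shared" prefix when reused by both the encoder and the
+        # decoder below, which keeps checkpoint weight names stable.)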
+ embed_tokens = TFWrappedEmbeddings(self.shared, abs_scope_name=shared_abs_scope_name) + embed_tokens.vocab_size = self.shared.vocab_size + embed_tokens.hidden_size = self.shared.hidden_size + + self.encoder = TFBartEncoder(config, embed_tokens, name="encoder") + self.decoder = TFBartDecoder(config, embed_tokens, name="decoder") + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/bart-large", + output_type=TFSeq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs + ): + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["decoder_input_ids"] is None and inputs["decoder_inputs_embeds"] is None: + inputs["use_cache"] = False + + inputs["output_hidden_states"] = ( + inputs["output_hidden_states"] + if inputs["output_hidden_states"] is not None + else self.config.output_hidden_states + ) + + if inputs["decoder_input_ids"] is None and inputs["input_ids"] is not None: + inputs["decoder_input_ids"] = shift_tokens_right( + inputs["input_ids"], self.config.pad_token_id, self.config.eos_token_id + ) + + if inputs["encoder_outputs"] is None: + inputs["encoder_outputs"] = self.encoder( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True + elif inputs["return_dict"] and not isinstance(inputs["encoder_outputs"], TFBaseModelOutput): + inputs["encoder_outputs"] = TFBaseModelOutput( + last_hidden_state=inputs["encoder_outputs"][0], + hidden_states=inputs["encoder_outputs"][1] if len(inputs["encoder_outputs"]) > 1 else None, + attentions=inputs["encoder_outputs"][2] if len(inputs["encoder_outputs"]) > 2 else None, + ) + # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False + elif not inputs["return_dict"] and not isinstance(inputs["encoder_outputs"], tuple): + inputs["encoder_outputs"] = inputs["encoder_outputs"].to_tuple() + + decoder_outputs = self.decoder( + inputs["decoder_input_ids"], + attention_mask=inputs["decoder_attention_mask"], + encoder_hidden_states=inputs["encoder_outputs"][0], + encoder_attention_mask=inputs["attention_mask"], + past_key_values=inputs["past_key_values"], + inputs_embeds=inputs["decoder_inputs_embeds"], + 
use_cache=inputs["use_cache"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + if not inputs["return_dict"]: + return decoder_outputs + inputs["encoder_outputs"] + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + encoder_last_hidden_state=inputs["encoder_outputs"].last_hidden_state, + encoder_hidden_states=inputs["encoder_outputs"].hidden_states, + encoder_attentions=inputs["encoder_outputs"].attentions, + ) + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + + def get_output_embeddings(self): + return self.shared + + +@add_start_docstrings( + "The BART Model with a language modeling head. Can be used for summarization.", + BART_START_DOCSTRING, +) +class TFBartForConditionalGeneration(TFBartPretrainedModel): + _keys_to_ignore_on_load_unexpected = [ + r"model.encoder.embed_tokens.weight", + r"model.decoder.embed_tokens.weight", + ] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFBartModel(config, name="model") + self.use_cache = config.use_cache + # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency. + self.final_logits_bias = self.add_weight( + name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False + ) + + def get_decoder(self): + return self.model.decoder + + def resize_token_embeddings(self, new_num_tokens): + super().resize_token_embeddings(new_num_tokens=new_num_tokens) + + # BART is a special case where the bias has two dimensions + # and not named just `bias` + if new_num_tokens is not None: + num_tokens_to_copy = min(self.final_logits_bias.shape[0], new_num_tokens) + init_bias = tf.zeros((new_num_tokens,)) + init_bias[:num_tokens_to_copy] = self.final_logits_bias.value()[:num_tokens_to_copy] + self.final_logits_bias = self.add_weight( + shape=(1, new_num_tokens), + initializer="zeros", + trainable=False, + name="final_logits_bias", + ) + self.final_logits_bias.assign(init_bias) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs: Optional[TFBaseModelOutput] = None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + training=False, + **kwargs, + ): + """ + Returns: + + Examples:: + + # Mask filling only works for bart-large + from transformers import BartTokenizer, TFBartForConditionalGeneration + import tensorflow as tf + mname = 'facebook/bart-large' + tokenizer = BartTokenizer.from_pretrained(mname) + TXT = "My friends are but they eat too many carbs." 
+ model = TFBartForConditionalGeneration.from_pretrained(mname) + batch = tokenizer([TXT], return_tensors='tf') + logits = model(inputs=batch.input_ids).logits + probs = tf.nn.softmax(logits[0]) + # probs[5] is associated with the mask token + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + + if inputs["labels"] is not None: + inputs["use_cache"] = False + if inputs["decoder_input_ids"] is None: + inputs["decoder_input_ids"] = shift_tokens_right( + inputs["labels"], self.config.pad_token_id, self.config.eos_token_id + ) + + outputs = self.model( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + decoder_input_ids=inputs["decoder_input_ids"], + encoder_outputs=inputs["encoder_outputs"], + decoder_attention_mask=inputs["decoder_attention_mask"], + past_key_values=inputs["past_key_values"], + inputs_embeds=inputs["inputs_embeds"], + decoder_inputs_embeds=inputs["decoder_inputs_embeds"], + use_cache=inputs["use_cache"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + lm_logits = self.model.shared(outputs[0], mode="linear") + lm_logits = lm_logits + self.final_logits_bias + masked_lm_loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], lm_logits) + + if not inputs["return_dict"]: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return TFSeq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, # index 1 of d outputs + decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs + decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs + encoder_last_hidden_state=outputs.last_hidden_state, # index 0 of encoder outputs + encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out + encoder_attentions=outputs.encoder_attentions, # 2 of e out + ) + + def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs) -> Dict: + assert past is not None and len(past) in {1, 2}, f"past has to be an iterable of length 1,2 got {past}" + if len(past) == 1: + assert isinstance(past[0], tf.Tensor), f"`past[0]` has to be of type `tf.Tensor`, but is {type(past[0])}" + encoder_outputs = TFBaseModelOutput(last_hidden_state=past[0]) + past_key_values = None + else: + assert ( + len(past) == 2 + ), "`past` has to be of length 2 with the encoder_outputs at the first position and past_key_values at the second position." 
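Sketch of the tied LM head used in the `call` above: `self.model.shared(outputs[0], mode="linear")` projects the decoder hidden states back onto the vocabulary with the transposed shared embedding matrix, and the static `final_logits_bias` is added on top. The shapes below are made up for illustration, and NumPy stands in for TF.

import numpy as np

batch, seq_len, d_model, vocab_size = 2, 5, 8, 50
rng = np.random.default_rng(0)

shared_embedding = rng.normal(size=(vocab_size, d_model)).astype(np.float32)
final_logits_bias = np.zeros((1, vocab_size), dtype=np.float32)
decoder_hidden_states = rng.normal(size=(batch, seq_len, d_model)).astype(np.float32)

# mode="linear": re-use the input embedding matrix as the output projection (E^T),
# then add the non-trainable bias
lm_logits = decoder_hidden_states @ shared_embedding.T + final_logits_bias
print(lm_logits.shape)  # (2, 5, 50): one score per position and vocabulary entry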
+ encoder_outputs, past_key_values = past + if isinstance(encoder_outputs, tuple): + assert isinstance( + encoder_outputs[0], tf.Tensor + ), f"`encoder_outputs[0]` has to be of type `tf.Tensor`, but is {type(encoder_outputs[0])}" + encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_outputs[0]) + elif isinstance(encoder_outputs, tf.Tensor): + encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_outputs) + assert ( + past_key_values + ), f"decoder cached states must be truthy. got {past_key_values} from the 2nd element of past" + decoder_input_ids = decoder_input_ids[:, -1:] + + assert isinstance( + encoder_outputs, TFBaseModelOutput + ), f"encoder_outputs should be a TFBaseModelOutput, Instead got {type(encoder_outputs)}." + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _reorder_cache(past, beam_idx): + if len(past) == 1: + return past + + past_key_values = past[1] + + reordered_past = () + for layer_past_key_values in past_key_values: + reordered_past += ( + tuple(tf.gather(layer_past_key_value, beam_idx) for layer_past_key_value in layer_past_key_values), + ) + return (past[0], reordered_past) + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == 1 and self.config.force_bos_token_to_be_generated: + vocab_range = tf.constant(range(self.config.vocab_size)) + return tf.where(vocab_range != self.config.bos_token_id, LARGE_NEGATIVE, logits) + elif cur_len == max_length - 1: + vocab_range = tf.constant(range(self.config.vocab_size)) + return tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) + else: + return logits + + def get_output_embeddings(self): + return self.model.shared + + def get_encoder(self): + return self.model.encoder + + def compute_loss(self, labels, logits): + """CrossEntropyLoss that ignores pad tokens""" + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, + reduction=tf.keras.losses.Reduction.NONE, + ) + melted_labels = tf.reshape(labels, (-1,)) + active_loss = tf.not_equal(melted_labels, self.config.pad_token_id) + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) + labels = tf.boolean_mask(melted_labels, active_loss) + return loss_fn(labels, reduced_logits) diff --git a/src/transformers/models/old_bart/tokenization_bart.py b/src/transformers/models/old_bart/tokenization_bart.py new file mode 100644 index 00000000000000..6b46e30e9d527c --- /dev/null +++ b/src/transformers/models/old_bart/tokenization_bart.py @@ -0,0 +1,99 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
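Sketch of the logit forcing performed in `adjust_logits_during_generation`: every vocabulary entry except the forced token is pushed to `LARGE_NEGATIVE`, so greedy or beam search can only pick that token (BOS at the first step, EOS at the last). The values below are illustrative only.

import numpy as np

LARGE_NEGATIVE = -1e8

def force_token(logits: np.ndarray, token_id: int) -> np.ndarray:
    # mask every vocabulary index other than token_id with a large negative value
    vocab_range = np.arange(logits.shape[-1])
    return np.where(vocab_range != token_id, LARGE_NEGATIVE, logits)

logits = np.array([[0.3, 1.2, -0.7, 0.9]], dtype=np.float32)
print(force_token(logits, token_id=2))                    # only index 2 keeps its score
print(force_token(logits, token_id=2).argmax(axis=-1))    # -> [2]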
+ +from typing import List, Optional + +from transformers import add_start_docstrings + +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding +from ...utils import logging +from ..roberta.tokenization_roberta import RobertaTokenizer + + +logger = logging.get_logger(__name__) + + +# vocab and merges same as roberta +vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" +merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" +_all_bart_models = [ + "facebook/bart-base", + "facebook/bart-large", + "facebook/bart-large-mnli", + "facebook/bart-large-cnn", + "facebook/bart-large-xsum", + "yjernite/bart_eli5", + # This is not exhaustive: see https://huggingface.co/models?filter=bart +] + + +class BartTokenizer(RobertaTokenizer): + r""" + Construct a BART tokenizer. + + :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new + :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` + + Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the + initialization parameters and other methods. + """ + # merges and vocab same as Roberta + max_model_input_sizes = {m: 1024 for m in _all_bart_models} + pretrained_vocab_files_map = { + "vocab_file": {m: vocab_url for m in _all_bart_models}, + "merges_file": {m: merges_url for m in _all_bart_models}, + } + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: str = None, + truncation=True, + **kwargs, + ) -> BatchEncoding: + kwargs.pop("src_lang", None) + kwargs.pop("tgt_lang", None) + if max_length is None: + max_length = self.model_max_length + model_inputs: BatchEncoding = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + )["input_ids"] + model_inputs["labels"] = labels + return model_inputs diff --git a/src/transformers/models/old_bart/tokenization_bart_fast.py b/src/transformers/models/old_bart/tokenization_bart_fast.py new file mode 100644 index 00000000000000..30b77275f22169 --- /dev/null +++ b/src/transformers/models/old_bart/tokenization_bart_fast.py @@ -0,0 +1,92 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
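Illustrative usage of the `prepare_seq2seq_batch` helper defined above, assuming this patched version of the library, PyTorch installed for `return_tensors="pt"`, and network access to download the `facebook/bart-large` vocabulary.

from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
batch = tokenizer.prepare_seq2seq_batch(
    src_texts=["UN Chief Says There Is No Military Solution in Syria"],
    tgt_texts=["UN Chief Says There Is No Military Solution"],
    max_length=32,
    return_tensors="pt",
)
print(sorted(batch.keys()))  # ['attention_mask', 'input_ids', 'labels']
print(batch["input_ids"].shape, batch["labels"].shape)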
+ +from typing import List, Optional + +from transformers import add_start_docstrings + +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding +from ...utils import logging +from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast +from .tokenization_bart import BartTokenizer + + +logger = logging.get_logger(__name__) + + +# vocab and merges same as roberta +vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" +merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" +tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" +_all_bart_models = [ + "facebook/bart-base", + "facebook/bart-large", + "facebook/bart-large-mnli", + "facebook/bart-large-cnn", + "facebook/bart-large-xsum", + "yjernite/bart_eli5", + # This is not exhaustive: see https://huggingface.co/models?filter=bart +] + + +class BartTokenizerFast(RobertaTokenizerFast): + # merges and vocab same as Roberta + max_model_input_sizes = {m: 1024 for m in _all_bart_models} + pretrained_vocab_files_map = { + "vocab_file": {m: vocab_url for m in _all_bart_models}, + "merges_file": {m: merges_url for m in _all_bart_models}, + "tokenizer_file": {m: tokenizer_url for m in _all_bart_models}, + } + slow_tokenizer_class = BartTokenizer + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: Optional[str] = None, + truncation=True, + **kwargs, + ) -> BatchEncoding: + if max_length is None: + max_length = self.model_max_length + model_inputs: BatchEncoding = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + )["input_ids"] + model_inputs["labels"] = labels + return model_inputs diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index f38816e095e2c7..564aba237ac126 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch BART model. 
""" + import copy import tempfile @@ -32,44 +34,35 @@ import torch from transformers import ( - AutoModel, - AutoModelForSequenceClassification, - AutoTokenizer, BartConfig, BartForConditionalGeneration, BartForQuestionAnswering, BartForSequenceClassification, BartModel, BartTokenizer, - BartTokenizerFast, - BertConfig, - BlenderbotConfig, - MarianConfig, - MBartConfig, - PegasusConfig, - pipeline, ) from transformers.models.bart.modeling_bart import ( BartDecoder, BartEncoder, - BartSinusoidalPositionalEmbedding, - shift_tokens_right, ) -PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" - - def prepare_bart_inputs_dict( config, input_ids, + decoder_input_ids, attention_mask=None, + decoder_attention_mask=None, ): if attention_mask is None: attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) return { "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, } @@ -112,13 +105,15 @@ def __init__( self.eos_token_id = eos_token_id self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id - torch.manual_seed(0) def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( 3, ) - input_ids[:, -1] = 2 # Eos Token + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) config = BartConfig( vocab_size=self.vocab_size, @@ -136,36 +131,33 @@ def prepare_config_and_inputs(self): bos_token_id=self.bos_token_id, pad_token_id=self.pad_token_id, ) - inputs_dict = prepare_bart_inputs_dict(config, input_ids) + inputs_dict = prepare_bart_inputs_dict(config, input_ids, decoder_input_ids) return config, inputs_dict def prepare_config_and_inputs_for_common(self): config, inputs_dict = self.prepare_config_and_inputs() - inputs_dict["decoder_input_ids"] = inputs_dict["input_ids"] - inputs_dict["decoder_attention_mask"] = inputs_dict["attention_mask"] return config, inputs_dict def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): model = BartModel(config=config).get_decoder().to(torch_device).eval() input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] # first forward pass - outputs = model(input_ids, attention_mask=inputs_dict["attention_mask"], use_cache=True) + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) output, past_key_values = outputs.to_tuple() # create hypothetical multiple next token and extent to next_input_ids next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) # append to next input_ids and next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([inputs_dict["attention_mask"], next_mask], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = 
model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ - "last_hidden_state" - ] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"] # select random slice random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() @@ -230,54 +222,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - def test_initialization_more(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - model = BartModel(config) - model.to(torch_device) - model.eval() - # test init - self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item()) - - def _check_var(module): - """Check that we initialized various parameters from N(0, config.init_std).""" - self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2) - - _check_var(model.encoder.embed_tokens) - _check_var(model.encoder.layers[0].self_attn.k_proj) - _check_var(model.encoder.layers[0].fc1) - _check_var(model.encoder.embed_positions) - - def test_advanced_inputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs() - config.use_cache = False - inputs_dict["input_ids"][:, -2:] = config.pad_token_id - - model = BartModel(config).to(torch_device).eval() - decoder_features_with_created_mask = model(**inputs_dict)[0] - - decoder_input_ids = shift_tokens_right(inputs_dict["input_ids"], config.pad_token_id) - decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) - decoder_attention_mask[:, 0] = decoder_attention_mask[:, 1] - - decoder_features_with_passed_mask = model( - decoder_attention_mask=decoder_attention_mask, decoder_input_ids=decoder_input_ids, **inputs_dict - )[0] - assert_tensors_close(decoder_features_with_passed_mask, decoder_features_with_created_mask) - useless_mask = torch.zeros_like(decoder_attention_mask) - decoder_features = model(decoder_attention_mask=useless_mask, **inputs_dict)[0] - self.assertTrue(isinstance(decoder_features, torch.Tensor)) # no hidden states or attentions - self.assertEqual( - decoder_features.size(), (self.model_tester.batch_size, self.model_tester.seq_length, config.d_model) - ) - if decoder_attention_mask.min().item() == 0: # some tokens were masked - self.assertFalse((decoder_features_with_created_mask == decoder_features).all().item()) - - # Test different encoder attention masks - decoder_features_with_long_encoder_mask = model( - inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"].long() - )[0] - assert_tensors_close(decoder_features_with_long_encoder_mask, decoder_features_with_created_mask) - def test_save_load_strict(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() for model_class in self.all_model_classes: @@ -326,168 +270,9 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs)[0] - @require_sentencepiece - @require_tokenizers - def test_tiny_model(self): - model_name = "sshleifer/bart-tiny-random" - tiny = AutoModel.from_pretrained(model_name) # same vocab size - tok = AutoTokenizer.from_pretrained(model_name) # same tokenizer - inputs_dict = tok.batch_encode_plus(["Hello my friends"], return_tensors="pt") - - with torch.no_grad(): - tiny(**inputs_dict) - - -@require_torch -class BartHeadTests(unittest.TestCase): - vocab_size = 99 - - def _get_config_and_data(self): - input_ids = torch.tensor( - [ - [71, 82, 18, 33, 46, 91, 2], - [68, 34, 26, 58, 30, 82, 2], - [5, 97, 17, 39, 94, 40, 2], - [76, 83, 94, 25, 
70, 78, 2], - [87, 59, 41, 35, 48, 66, 2], - [55, 13, 16, 58, 5, 2, 1], # note padding - [64, 27, 31, 51, 12, 75, 2], - [52, 64, 86, 17, 83, 39, 2], - [48, 61, 9, 24, 71, 82, 2], - [26, 1, 60, 48, 22, 13, 2], - [21, 5, 62, 28, 14, 76, 2], - [45, 98, 37, 86, 59, 48, 2], - [70, 70, 50, 9, 28, 0, 2], - ], - dtype=torch.long, - device=torch_device, - ) - - batch_size = input_ids.shape[0] - config = BartConfig( - vocab_size=self.vocab_size, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ) - return config, input_ids, batch_size - - def test_sequence_classification_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - labels = _long_tensor([2] * batch_size).to(torch_device) - model = BartForSequenceClassification(config) - model.to(torch_device) - outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels) - expected_shape = torch.Size((batch_size, config.num_labels)) - self.assertEqual(outputs["logits"].shape, expected_shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_question_answering_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - sequence_labels = ids_tensor([batch_size], 2).to(torch_device) - model = BartForQuestionAnswering(config) - model.to(torch_device) - outputs = model( - input_ids=input_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - - self.assertEqual(outputs["start_logits"].shape, input_ids.shape) - self.assertEqual(outputs["end_logits"].shape, input_ids.shape) - self.assertIsInstance(outputs["loss"].item(), float) - - @timeout_decorator.timeout(1) - def test_lm_forward(self): - config, input_ids, batch_size = self._get_config_and_data() - lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size).to(torch_device) - lm_model = BartForConditionalGeneration(config) - lm_model.to(torch_device) - outputs = lm_model(input_ids=input_ids, labels=lm_labels) - expected_shape = (batch_size, input_ids.shape[1], config.vocab_size) - self.assertEqual(outputs["logits"].shape, expected_shape) - self.assertIsInstance(outputs["loss"].item(), float) - - def test_lm_uneven_forward(self): - config = BartConfig( - vocab_size=self.vocab_size, - d_model=14, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=8, - decoder_ffn_dim=8, - max_position_embeddings=48, - ) - lm_model = BartForConditionalGeneration(config).to(torch_device) - context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device) - summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device) - outputs = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) - expected_shape = (*summary.shape, config.vocab_size) - self.assertEqual(outputs["logits"].shape, expected_shape) - - def test_generate_beam_search(self): - input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device) - config = BartConfig( - vocab_size=self.vocab_size, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - ) - lm_model = BartForConditionalGeneration(config).to(torch_device) - 
lm_model.eval() - - max_length = 5 - generated_ids = lm_model.generate( - input_ids.clone(), - do_sample=True, - num_return_sequences=1, - num_beams=2, - no_repeat_ngram_size=3, - max_length=max_length, - ) - self.assertEqual(generated_ids.shape, (input_ids.shape[0], max_length)) - - def test_shift_tokens_right(self): - input_ids = torch.Tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]]).long() - shifted = shift_tokens_right(input_ids, 1) - n_pad_before = input_ids.eq(1).float().sum() - n_pad_after = shifted.eq(1).float().sum() - self.assertEqual(shifted.shape, input_ids.shape) - self.assertEqual(n_pad_after, n_pad_before - 1) - self.assertTrue(torch.eq(shifted[:, 0], 2).all()) - - @slow - def test_tokenization(self): - tokenizer = BartTokenizer.from_pretrained("facebook/bart-large") - examples = [" Hello world", " DomDramg"] # need leading spaces for equality - fairseq_results = [ - torch.Tensor([0, 20920, 232, 2]), - torch.Tensor([0, 11349, 495, 4040, 571, 2]), - ] - for ex, desired_result in zip(examples, fairseq_results): - bart_toks = tokenizer.encode(ex, return_tensors="pt").squeeze() - assert_tensors_close(desired_result.long(), bart_toks, prefix=ex) - def test_generate_fp16(self): - config, input_ids, batch_size = self._get_config_and_data() + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] attention_mask = input_ids.ne(1).to(torch_device) model = BartForConditionalGeneration(config).eval().to(torch_device) if torch_device == "cuda": @@ -495,27 +280,6 @@ def test_generate_fp16(self): model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - def test_dummy_inputs(self): - config, *_ = self._get_config_and_data() - model = BartForConditionalGeneration(config).eval().to(torch_device) - model(**model.dummy_inputs) - - def test_resize_tokens_embeddings_more(self): - config, input_ids, _ = self._get_config_and_data() - - def _get_embs(m): - return (m.get_input_embeddings().weight.data.clone(), m.get_output_embeddings().weight.data.clone()) - - model = BartForConditionalGeneration(config).eval().to(torch_device) - input, output = _get_embs(model) - self.assertTrue(torch.eq(input, output).all()) - new_vocab_size = 45 - model.resize_token_embeddings(new_vocab_size) - input_new, output_new = _get_embs(model) - self.assertEqual(input_new.shape, (new_vocab_size, config.d_model)) - self.assertEqual(output_new.shape, (new_vocab_size, config.d_model)) - self.assertTrue(torch.eq(input_new, output_new).all()) - def assert_tensors_close(a, b, atol=1e-12, prefix=""): """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" @@ -546,269 +310,79 @@ def _long_tensor(tok_lst): @require_torch @require_sentencepiece @require_tokenizers +@slow class BartModelIntegrationTests(unittest.TestCase): @cached_property def default_tokenizer(self): - return BartTokenizer.from_pretrained("facebook/bart-large") - - @cached_property - def default_tokenizer_fast(self): - return BartTokenizerFast.from_pretrained("facebook/bart-large") + return BartTokenizer.from_pretrained('facebook/bart-large') - @slow def test_inference_no_head(self): - model = BartModel.from_pretrained("facebook/bart-large").to(torch_device) + model = BartModel.from_pretrained('facebook/bart-large').to(torch_device) input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - inputs_dict = 
prepare_bart_inputs_dict(model.config, input_ids) + decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) + inputs_dict = prepare_bart_inputs_dict(model.config, input_ids, decoder_input_ids) with torch.no_grad(): output = model(**inputs_dict)[0] expected_shape = torch.Size((1, 11, 1024)) self.assertEqual(output.shape, expected_shape) + # change to expected output here expected_slice = torch.tensor( [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device ) self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - @slow - def test_base_mask_filling(self): - pbase = pipeline(task="fill-mask", model="facebook/bart-base") - src_text = [" I went to the ."] - results = [x["token_str"] for x in pbase(src_text)] - assert "Ġbathroom" in results - - @slow - def test_large_mask_filling(self): - plarge = pipeline(task="fill-mask", model="facebook/bart-large") - src_text = [" I went to the ."] - results = [x["token_str"] for x in plarge(src_text)] - expected_results = ["Ġbathroom", "Ġgym", "Ġwrong", "Ġmovies", "Ġhospital"] - self.assertListEqual(results, expected_results) - - @slow - def test_mnli_inference(self): - - example_b = [0, 31414, 232, 328, 740, 1140, 69, 46078, 1588, 2, 1] - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b]) - - model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli").to( - torch_device - ) # eval called in from_pre - inputs_dict = prepare_bart_inputs_dict(model.config, input_ids) - # Test that model hasn't changed - with torch.no_grad(): - outputs = model(**inputs_dict) - - batched_logits = outputs[0] - expected_shape = torch.Size((2, 3)) - self.assertEqual(batched_logits.shape, expected_shape) - expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]]).to(torch_device) - logits_arr = batched_logits[0].detach() - - # Test that padding does not change results - input_ids_no_pad = _long_tensor([example_b[:-1]]) + def test_inference_head(self): + model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(torch_device) - inputs_dict = prepare_bart_inputs_dict(model.config, input_ids=input_ids_no_pad) + # change to intended input + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + inputs_dict = prepare_bart_inputs_dict(model.config, input_ids, decoder_input_ids) with torch.no_grad(): - logits2 = model(**inputs_dict)[0].squeeze() - assert_tensors_close(batched_logits[1], logits2, atol=TOLERANCE) - assert_tensors_close(expected_slice, logits_arr, atol=TOLERANCE) - - @slow - def test_xsum_summarization_same_as_fairseq(self): - model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum").to(torch_device) - self.assertFalse(model.config.is_valid_mbart()) - tok = self.default_tokenizer - - EXPECTED_SUMMARY = "California's largest power company has begun shutting off electricity to thousands of customers in the state." 
- dct = tok.batch_encode_plus( - [PGE_ARTICLE], - max_length=1024, - padding="max_length", - truncation=True, - return_tensors="pt", - ).to(torch_device) - - hypotheses_batch = model.generate( - input_ids=dct["input_ids"], - attention_mask=dct["attention_mask"], - num_beams=2, - max_length=62, - min_length=11, - length_penalty=1.0, - no_repeat_ngram_size=3, - early_stopping=True, - decoder_start_token_id=model.config.eos_token_id, - ) - - decoded = tok.batch_decode( - hypotheses_batch, - skip_special_tokens=True, + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, model.config.vocab_size)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device ) - self.assertEqual(EXPECTED_SUMMARY, decoded[0]) - - def test_xsum_config_generation_params(self): - config = BartConfig.from_pretrained("facebook/bart-large-xsum") - expected_params = dict(num_beams=6, do_sample=False, early_stopping=True, length_penalty=1.0) - config_params = {k: getattr(config, k, "MISSING") for k, v in expected_params.items()} - self.assertDictEqual(expected_params, config_params) - - @slow - def test_cnn_summarization_same_as_fairseq(self): - hf = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(torch_device) - tok = BartTokenizer.from_pretrained("facebook/bart-large") + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - FRANCE_ARTICLE = ' Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." 
Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." 
Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' # @noq + def test_seq_to_seq_generation(self): + hf = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(torch_device) + tok = BartTokenizer.from_pretrained('facebook/bart-large') - SHORTER_ARTICLE = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. 
"Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + batch_input = [ + # string 1, + # string 2, + # string 3, + # string 4, + ] # The below article tests that we don't add any hypotheses outside of the top n_beams - IRAN_ARTICLE = " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . 
The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. 
The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." - - ARTICLE_SUBWAY = ' New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' 
- dct = tok.batch_encode_plus( - [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY], - max_length=1024, + batch_input, + max_length=512, padding="max_length", truncation_strategy="only_first", truncation=True, return_tensors="pt", ) - self.assertEqual(1024, dct["input_ids"].shape[1]) hypotheses_batch = hf.generate( input_ids=dct["input_ids"].to(torch_device), attention_mask=dct["attention_mask"].to(torch_device), num_beams=2, ) - assert hypotheses_batch[:, 1].eq(0).all().item() EXPECTED = [ - "A French prosecutor says he is not aware of any video footage from on board the plane. Two German " - "magazines claim to have found a cell phone video showing the crash. The publications say they watched " - "the video, which was found by a source close to the investigation. All 150 on board Germanwings Flight " - "9525 were killed.", - "Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court " - "jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the " - "Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a " - "move toward greater justice.", - "U.S. and its negotiating partners reached a strong framework agreement with Iran. Peter Bergen: The " - "debate that has already begun will likely result in more heat than light. He says critics have made " - "dubious assumptions and doubtful assertions. Bergen says the goal was to block Iran from building a " - "nuclear weapon.", - "Liana Barrientos, 39, has been married 10 times, sometimes within two weeks of each other. Prosecutors " - "say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the " - "Bronx on Friday. 
If convicted, she faces up to four years in prison.", + # here expected 1, + # here expected 2, + # here expected 3, + # here expected 4, ] - generated_summaries = tok.batch_decode( + generated = tok.batch_decode( hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True ) - assert generated_summaries == EXPECTED - - -@require_torch -class TestBartSinusoidalPositionalEmbeddings(unittest.TestCase): - desired_weights = [ - [0, 0, 0, 0, 0], - [0.84147096, 0.82177866, 0.80180490, 0.78165019, 0.76140374], - [0.90929741, 0.93651021, 0.95829457, 0.97505713, 0.98720258], - ] - - def test_positional_emb_cache_logic(self): - emb1 = BartSinusoidalPositionalEmbedding(num_positions=32, embedding_dim=6, padding_idx=1).to(torch_device) - no_cache = emb1((4, 10), past_key_values_length=0) - yes_cache = emb1((4, 10), past_key_values_length=2) - - self.assertTrue(no_cache.shape == yes_cache.shape == (10, 6)) - self.assertListEqual(no_cache[2:].tolist(), yes_cache[:-2].tolist()) - - def test_odd_embed_dim(self): - # odd embedding_dim is allowed - BartSinusoidalPositionalEmbedding(num_positions=4, embedding_dim=5, padding_idx=0).to(torch_device) - - # odd num_positions is allowed - BartSinusoidalPositionalEmbedding(num_positions=5, embedding_dim=4, padding_idx=0).to(torch_device) - - def test_positional_emb_weights_against_marian(self): - pad = 1 - emb1 = BartSinusoidalPositionalEmbedding(num_positions=512, embedding_dim=512, padding_idx=pad).to( - torch_device - ) - weights = emb1.weight.data[:3, :5].tolist() - for i, (expected_weight, actual_weight) in enumerate(zip(self.desired_weights, weights)): - for j in range(5): - self.assertAlmostEqual(expected_weight[j], actual_weight[j], places=3) - - def test_child_config_equivalence(self): - """Test that configs associated with children of BartForConditionalGeneration are identical.""" - child_classes = [BlenderbotConfig, MBartConfig, MarianConfig, PegasusConfig] - parent_keys = BartConfig().to_dict().keys() - for c in child_classes: - assert c().to_dict().keys() == parent_keys # traceback is very nice on it's own - # check that test is not stupid - assert BertConfig().to_dict().keys() != parent_keys - - -@require_torch -@slow -class FastIntegrationTests(unittest.TestCase): - """These tests are useful for debugging since they operate on a model with 1 encoder layer and 1 decoder layer.""" - - @cached_property - def tok(self): - return BartTokenizer.from_pretrained("facebook/bart-large") - - @cached_property - def xsum_1_1_model(self): - return BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-xsum-1-1") - - def test_xsum_1_1_generation(self): - hf = self.xsum_1_1_model - tok = self.tok - ARTICLE = 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. 
Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.' - EXPECTED = " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." - - dct = tok(ARTICLE, return_tensors="pt") - generated_ids = hf.generate(**dct, num_beams=4) - result = tok.batch_decode(generated_ids, skip_special_tokens=True)[0] - assert EXPECTED == result - - def test_xsum_1_1_batch_generation(self): - # test batch - - batch = self.tok( - [ - 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. 
The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', - 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." 
He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. 
Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. 
CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', - ], - return_tensors="pt", - padding="longest", - truncation=True, - ) - generated_ids = self.xsum_1_1_model.generate(**batch, num_beams=4) - result = self.tok.batch_decode(generated_ids, skip_special_tokens=True) - assert ( - result[0] - == " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." - ) - assert ( - result[1] - == " An investigation into the crash that killed at least 10 people in the French capital has been released by the French police investigating the crash." - ) - - def test_encoder_equiv(self): - # test batch - - batch = self.tok( - [ - 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. 
"We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', - 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? 
German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. 
But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', - ], - return_tensors="pt", - padding="longest", - truncation=True, - ) - features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state - expected = [[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]] - assert_tensors_close(features[0, :3, :3], torch.tensor(expected), atol=1e-3) + assert generated == EXPECTED From c963c8ad2a0cc42925a2904cb816f33f2a9d28d5 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 29 Dec 2020 16:00:20 +0000 Subject: [PATCH 02/51] remove old template --- cookiecutter-template-BART/bart.rst | 80 ------- cookiecutter-template-BART/to_replace_bart.py | 212 ------------------ .../tokenization_bart.py | 51 ----- .../tokenization_fast_bart.py | 53 ----- 4 files changed, 396 deletions(-) delete mode 100644 cookiecutter-template-BART/bart.rst delete mode 100644 cookiecutter-template-BART/to_replace_bart.py delete mode 100644 cookiecutter-template-BART/tokenization_bart.py delete mode 100644 cookiecutter-template-BART/tokenization_fast_bart.py diff --git a/cookiecutter-template-BART/bart.rst b/cookiecutter-template-BART/bart.rst deleted file mode 100644 index d8a4841a7db474..00000000000000 --- a/cookiecutter-template-BART/bart.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. - Copyright 2020 The HuggingFace Team. All rights reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the - specific language governing permissions and limitations under the License. - -BART ------------------------------------------------------------------------------------------------------------------------ - -Overview -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The BART model was proposed in ` -<>`__ by . - -The abstract from the paper is the following: - -** - -Tips: - - - -BartConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartConfig - :members: - - -BartTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.BartTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -BartTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartTokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -BartModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartModel - :members: forward - - -BartForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForConditionalGeneration - :members: forward - - -BartForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForSequenceClassification - :members: forward - - -BartForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BartForQuestionAnswering - :members: forward - - diff --git a/cookiecutter-template-BART/to_replace_bart.py b/cookiecutter-template-BART/to_replace_bart.py deleted file mode 100644 index 437d98f6566875..00000000000000 --- a/cookiecutter-template-BART/to_replace_bart.py +++ /dev/null @@ -1,212 +0,0 @@ -## Copyright 2020 The HuggingFace Team. All rights reserved. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -## See the License for the specific language governing permissions and -## limitations under the License. - -## This file is made so that specific statements may be copied inside existing files. This is useful to copy -## import statements in __init__.py, or to complete model lists in the AUTO files. -## -## It is to be used as such: -## Put '# To replace in: "FILE_PATH"' in order to indicate the contents will be copied in the file at path FILE_PATH -## Put '# Below: "STATEMENT"' in order to copy the contents below **the first occurence** of that line in the file at FILE_PATH -## Put '# Replace with:' followed by the lines containing the content to define the content -## End a statement with '# End.'. If starting a new statement without redefining the FILE_PATH, it will continue pasting -## content in that file. -## -## Put '## COMMENT' to comment on the file. - - -# To replace in: "src/transformers/__init__.py" -# Below: "if is_torch_available():" if generating PyTorch -# Replace with: - - from .models.bart import ( - BART_PRETRAINED_MODEL_ARCHIVE_LIST, - BartForConditionalGeneration, - BartForQuestionAnswering, - BartForSequenceClassification, - BartModel, - ) -# End. - -# Below: "if is_tf_available():" if generating TensorFlow -# Replace with: - - from .models.bart import ( - TFBartForConditionalGeneration, - TFBartModel, - TFBartPreTrainedModel, - ) -# End. 
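The '# Below:' / '# Replace with:' / '# End.' markers explained in the header of to_replace_bart.py amount to anchored insertion: locate the first occurrence of the anchor line in the target file and splice the replacement lines in directly after it. A minimal sketch of that splice step, with hypothetical names and no claim to match the actual add-new-model command implementation:

    def apply_replacement(target_lines, anchor, replacement_lines):
        # Insert replacement_lines right after the first line matching the anchor.
        for i, line in enumerate(target_lines):
            if line.strip() == anchor.strip():
                return target_lines[: i + 1] + replacement_lines + target_lines[i + 1 :]
        # Anchor not found: leave the target file unchanged.
        return target_lines

Applied with anchor 'if is_torch_available():' and the PyTorch import block above as replacement_lines, this reproduces the intended edit to src/transformers/__init__.py.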
- -# Below: "if is_tokenizers_available():" -# Replace with: - from .models.bart import BartTokenizerFast -# End. - -# Below: "from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig" -# Replace with: -from .models.bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig, BartTokenizer -# End. - - - -# To replace in: "src/transformers/models/auto/configuration_auto.py" -# Below: "# Add configs here" -# Replace with: - ("bart", BartConfig), -# End. - -# Below: "# Add archive maps here" -# Replace with: - BART_PRETRAINED_CONFIG_ARCHIVE_MAP, -# End. - -# Below: "from ..albert.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig", -# Replace with: -from ..bart.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig -# End. - -# Below: "# Add full (and cased) model names here" -# Replace with: - ("bart", "Bart"), -# End. - - - -# To replace in: "src/transformers/models/auto/modeling_auto.py" if generating PyTorch -# Below: "from .configuration_auto import (" -# Replace with: - BartConfig, -# End. - -# Below: "# Add modeling imports here" -# Replace with: -from ..bart.modeling_bart import ( - BartForConditionalGeneration, - BartForQuestionAnswering, - BartForSequenceClassification, - BartModel, -) -# End. - -# Below: "# Base model mapping" -# Replace with: - (BartConfig, BartModel), -# End. - -# Below: "# Model with LM heads mapping" -# Replace with: - - (BartConfig, BartForConditionalGeneration), -# End. - -# Below: "# Model for Causal LM mapping" -# Replace with: -# End. - -# Below: "# Model for Masked LM mapping" -# Replace with: -# End. - -# Below: "# Model for Sequence Classification mapping" -# Replace with: - (BartConfig, BartForSequenceClassification), -# End. - -# Below: "# Model for Question Answering mapping" -# Replace with: - (BartConfig, BartForQuestionAnswering), -# End. - -# Below: "# Model for Token Classification mapping" -# Replace with: -# End. - -# Below: "# Model for Multiple Choice mapping" -# Replace with: -# End. - -# Below: "# Model for Seq2Seq Causal LM mapping" -# Replace with: - - (BartConfig, BartForConditionalGeneration), -# End. - -# To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow -# Below: "from .configuration_auto import (" -# Replace with: - BartConfig, -# End. - -# Below: "# Add modeling imports here" -# Replace with: -from ..bart.modeling_tf_bart import ( - TFBartForConditionalGeneration, - TFBartModel, -) -# End. - -# Below: "# Base model mapping" -# Replace with: - (BartConfig, TFBartModel), -# End. - -# Below: "# Model with LM heads mapping" -# Replace with: - - (BartConfig, TFBartForConditionalGeneration), -# End. - -# Below: "# Model for Causal LM mapping" -# Replace with: -# End. - -# Below: "# Model for Masked LM mapping" -# Replace with: -# End. - -# Below: "# Model for Sequence Classification mapping" -# Replace with: -# End. - -# Below: "# Model for Question Answering mapping" -# Replace with: -# End. - -# Below: "# Model for Token Classification mapping" -# Replace with: -# End. - -# Below: "# Model for Multiple Choice mapping" -# Replace with: -# End. - -# Below: "# Model for Seq2Seq Causal LM mapping" -# Replace with: - - (BartConfig, TFBartForConditionalGeneration), -# End. - -# To replace in: "utils/check_repo.py" if generating PyTorch - -# Below: "models to ignore for model xxx mapping" -# Replace with: -"BartEncoder", - "BartDecoder", -# End. 
- -# Below: "models to ignore for not tested" -# Replace with: -"BartEncoder", # Building part of bigger (tested) model. - "BartDecoder", # Building part of bigger (tested) model. -# End. diff --git a/cookiecutter-template-BART/tokenization_bart.py b/cookiecutter-template-BART/tokenization_bart.py deleted file mode 100644 index a032d8b8b7ad56..00000000000000 --- a/cookiecutter-template-BART/tokenization_bart.py +++ /dev/null @@ -1,51 +0,0 @@ -# coding=utf-8 -# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for BART.""" -from ...utils import logging -from ..bart.tokenization_bart import BartTokenizer - - -logger = logging.get_logger(__name__) - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", - }, - "merges_file": { - "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", - }, - "tokenizer_file": { - "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json", - }, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/bart-large": 1024, -} - - -class BartTokenizer(BartTokenizer): - """ - Construct a BART tokenizer. - - :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end - tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning - parameters. - """ - - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/cookiecutter-template-BART/tokenization_fast_bart.py b/cookiecutter-template-BART/tokenization_fast_bart.py deleted file mode 100644 index c25e7e91353259..00000000000000 --- a/cookiecutter-template-BART/tokenization_fast_bart.py +++ /dev/null @@ -1,53 +0,0 @@ -# coding=utf-8 -# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tokenization classes for BART.""" -from ...utils import logging -from ..bart.tokenization_bart_fast import BartTokenizerFast -from .tokenization_bart import BartTokenizer - - -logger = logging.get_logger(__name__) - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", - }, - "merges_file": { - "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", - }, - "tokenizer_file": { - "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json", - }, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/bart-large": 1024, -} - - -class BartTokenizerFast(BartTokenizerFast): - r""" - Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = BartTokenizer From 9b10bf196bf996e5d24d565295d9812096839871 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 29 Dec 2020 21:36:00 +0000 Subject: [PATCH 03/51] finish bart --- src/transformers/commands/add_new_model.py | 30 +- src/transformers/models/bart/__init__.py | 9 +- .../models/bart/configuration_bart.py | 23 +- src/transformers/models/bart/modeling_bart.py | 68 ++- tests/test_modeling_bart.py | 415 ++++++++++++++++-- 5 files changed, 455 insertions(+), 90 deletions(-) diff --git a/src/transformers/commands/add_new_model.py b/src/transformers/commands/add_new_model.py index d1b29e0af3ca8d..733b389106dbc6 100644 --- a/src/transformers/commands/add_new_model.py +++ b/src/transformers/commands/add_new_model.py @@ -154,20 +154,20 @@ def remove_copy_lines(path): os.remove(f"{directory}/modeling_tf_{lowercase_model_name}.py") os.remove(f"{directory}/test_modeling_tf_{lowercase_model_name}.py") -# shutil.move( -# f"{directory}/{lowercase_model_name}.rst", -# f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst", -# ) -# -# shutil.move( -# f"{directory}/tokenization_{lowercase_model_name}.py", -# f"{model_dir}/tokenization_{lowercase_model_name}.py", -# ) -# -# shutil.move( -# f"{directory}/tokenization_fast_{lowercase_model_name}.py", -# f"{model_dir}/tokenization_{lowercase_model_name}_fast.py", -# ) + # shutil.move( + # f"{directory}/{lowercase_model_name}.rst", + # f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst", + # ) + # + # shutil.move( + # f"{directory}/tokenization_{lowercase_model_name}.py", + # f"{model_dir}/tokenization_{lowercase_model_name}.py", + # ) + # + # shutil.move( + # f"{directory}/tokenization_fast_{lowercase_model_name}.py", + # f"{model_dir}/tokenization_{lowercase_model_name}_fast.py", + # ) from os import fdopen, remove from shutil import copymode, move @@ -225,5 +225,5 @@ def replace_in_files(path_to_datafile): remove(path_to_datafile) -# replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py") + # replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py") os.rmdir(directory) diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index 
28b59ce9688966..e0d7dfebdd212a 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -15,10 +15,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...file_utils import is_torch_available, is_tokenizers_available +from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig from .tokenization_bart import BartTokenizer + if is_tokenizers_available(): from .tokenization_bart_fast import BartTokenizerFast @@ -29,7 +30,9 @@ BartForQuestionAnswering, BartForSequenceClassification, BartModel, - BartPreTrainedModel, + BartPretrainedModel, + PretrainedBartModel, ) - +if is_tf_available(): + from .modeling_tf_bart import TFBartForConditionalGeneration, TFBartModel, TFBartPretrainedModel diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 59235db6eaf123..0c5c7a395932dd 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -28,14 +28,13 @@ class BartConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. - It is used to instantiate an BART model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of - the BART `facebook/bart-large `__ architecture. + This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to + instantiate an BART model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large + `__ architecture. - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used - to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` - for more information. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: @@ -81,6 +80,9 @@ class BartConfig(PretrainedConfig): https://arxiv.org/abs/1909.11556>`__ for more details. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). + num_labels: (:obj:`int`, `optional`, defaults to 3): + The number of labels to use in :class:`~transformers.BartForSequenceClassification`. 
+ Example:: >>> from transformers import BartModel, BartConfig @@ -120,18 +122,21 @@ def __init__( classifier_dropout=0.0, scale_embedding=False, gradient_checkpointing=False, + force_bos_token_to_be_generated=False, pad_token_id=1, bos_token_id=0, eos_token_id=2, + num_labels=3, **kwargs ): super().__init__( + num_labels=num_labels, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, decoder_start_token_id=decoder_start_token_id, - **kwargs + **kwargs, ) self.vocab_size = vocab_size @@ -155,8 +160,8 @@ def __init__( self.num_hidden_layers = encoder_layers self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + self.force_bos_token_to_be_generated = force_bos_token_to_be_generated # only relevant for CNN - @property def num_attention_heads(self) -> int: return self.encoder_attention_heads diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 975812a3eecca8..0c57dae846071a 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -15,7 +15,9 @@ """ PyTorch BART model. """ +import math import random +import warnings from typing import Optional, Tuple import torch @@ -86,9 +88,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) -def _expand_mask( - mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None -): +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ @@ -115,13 +115,16 @@ def BartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_a class BartLearnedPositionalEmbedding(nn.Embedding): """ - This module learns positional embeddings up to a fixed maximum size. + This module learns positional embeddings up to a fixed maximum size. """ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" num_embeddings - super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. 
Other models dont have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim, padding_idx=padding_idx) def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): """`input_ids_shape` is expected to be [bsz x seqlen].""" @@ -129,7 +132,7 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): positions = torch.arange( past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device ) - return super().forward(positions) + return super().forward(positions + self.offset) class BartAttention(nn.Module): @@ -451,7 +454,7 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -class BartPreTrainedModel(PreTrainedModel): +class BartPretrainedModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" @@ -477,6 +480,14 @@ def dummy_inputs(self): return dummy_inputs +class PretrainedBartModel(BartPretrainedModel): + def __init_subclass__(self): + warnings.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.", + FutureWarning, + ) + + BART_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, @@ -488,10 +499,9 @@ def dummy_inputs(self): Parameters: config (:class:`~transformers.BartConfig`): - Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model - weights. + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ BART_GENERATION_EXAMPLE = r""" @@ -575,7 +585,7 @@ def dummy_inputs(self): """ -class BartEncoder(BartPreTrainedModel): +class BartEncoder(BartPretrainedModel): """ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a :class:`BartEncoderLayer`. @@ -723,7 +733,7 @@ def custom_forward(*inputs): ) -class BartDecoder(BartPreTrainedModel): +class BartDecoder(BartPretrainedModel): """ Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`BartDecoderLayer` @@ -949,7 +959,7 @@ def custom_forward(*inputs): "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING, ) -class BartModel(BartPreTrainedModel): +class BartModel(BartPretrainedModel): def __init__(self, config: BartConfig): super().__init__(config) @@ -997,6 +1007,14 @@ def forward( output_hidden_states=None, return_dict=None, ): + + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1053,7 +1071,7 @@ def forward( @add_start_docstrings( "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING ) -class BartForConditionalGeneration(BartPreTrainedModel): +class BartForConditionalGeneration(BartPretrainedModel): base_model_prefix = "model" _keys_to_ignore_on_load_missing = [ r"final_logits_bias", @@ -1143,7 +1161,9 @@ def forward( if labels is not None: if decoder_input_ids is None: - decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) outputs = self.model( input_ids, @@ -1198,6 +1218,18 @@ def prepare_inputs_for_generation( "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == 1 and self.config.force_bos_token_to_be_generated: + self._force_token_id_to_be_generated(logits, self.config.bos_token_id) + elif cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_id_to_be_generated(logits, self.config.eos_token_id) + return logits + + @staticmethod + def _force_token_id_to_be_generated(scores, token_id) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" + scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") + @staticmethod def _reorder_cache(past, beam_idx): reordered_past = () @@ -1213,7 +1245,7 @@ def _reorder_cache(past, beam_idx): """, BART_START_DOCSTRING, ) -class BartForSequenceClassification(BartPreTrainedModel): +class BartForSequenceClassification(BartPretrainedModel): def __init__(self, config: BartConfig, **kwargs): super().__init__(config, **kwargs) self.model = BartModel(config) @@ -1315,7 +1347,7 @@ def forward( """, BART_START_DOCSTRING, ) -class BartForQuestionAnswering(BartPreTrainedModel): +class BartForQuestionAnswering(BartPretrainedModel): def __init__(self, config): super().__init__(config) diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 564aba237ac126..9d242df95108fc 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -34,23 +34,23 @@ import torch from transformers import ( + AutoModelForSequenceClassification, BartConfig, BartForConditionalGeneration, BartForQuestionAnswering, BartForSequenceClassification, BartModel, BartTokenizer, + BartTokenizerFast, + pipeline, ) - from 
transformers.models.bart.modeling_bart import ( - BartDecoder, - BartEncoder, - ) + from transformers.models.bart.modeling_bart import BartDecoder, BartEncoder, shift_tokens_right def prepare_bart_inputs_dict( config, input_ids, - decoder_input_ids, + decoder_input_ids=None, attention_mask=None, decoder_attention_mask=None, ): @@ -157,7 +157,9 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] # select random slice random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() @@ -202,6 +204,185 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) +@require_torch +class BartHeadTests(unittest.TestCase): + vocab_size = 99 + + def _get_config_and_data(self): + input_ids = torch.tensor( + [ + [71, 82, 18, 33, 46, 91, 2], + [68, 34, 26, 58, 30, 82, 2], + [5, 97, 17, 39, 94, 40, 2], + [76, 83, 94, 25, 70, 78, 2], + [87, 59, 41, 35, 48, 66, 2], + [55, 13, 16, 58, 5, 2, 1], # note padding + [64, 27, 31, 51, 12, 75, 2], + [52, 64, 86, 17, 83, 39, 2], + [48, 61, 9, 24, 71, 82, 2], + [26, 1, 60, 48, 22, 13, 2], + [21, 5, 62, 28, 14, 76, 2], + [45, 98, 37, 86, 59, 48, 2], + [70, 70, 50, 9, 28, 0, 2], + ], + dtype=torch.long, + device=torch_device, + ) + + batch_size = input_ids.shape[0] + config = BartConfig( + vocab_size=self.vocab_size, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ) + return config, input_ids, batch_size + + def test_sequence_classification_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + labels = _long_tensor([2] * batch_size).to(torch_device) + model = BartForSequenceClassification(config) + model.to(torch_device) + outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels) + expected_shape = torch.Size((batch_size, config.num_labels)) + self.assertEqual(outputs["logits"].shape, expected_shape) + self.assertIsInstance(outputs["loss"].item(), float) + + def test_question_answering_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + sequence_labels = ids_tensor([batch_size], 2).to(torch_device) + model = BartForQuestionAnswering(config) + model.to(torch_device) + outputs = model( + input_ids=input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + + self.assertEqual(outputs["start_logits"].shape, input_ids.shape) + self.assertEqual(outputs["end_logits"].shape, input_ids.shape) + self.assertIsInstance(outputs["loss"].item(), float) + + @timeout_decorator.timeout(1) + def test_lm_forward(self): + config, input_ids, batch_size = self._get_config_and_data() + lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size).to(torch_device) + lm_model = BartForConditionalGeneration(config) + lm_model.to(torch_device) + outputs = lm_model(input_ids=input_ids, labels=lm_labels) + expected_shape = 
(batch_size, input_ids.shape[1], config.vocab_size) + self.assertEqual(outputs["logits"].shape, expected_shape) + self.assertIsInstance(outputs["loss"].item(), float) + + def test_lm_uneven_forward(self): + config = BartConfig( + vocab_size=self.vocab_size, + d_model=14, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=8, + decoder_ffn_dim=8, + max_position_embeddings=48, + ) + lm_model = BartForConditionalGeneration(config).to(torch_device) + context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device) + summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device) + outputs = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) + expected_shape = (*summary.shape, config.vocab_size) + self.assertEqual(outputs["logits"].shape, expected_shape) + + def test_generate_beam_search(self): + input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device) + config = BartConfig( + vocab_size=self.vocab_size, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ) + lm_model = BartForConditionalGeneration(config).to(torch_device) + lm_model.eval() + + max_length = 5 + generated_ids = lm_model.generate( + input_ids.clone(), + do_sample=True, + num_return_sequences=1, + num_beams=2, + no_repeat_ngram_size=3, + max_length=max_length, + ) + self.assertEqual(generated_ids.shape, (input_ids.shape[0], max_length)) + + def test_shift_tokens_right(self): + input_ids = torch.Tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]]).long() + shifted = shift_tokens_right(input_ids, 1, 2) + n_pad_before = input_ids.eq(1).float().sum() + n_pad_after = shifted.eq(1).float().sum() + self.assertEqual(shifted.shape, input_ids.shape) + self.assertEqual(n_pad_after, n_pad_before - 1) + self.assertTrue(torch.eq(shifted[:, 0], 2).all()) + + @slow + def test_tokenization(self): + tokenizer = BartTokenizer.from_pretrained("facebook/bart-large") + examples = [" Hello world", " DomDramg"] # need leading spaces for equality + fairseq_results = [ + torch.Tensor([0, 20920, 232, 2]), + torch.Tensor([0, 11349, 495, 4040, 571, 2]), + ] + for ex, desired_result in zip(examples, fairseq_results): + bart_toks = tokenizer.encode(ex, return_tensors="pt").squeeze() + assert_tensors_close(desired_result.long(), bart_toks, prefix=ex) + + def test_generate_fp16(self): + config, input_ids, batch_size = self._get_config_and_data() + attention_mask = input_ids.ne(1).to(torch_device) + model = BartForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + def test_dummy_inputs(self): + config, *_ = self._get_config_and_data() + model = BartForConditionalGeneration(config).eval().to(torch_device) + model(**model.dummy_inputs) + + def test_resize_tokens_embeddings_more(self): + config, input_ids, _ = self._get_config_and_data() + + def _get_embs(m): + return (m.get_input_embeddings().weight.data.clone(), m.get_output_embeddings().weight.data.clone()) + + model = BartForConditionalGeneration(config).eval().to(torch_device) + input, output = _get_embs(model) + self.assertTrue(torch.eq(input, 
output).all()) + new_vocab_size = 45 + model.resize_token_embeddings(new_vocab_size) + input_new, output_new = _get_embs(model) + self.assertEqual(input_new.shape, (new_vocab_size, config.d_model)) + self.assertEqual(output_new.shape, (new_vocab_size, config.d_model)) + self.assertTrue(torch.eq(input_new, output_new).all()) + + @require_torch class BartModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = ( @@ -304,85 +485,229 @@ def _long_tensor(tok_lst): return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) -TOLERANCE = 1e-4 +@require_torch +@slow +class FastIntegrationTests(unittest.TestCase): + """These tests are useful for debugging since they operate on a model with 1 encoder layer and 1 decoder layer.""" + + @cached_property + def tok(self): + return BartTokenizer.from_pretrained("facebook/bart-large") + + @cached_property + def xsum_1_1_model(self): + return BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-xsum-1-1") + + def test_xsum_1_1_generation(self): + hf = self.xsum_1_1_model + tok = self.tok + ARTICLE = 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. 
The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.' + EXPECTED = " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." + + dct = tok(ARTICLE, return_tensors="pt") + generated_ids = hf.generate(**dct, num_beams=4) + result = tok.batch_decode(generated_ids, skip_special_tokens=True)[0] + assert EXPECTED == result + + def test_xsum_1_1_batch_generation(self): + # test batch + + batch = self.tok( + [ + 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. 
"Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', + 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." 
Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." 
Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', + ], + return_tensors="pt", + padding="longest", + truncation=True, + ) + generated_ids = self.xsum_1_1_model.generate(**batch, num_beams=4) + result = self.tok.batch_decode(generated_ids, skip_special_tokens=True) + assert ( + result[0] + == " The International Criminal Court (ICC) has announced that it has been announced by the International Criminal court." + ) + assert ( + result[1] + == " An investigation into the crash that killed at least 10 people in the French capital has been released by the French police investigating the crash." + ) + + def test_encoder_equiv(self): + # test batch + + batch = self.tok( + [ + 'The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. 
Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes.', + 'The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. 
The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. 
French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. 
CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.', + ], + return_tensors="pt", + padding="longest", + truncation=True, + ) + features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state + expected = [[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]] + assert_tensors_close(features[0, :3, :3], torch.tensor(expected), atol=1e-3) @require_torch @require_sentencepiece @require_tokenizers -@slow class BartModelIntegrationTests(unittest.TestCase): @cached_property def default_tokenizer(self): - return BartTokenizer.from_pretrained('facebook/bart-large') + return BartTokenizer.from_pretrained("facebook/bart-large") + @cached_property + def default_tokenizer_fast(self): + return BartTokenizerFast.from_pretrained("facebook/bart-large") + + @slow def test_inference_no_head(self): - model = BartModel.from_pretrained('facebook/bart-large').to(torch_device) + model = BartModel.from_pretrained("facebook/bart-large").to(torch_device) input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) - inputs_dict = prepare_bart_inputs_dict(model.config, input_ids, decoder_input_ids) + attention_mask = input_ids.ne(model.config.pad_token_id) with torch.no_grad(): - output = model(**inputs_dict)[0] + output = model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state expected_shape = torch.Size((1, 11, 1024)) self.assertEqual(output.shape, expected_shape) - # change to expected output here expected_slice = torch.tensor( [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)) + + @slow + def test_base_mask_filling(self): + pbase = pipeline(task="fill-mask", model="facebook/bart-base") + src_text = [" I went to the ."] + results = [x["token_str"] for x in pbase(src_text)] + assert "Ġbathroom" in results + + @slow + def test_large_mask_filling(self): + plarge = pipeline(task="fill-mask", model="facebook/bart-large") + src_text = [" I went to the ."] + results = [x["token_str"] for x in plarge(src_text)] + expected_results = ["Ġbathroom", "Ġgym", "Ġwrong", "Ġmovies", "Ġhospital"] + self.assertListEqual(results, expected_results) + + @slow + def test_mnli_inference(self): + example_b = [0, 31414, 232, 328, 740, 1140, 69, 46078, 1588, 2, 1] + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b]) + + model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli").to( + torch_device + ) # eval called in from_pre + attention_mask = input_ids.ne(model.config.pad_token_id) + # Test that model hasn't changed + with torch.no_grad(): + outputs = model(input_ids=input_ids, attention_mask=attention_mask) - def test_inference_head(self): - model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(torch_device) + batched_logits = outputs.logits + expected_shape = torch.Size((2, 3)) + self.assertEqual(batched_logits.shape, expected_shape) + expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]]).to(torch_device) + logits_arr = batched_logits[0].detach() + + # Test that padding does not change results + input_ids_no_pad = _long_tensor([example_b[:-1]]) + 
attention_mask_no_pad = input_ids_no_pad.ne(model.config.pad_token_id) - # change to intended input - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - inputs_dict = prepare_bart_inputs_dict(model.config, input_ids, decoder_input_ids) with torch.no_grad(): - output = model(**inputs_dict)[0] - expected_shape = torch.Size((1, 11, model.config.vocab_size)) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = torch.tensor( - [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + logits2 = model(input_ids=input_ids_no_pad, attention_mask=attention_mask_no_pad).logits.squeeze() + assert_tensors_close(batched_logits[1], logits2, atol=1e-3) + assert_tensors_close(expected_slice, logits_arr, atol=1e-3) + + @slow + def test_xsum_summarization_same_as_fairseq(self): + model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum").to(torch_device) + tok = self.default_tokenizer + + PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" + + EXPECTED_SUMMARY = "California's largest power company has begun shutting off electricity to thousands of customers in the state." + dct = tok.batch_encode_plus( + [PGE_ARTICLE], + max_length=1024, + padding="max_length", + truncation=True, + return_tensors="pt", + ).to(torch_device) + + hypotheses_batch = model.generate( + input_ids=dct["input_ids"], + attention_mask=dct["attention_mask"], + num_beams=2, + max_length=62, + min_length=11, + length_penalty=1.0, + no_repeat_ngram_size=3, + early_stopping=True, + decoder_start_token_id=model.config.eos_token_id, ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - def test_seq_to_seq_generation(self): - hf = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(torch_device) - tok = BartTokenizer.from_pretrained('facebook/bart-large') + decoded = tok.batch_decode( + hypotheses_batch, + skip_special_tokens=True, + ) + self.assertEqual(EXPECTED_SUMMARY, decoded[0]) - batch_input = [ - # string 1, - # string 2, - # string 3, - # string 4, - ] + def test_xsum_config_generation_params(self): + config = BartConfig.from_pretrained("facebook/bart-large-xsum") + expected_params = dict(num_beams=6, do_sample=False, early_stopping=True, length_penalty=1.0) + config_params = {k: getattr(config, k, "MISSING") for k, v in expected_params.items()} + self.assertDictEqual(expected_params, config_params) + + @slow + def test_cnn_summarization_same_as_fairseq(self): + hf = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(torch_device) + tok = BartTokenizer.from_pretrained("facebook/bart-large") + + FRANCE_ARTICLE = ' Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." 
Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. 
Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' 
# @noq + + SHORTER_ARTICLE = ' (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. 
The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' # The below article tests that we don't add any hypotheses outside of the top n_beams + IRAN_ARTICLE = " (CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. 
Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." + + ARTICLE_SUBWAY = ' New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. 
It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' + dct = tok.batch_encode_plus( - batch_input, - max_length=512, + [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY], + max_length=1024, padding="max_length", truncation_strategy="only_first", truncation=True, return_tensors="pt", ) + self.assertEqual(1024, dct["input_ids"].shape[1]) hypotheses_batch = hf.generate( input_ids=dct["input_ids"].to(torch_device), attention_mask=dct["attention_mask"].to(torch_device), num_beams=2, ) + assert hypotheses_batch[:, 1].eq(0).all().item() EXPECTED = [ - # here expected 1, - # here expected 2, - # here expected 3, - # here expected 4, + "A French prosecutor says he is not aware of any video footage from on board the plane. Two German " + "magazines claim to have found a cell phone video showing the crash. The publications say they watched " + "the video, which was found by a source close to the investigation. All 150 on board Germanwings Flight " + "9525 were killed.", + "Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court " + "jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the " + "Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki said it was a " + "move toward greater justice.", + "U.S. and its negotiating partners reached a strong framework agreement with Iran. Peter Bergen: The " + "debate that has already begun will likely result in more heat than light. He says critics have made " + "dubious assumptions and doubtful assertions. Bergen says the goal was to block Iran from building a " + "nuclear weapon.", + "Liana Barrientos, 39, has been married 10 times, sometimes within two weeks of each other. Prosecutors " + "say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the " + "Bronx on Friday. 
If convicted, she faces up to four years in prison.", ] - generated = tok.batch_decode( + generated_summaries = tok.batch_decode( hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True ) - assert generated == EXPECTED + assert generated_summaries == EXPECTED From ade222b6e69c4e01853b57ee8329dd754b9f0bb1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 30 Dec 2020 10:45:54 +0000 Subject: [PATCH 04/51] finish mbart --- cookiecutter-template-MBART/mbart.rst | 80 + .../to_replace_mbart.py | 212 +++ .../tokenization_fast_mbart.py | 56 + .../tokenization_mbart.py | 54 + src/transformers/__init__.py | 7 +- src/transformers/models/auto/modeling_auto.py | 9 +- src/transformers/models/mbart/__init__.py | 13 +- .../models/mbart/configuration_mbart.py | 138 +- .../models/mbart/modeling_mbart.py | 1438 ++++++++++++++++- src/transformers/models/old_mbart/__init__.py | 33 + .../models/old_mbart/configuration_mbart.py | 105 ++ ...rt_mbart_original_checkpoint_to_pytorch.py | 50 + .../models/old_mbart/modeling_mbart.py | 70 + .../models/old_mbart/modeling_tf_mbart.py | 36 + .../models/old_mbart/tokenization_mbart.py | 232 +++ .../old_mbart/tokenization_mbart_fast.py | 248 +++ tests/test_modeling_mbart.py | 292 +++- tests/test_modeling_old_mbart.py | 192 +++ 18 files changed, 3155 insertions(+), 110 deletions(-) create mode 100644 cookiecutter-template-MBART/mbart.rst create mode 100644 cookiecutter-template-MBART/to_replace_mbart.py create mode 100644 cookiecutter-template-MBART/tokenization_fast_mbart.py create mode 100644 cookiecutter-template-MBART/tokenization_mbart.py mode change 100644 => 100755 src/transformers/models/mbart/modeling_mbart.py create mode 100644 src/transformers/models/old_mbart/__init__.py create mode 100644 src/transformers/models/old_mbart/configuration_mbart.py create mode 100644 src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/old_mbart/modeling_mbart.py create mode 100644 src/transformers/models/old_mbart/modeling_tf_mbart.py create mode 100644 src/transformers/models/old_mbart/tokenization_mbart.py create mode 100644 src/transformers/models/old_mbart/tokenization_mbart_fast.py create mode 100644 tests/test_modeling_old_mbart.py diff --git a/cookiecutter-template-MBART/mbart.rst b/cookiecutter-template-MBART/mbart.rst new file mode 100644 index 00000000000000..61fb1da06ac613 --- /dev/null +++ b/cookiecutter-template-MBART/mbart.rst @@ -0,0 +1,80 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +MBART +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The MBART model was proposed in ` +<>`__ by . 
+ +The abstract from the paper is the following: + +** + +Tips: + + + +MBartConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartConfig + :members: + + +MBartTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartTokenizer + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +MBartTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartTokenizerFast + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +MBartModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartModel + :members: forward + + +MBartForConditionalGeneration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartForConditionalGeneration + :members: forward + + +MBartForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartForSequenceClassification + :members: forward + + +MBartForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartForQuestionAnswering + :members: forward + + diff --git a/cookiecutter-template-MBART/to_replace_mbart.py b/cookiecutter-template-MBART/to_replace_mbart.py new file mode 100644 index 00000000000000..e05f0c039a7ee1 --- /dev/null +++ b/cookiecutter-template-MBART/to_replace_mbart.py @@ -0,0 +1,212 @@ +## Copyright 2020 The HuggingFace Team. All rights reserved. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. + +## This file is made so that specific statements may be copied inside existing files. This is useful to copy +## import statements in __init__.py, or to complete model lists in the AUTO files. +## +## It is to be used as such: +## Put '# To replace in: "FILE_PATH"' in order to indicate the contents will be copied in the file at path FILE_PATH +## Put '# Below: "STATEMENT"' in order to copy the contents below **the first occurence** of that line in the file at FILE_PATH +## Put '# Replace with:' followed by the lines containing the content to define the content +## End a statement with '# End.'. If starting a new statement without redefining the FILE_PATH, it will continue pasting +## content in that file. +## +## Put '## COMMENT' to comment on the file. 
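As an aside on the directive format defined in the comment block above: the sketch below is an editor's illustration, not part of the original patch. It assumes a hypothetical helper name `apply_replace_directives` and file-IO callables, ignores the conditional suffixes such as "if generating PyTorch", and is only meant to show how a '# Replace with:' body could be pasted below the first occurrence of its '# Below:' anchor; the authoritative consumer of these files is the add-new-model command in `src/transformers/commands/add_new_model.py`.

import re


def apply_replace_directives(directives_text, read_file, write_file):
    """Paste each '# Replace with:' body below the first occurrence of its '# Below:' anchor (sketch only)."""
    path, anchor, body, collecting = None, None, [], False
    for line in directives_text.splitlines():
        if line.startswith('# To replace in:'):
            path = re.search(r'"(.+)"', line).group(1)
        elif line.startswith('# Below:'):
            anchor = re.search(r'"(.+)"', line).group(1)
        elif line.startswith('# Replace with:'):
            collecting, body = True, []
        elif line.startswith('# End.'):
            collecting = False
            if path and anchor and body:
                target = read_file(path).splitlines()
                for i, target_line in enumerate(target):
                    if anchor in target_line:          # first occurrence only
                        target[i + 1:i + 1] = body
                        break
                write_file(path, "\n".join(target) + "\n")
        elif collecting and not line.startswith('##'):
            body.append(line)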
+ + +# To replace in: "src/transformers/__init__.py" +# Below: "if is_torch_available():" if generating PyTorch +# Replace with: + + from .models.mbart import ( + MBART_PRETRAINED_MODEL_ARCHIVE_LIST, + MBartForConditionalGeneration, + MBartForQuestionAnswering, + MBartForSequenceClassification, + MBartModel, + ) +# End. + +# Below: "if is_tf_available():" if generating TensorFlow +# Replace with: + + from .models.mbart import ( + TFMBartForConditionalGeneration, + TFMBartModel, + TFMBartPreTrainedModel, + ) +# End. + +# Below: "if is_tokenizers_available():" +# Replace with: + from .models.mbart import MBartTokenizerFast +# End. + +# Below: "from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig" +# Replace with: +from .models.mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig, MBartTokenizer +# End. + + + +# To replace in: "src/transformers/models/auto/configuration_auto.py" +# Below: "# Add configs here" +# Replace with: + ("mbart", MBartConfig), +# End. + +# Below: "# Add archive maps here" +# Replace with: + MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, +# End. + +# Below: "from ..albert.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig", +# Replace with: +from ..mbart.configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig +# End. + +# Below: "# Add full (and cased) model names here" +# Replace with: + ("mbart", "MBart"), +# End. + + + +# To replace in: "src/transformers/models/auto/modeling_auto.py" if generating PyTorch +# Below: "from .configuration_auto import (" +# Replace with: + MBartConfig, +# End. + +# Below: "# Add modeling imports here" +# Replace with: +from ..mbart.modeling_mbart import ( + MBartForConditionalGeneration, + MBartForQuestionAnswering, + MBartForSequenceClassification, + MBartModel, +) +# End. + +# Below: "# Base model mapping" +# Replace with: + (MBartConfig, MBartModel), +# End. + +# Below: "# Model with LM heads mapping" +# Replace with: + + (MBartConfig, MBartForConditionalGeneration), +# End. + +# Below: "# Model for Causal LM mapping" +# Replace with: +# End. + +# Below: "# Model for Masked LM mapping" +# Replace with: +# End. + +# Below: "# Model for Sequence Classification mapping" +# Replace with: + (MBartConfig, MBartForSequenceClassification), +# End. + +# Below: "# Model for Question Answering mapping" +# Replace with: + (MBartConfig, MBartForQuestionAnswering), +# End. + +# Below: "# Model for Token Classification mapping" +# Replace with: +# End. + +# Below: "# Model for Multiple Choice mapping" +# Replace with: +# End. + +# Below: "# Model for Seq2Seq Causal LM mapping" +# Replace with: + + (MBartConfig, MBartForConditionalGeneration), +# End. + +# To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow +# Below: "from .configuration_auto import (" +# Replace with: + MBartConfig, +# End. + +# Below: "# Add modeling imports here" +# Replace with: +from ..mbart.modeling_tf_mbart import ( + TFMBartForConditionalGeneration, + TFMBartModel, +) +# End. + +# Below: "# Base model mapping" +# Replace with: + (MBartConfig, TFMBartModel), +# End. + +# Below: "# Model with LM heads mapping" +# Replace with: + + (MBartConfig, TFMBartForConditionalGeneration), +# End. + +# Below: "# Model for Causal LM mapping" +# Replace with: +# End. + +# Below: "# Model for Masked LM mapping" +# Replace with: +# End. + +# Below: "# Model for Sequence Classification mapping" +# Replace with: +# End. 
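Once the auto-model mappings above (and the remaining ones below) are in place, the Auto classes are expected to resolve MBART checkpoints directly. The snippet is an editor's hedged usage sketch, not a tested example from this patch; the exact class returned depends on the final mapping order in `modeling_auto.py`.

from transformers import AutoConfig, AutoModelForSeq2SeqLM

# "facebook/mbart-large-cc25" is the checkpoint referenced throughout this patch.
config = AutoConfig.from_pretrained("facebook/mbart-large-cc25")            # expected: MBartConfig
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-cc25")  # expected: MBartForConditionalGeneration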
+ +# Below: "# Model for Question Answering mapping" +# Replace with: +# End. + +# Below: "# Model for Token Classification mapping" +# Replace with: +# End. + +# Below: "# Model for Multiple Choice mapping" +# Replace with: +# End. + +# Below: "# Model for Seq2Seq Causal LM mapping" +# Replace with: + + (MBartConfig, TFMBartForConditionalGeneration), +# End. + +# To replace in: "utils/check_repo.py" if generating PyTorch + +# Below: "models to ignore for model xxx mapping" +# Replace with: +"MBartEncoder", + "MBartDecoder", +# End. + +# Below: "models to ignore for not tested" +# Replace with: +"MBartEncoder", # Building part of bigger (tested) model. + "MBartDecoder", # Building part of bigger (tested) model. +# End. diff --git a/cookiecutter-template-MBART/tokenization_fast_mbart.py b/cookiecutter-template-MBART/tokenization_fast_mbart.py new file mode 100644 index 00000000000000..c7b1a0f4a7cd7c --- /dev/null +++ b/cookiecutter-template-MBART/tokenization_fast_mbart.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for MBART.""" +from ...utils import logging +from ..bert.tokenization_bert_fast import BertTokenizerFast +from .tokenization_mbart import MBartTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/mbart-large-cc25": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "facebook/mbart-large-cc25": {"do_lower_case": False}, +} + + +class MBartTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.MBartTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = MBartTokenizer diff --git a/cookiecutter-template-MBART/tokenization_mbart.py b/cookiecutter-template-MBART/tokenization_mbart.py new file mode 100644 index 00000000000000..c3035cf5cea94e --- /dev/null +++ b/cookiecutter-template-MBART/tokenization_mbart.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for MBART.""" +from ...utils import logging +from ..bert.tokenization_bert import BertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/mbart-large-cc25": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "facebook/mbart-large-cc25": {"do_lower_case": False}, +} + + +class MBartTokenizer(BertTokenizer): + r""" + Construct a MBART tokenizer. + + :class:`~transformers.MBartTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end + tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 789090a12a449a..d4b766fffdb982 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -527,7 +527,12 @@ LxmertXLayer, ) from .models.marian import MarianMTModel - from .models.mbart import MBartForConditionalGeneration, MBartModel + from .models.mbart import ( + MBartForConditionalGeneration, + MBartForQuestionAnswering, + MBartForSequenceClassification, + MBartModel, + ) from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings from .models.mobilebert import ( MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3fc5c702e7df0d..bb1a6258e6d873 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -111,7 +111,12 @@ ) from ..lxmert.modeling_lxmert import LxmertForPreTraining, LxmertForQuestionAnswering, LxmertModel from ..marian.modeling_marian import MarianMTModel -from ..mbart.modeling_mbart import MBartForConditionalGeneration, MBartModel +from ..mbart.modeling_mbart import ( + MBartForConditionalGeneration, + MBartForQuestionAnswering, + MBartForSequenceClassification, + MBartModel, +) from ..mobilebert.modeling_mobilebert import ( MobileBertForMaskedLM, MobileBertForMultipleChoice, @@ -428,6 +433,7 @@ (AlbertConfig, AlbertForSequenceClassification), (CamembertConfig, CamembertForSequenceClassification), (XLMRobertaConfig, XLMRobertaForSequenceClassification), + (MBartConfig, MBartForSequenceClassification), (BartConfig, BartForSequenceClassification), (LongformerConfig, LongformerForSequenceClassification), (RobertaConfig, RobertaForSequenceClassification), @@ -457,6 +463,7 @@ (AlbertConfig, AlbertForQuestionAnswering), (CamembertConfig, CamembertForQuestionAnswering), (BartConfig, BartForQuestionAnswering), + (MBartConfig, MBartForQuestionAnswering), 
(LongformerConfig, LongformerForQuestionAnswering), (XLMRobertaConfig, XLMRobertaForQuestionAnswering), (RobertaConfig, RobertaForQuestionAnswering), diff --git a/src/transformers/models/mbart/__init__.py b/src/transformers/models/mbart/__init__.py index 2fa8876085ed72..7dab087f6fd425 100644 --- a/src/transformers/models/mbart/__init__.py +++ b/src/transformers/models/mbart/__init__.py @@ -15,9 +15,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available -from .configuration_mbart import MBartConfig +from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig +from .tokenization_mbart import MBartTokenizer if is_sentencepiece_available(): @@ -27,7 +27,14 @@ from .tokenization_mbart_fast import MBartTokenizerFast if is_torch_available(): - from .modeling_mbart import MBartForConditionalGeneration, MBartModel + from .modeling_mbart import ( + MBART_PRETRAINED_MODEL_ARCHIVE_LIST, + MBartForConditionalGeneration, + MBartForQuestionAnswering, + MBartForSequenceClassification, + MBartModel, + MBartPreTrainedModel, + ) if is_tf_available(): from .modeling_tf_mbart import TFMBartForConditionalGeneration diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index c8b4540e1efd53..c65823585bc9c5 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. +# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,33 +12,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MBART configuration """ +""" MBART model configuration """ +from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..bart.configuration_bart import BartConfig logger = logging.get_logger(__name__) MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json", "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/config.json", + # See all MBART models at https://huggingface.co/models?filter=mbart } -class MBartConfig(BartConfig): - """ - This is the configuration class to store the configuration of a - :class:`~transformers.MBartForConditionalGeneration`. It is used to instantiate a BART model according to the - specified arguments, defining the model architecture. +class MBartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.MBartModel`. It is used to + instantiate an MBART model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MBART `facebook/mbart-large-cc25 + `__ architecture. 
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + Args: - vocab_size (:obj:`int`, `optional`, defaults to 250027): + vocab_size (:obj:`int`, `optional`, defaults to 50265): Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.MBartForConditionalGeneration`. + :obj:`inputs_ids` passed when calling :class:`~transformers.MBartModel` or + :class:`~transformers.TFMBartModel`. d_model (:obj:`int`, `optional`, defaults to 1024): Dimensionality of the layers and the pooler layer. encoder_layers (:obj:`int`, `optional`, defaults to 12): @@ -50,9 +53,9 @@ class MBartConfig(BartConfig): decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer decoder. decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. @@ -69,37 +72,94 @@ class MBartConfig(BartConfig): just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm after embeddings. Only True for Bart. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`): - Why not add another layernorm? - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. Should be equal to :obj:`pad_token_id+1`. 
- is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model - force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). - """ + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models) + + Example:: + + >>> from transformers import MBartModel, MBartConfig + + >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration + >>> configuration = MBartConfig() + >>> # Initializing a model from the facebook/mbart-large-cc25 style configuration + >>> model = MBartModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ model_type = "mbart" - keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + classifier_dropout=0.0, + scale_embedding=False, + gradient_checkpointing=False, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.gradient_checkpointing = gradient_checkpointing + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py old mode 100644 new mode 100755 index f4aa39b075145c..b4ebcf269a5601 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1,4 +1,5 @@ -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# coding=utf-8 +# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -11,60 +12,1421 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" PyTorch MBART model. """ -from ..bart.modeling_bart import BartForConditionalGeneration, BartModel + +import math +import random +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging from .configuration_mbart import MBartConfig +logger = logging.get_logger(__name__) + _CONFIG_FOR_DOC = "MBartConfig" _TOKENIZER_FOR_DOC = "MBartTokenizer" + MBART_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/mbart-large-cc25", - "facebook/mbart-large-en-ro", - # See all multilingual BART models at https://huggingface.co/models?filter=mbart + # See all MBART models at https://huggingface.co/models?filter=mbart ] -class MBartModel(BartModel): - r""" - This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate - documentation alongside usage examples. +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id - config_class = MBartConfig - _keys_to_ignore_on_load_missing = [ - "encoder.embed_positions.weight", - "decoder.embed_positions.weight", - ] - _keys_to_ignore_on_save = [ - "encoder.embed_positions.weight", - "decoder.embed_positions.weight", - ] + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + return shifted_input_ids -class MBartForConditionalGeneration(BartForConditionalGeneration): - r""" - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. - - Examples:: - >>> from transformers import MBartForConditionalGeneration, MBartTokenizer - >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") - >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro") - >>> article = "UN Chief Says There Is No Military Solution in Syria" - >>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], return_tensors="pt") - >>> translated_tokens = model.generate(**batch) - >>> translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] - >>> assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria" + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. 
""" - model_type = "mbart" + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +def MBartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): + if torch.cuda.is_available(): + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) + + +class MBartLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): + assert padding_idx is not None, "`padding_idx` should not be None, but of type int" + num_embeddings + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models dont have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim, padding_idx=padding_idx) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions + self.offset) + + +class MBartAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
+ self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class MBartEncoderLayer(nn.Module): + def __init__(self, config: MBartConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = MBartAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = MBartLayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = MBartLayerNorm(self.embed_dim) + + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
+ """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class MBartDecoderLayer(nn.Module): + def __init__(self, config: MBartConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = MBartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = MBartLayerNorm(self.embed_dim) + self.encoder_attn = MBartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = MBartLayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = MBartLayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
+ """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class MBartClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class MBartPreTrainedModel(PreTrainedModel): config_class = MBartConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 
6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +MBART_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.MBartConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +MBART_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import MBartTokenizer, MBartForConditionalGeneration, MBartConfig + + >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25') + >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) +""" + +MBART_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using :class:`~transformers.MBartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the :obj:`input_ids` to the right, following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_mbart._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. 
+ encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class MBartEncoder(MBartPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`MBartEncoderLayer`. 
+ + Args: + config: MBartConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = MBartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) + self.layers = nn.ModuleList([MBartEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = MBartLayerNorm(embed_dim) + self.layer_norm = MBartLayerNorm(config.d_model) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.MBartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class MBartDecoder(MBartPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`MBartDecoderLayer` + + Args: + config: MBartConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = MBartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + ) + self.layers = nn.ModuleList([MBartDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = MBartLayerNorm(config.d_model) + self.layer_norm = MBartLayerNorm(config.d_model) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.MBartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. 
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_decoder_cache = () if use_cache else None + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += 
(hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False): + if use_cache: + raise ValueError( + "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`." + ) + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare MBART Model outputting raw hidden-states without any specific head on top.", + MBART_START_DOCSTRING, +) +class MBartModel(MBartPreTrainedModel): + def __init__(self, config: MBartConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = MBartEncoder(config, self.shared) + self.decoder = MBartDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/mbart-large-cc25", + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else 
self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The MBART Model with a language modeling head. 
Can be used for summarization.", MBART_START_DOCSTRING
+)
+class MBartForConditionalGeneration(MBartPreTrainedModel):
+    base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
-        "model.encoder.embed_positions.weight",
-        "model.decoder.embed_positions.weight",
-    ]
-    _keys_to_ignore_on_save = [
-        "model.encoder.embed_positions.weight",
-        "model.decoder.embed_positions.weight",
+        r"final_logits_bias",
+        r"encoder\.version",
+        r"decoder\.version",
+        r"lm_head\.weight",
     ]
+
+    def __init__(self, config: MBartConfig):
+        super().__init__(config)
+        self.model = MBartModel(config)
+        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
+        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
+
+        self.init_weights()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
+        new_embeddings = super().resize_token_embeddings(new_num_tokens)
+        self._resize_final_logits_bias(new_num_tokens)
+        return new_embeddings
+
+    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
+        old_num_tokens = self.final_logits_bias.shape[-1]
+        if new_num_tokens <= old_num_tokens:
+            new_bias = self.final_logits_bias[:, :new_num_tokens]
+        else:
+            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
+            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
+        self.register_buffer("final_logits_bias", new_bias)
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    @add_end_docstrings(MBART_GENERATION_EXAMPLE)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
+            config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
+            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+
+        Returns:
+
+        Conditional generation example::
+
+            >>> from transformers import MBartTokenizer, MBartForConditionalGeneration
+            >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+            >>> TXT = "My friends are <mask> but they eat too many carbs."
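+            >>> # the model is expected to fill in the <mask> token; the top 5 candidates are decoded below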
+ + >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + MBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
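+
+    Here, "pooled output" means the decoder hidden state at the final ``<eos>`` token of ``input_ids``. A rough sketch
+    of the pooling performed inside :meth:`forward` (shown only for clarity, mirroring the code below)::
+
+        eos_mask = input_ids.eq(config.eos_token_id)
+        sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[:, -1, :]
+        logits = classification_head(sentence_representation)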
+ """, + MBART_START_DOCSTRING, +) +class MBartForSequenceClassification(MBartPreTrainedModel): + def __init__(self, config: MBartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = MBartModel(config) + self.classification_head = MBartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + + @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/mbart-large-cc25", + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + MBART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + MBART_START_DOCSTRING, +) +class MBartForQuestionAnswering(MBartPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = MBartModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.model._init_weights(self.qa_outputs) + + @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/mbart-large-cc25", + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + 
decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) diff --git a/src/transformers/models/old_mbart/__init__.py b/src/transformers/models/old_mbart/__init__.py new file mode 100644 index 00000000000000..2fa8876085ed72 --- /dev/null +++ b/src/transformers/models/old_mbart/__init__.py @@ -0,0 +1,33 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available +from .configuration_mbart import MBartConfig + + +if is_sentencepiece_available(): + from .tokenization_mbart import MBartTokenizer + +if is_tokenizers_available(): + from .tokenization_mbart_fast import MBartTokenizerFast + +if is_torch_available(): + from .modeling_mbart import MBartForConditionalGeneration, MBartModel + +if is_tf_available(): + from .modeling_tf_mbart import TFMBartForConditionalGeneration diff --git a/src/transformers/models/old_mbart/configuration_mbart.py b/src/transformers/models/old_mbart/configuration_mbart.py new file mode 100644 index 00000000000000..c8b4540e1efd53 --- /dev/null +++ b/src/transformers/models/old_mbart/configuration_mbart.py @@ -0,0 +1,105 @@ +# coding=utf-8 +# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MBART configuration """ + +from ...utils import logging +from ..bart.configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json", + "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/config.json", +} + + +class MBartConfig(BartConfig): + """ + This is the configuration class to store the configuration of a + :class:`~transformers.MBartForConditionalGeneration`. It is used to instantiate a BART model according to the + specified arguments, defining the model architecture. 
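+
+    Example (an illustrative sketch; the values shown simply restate the documented defaults)::
+
+        >>> from transformers import MBartConfig, MBartForConditionalGeneration
+        >>> config = MBartConfig(vocab_size=250027, d_model=1024, encoder_layers=12, decoder_layers=12)
+        >>> model = MBartForConditionalGeneration(config)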
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 250027):
+            Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.MBartForConditionalGeneration`.
+        d_model (:obj:`int`, `optional`, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the classifier.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            This should be completed, specific to marian.
+        normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Call layernorm before attention ops.
+        normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Call layernorm after embeddings. Only True for Bart.
+        static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Don't learn positional embeddings, use sinusoidal.
+        add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to apply a final layer norm after the last encoder / decoder layer (:obj:`True` for MBART).
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Scale embeddings by dividing by sqrt(d_model).
+        eos_token_id (:obj:`int`, `optional`, defaults to 2):
+            End of stream token id.
+        pad_token_id (:obj:`int`, `optional`, defaults to 1):
+            Padding token id.
+        bos_token_id (:obj:`int`, `optional`, defaults to 0):
+            Beginning of stream token id.
+ encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): + How many extra learned positional embeddings to use. Should be equal to :obj:`pad_token_id+1`. + is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether this is an encoder/decoder model + force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). + """ + + model_type = "mbart" + keys_to_ignore_at_inference = ["past_key_values"] diff --git a/src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py b/src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py new file mode 100644 index 00000000000000..46c933d7a4edc2 --- /dev/null +++ b/src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py @@ -0,0 +1,50 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import torch + +from transformers import BartForConditionalGeneration, MBartConfig + +from ..bart.convert_bart_original_pytorch_checkpoint_to_pytorch import remove_ignore_keys_ + + +def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"): + state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] + remove_ignore_keys_(state_dict) + vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] + mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size) + state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] + model = BartForConditionalGeneration(mbart_config) + model.model.load_state_dict(state_dict) + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." 
+ ) + parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument( + "--hf_config", + default="facebook/mbart-large-cc25", + type=str, + help="Which huggingface architecture to use: bart-large-xsum", + ) + args = parser.parse_args() + model = convert_fairseq_mbart_checkpoint_from_disk(args.fairseq_path, hf_config_path=args.hf_config) + model.save_pretrained(args.pytorch_dump_folder_path) diff --git a/src/transformers/models/old_mbart/modeling_mbart.py b/src/transformers/models/old_mbart/modeling_mbart.py new file mode 100644 index 00000000000000..f4aa39b075145c --- /dev/null +++ b/src/transformers/models/old_mbart/modeling_mbart.py @@ -0,0 +1,70 @@ +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..bart.modeling_bart import BartForConditionalGeneration, BartModel +from .configuration_mbart import MBartConfig + + +_CONFIG_FOR_DOC = "MBartConfig" +_TOKENIZER_FOR_DOC = "MBartTokenizer" + +MBART_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/mbart-large-cc25", + "facebook/mbart-large-en-ro", + # See all multilingual BART models at https://huggingface.co/models?filter=mbart +] + + +class MBartModel(BartModel): + r""" + This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = MBartConfig + _keys_to_ignore_on_load_missing = [ + "encoder.embed_positions.weight", + "decoder.embed_positions.weight", + ] + _keys_to_ignore_on_save = [ + "encoder.embed_positions.weight", + "decoder.embed_positions.weight", + ] + + +class MBartForConditionalGeneration(BartForConditionalGeneration): + r""" + This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the + appropriate documentation alongside usage examples. 
+ + Examples:: + >>> from transformers import MBartForConditionalGeneration, MBartTokenizer + >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") + >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro") + >>> article = "UN Chief Says There Is No Military Solution in Syria" + >>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], return_tensors="pt") + >>> translated_tokens = model.generate(**batch) + >>> translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] + >>> assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria" + """ + model_type = "mbart" + config_class = MBartConfig + _keys_to_ignore_on_load_missing = [ + "model.encoder.embed_positions.weight", + "model.decoder.embed_positions.weight", + ] + _keys_to_ignore_on_save = [ + "model.encoder.embed_positions.weight", + "model.decoder.embed_positions.weight", + ] diff --git a/src/transformers/models/old_mbart/modeling_tf_mbart.py b/src/transformers/models/old_mbart/modeling_tf_mbart.py new file mode 100644 index 00000000000000..23b30fd4b36683 --- /dev/null +++ b/src/transformers/models/old_mbart/modeling_tf_mbart.py @@ -0,0 +1,36 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF mBART model, originally from fairseq.""" +from ...file_utils import add_start_docstrings +from ...utils import logging +from ..bart.modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration +from .configuration_mbart import MBartConfig + + +_CONFIG_FOR_DOC = "MBartConfig" + +START_DOCSTRING = BART_START_DOCSTRING.replace( + "inherits from :class:`~transformers.TFPreTrainedModel`", + "inherits from :class:`~transformers.TFBartForConditionalGeneration`", +).replace("BartConfig", _CONFIG_FOR_DOC) + + +logger = logging.get_logger(__name__) + + +@add_start_docstrings("mBART (multilingual BART) model for machine translation", START_DOCSTRING) +class TFMBartForConditionalGeneration(TFBartForConditionalGeneration): + config_class = MBartConfig + # All the code is in src/transformers/models/bart/modeling_tf_bart.py diff --git a/src/transformers/models/old_mbart/tokenization_mbart.py b/src/transformers/models/old_mbart/tokenization_mbart.py new file mode 100644 index 00000000000000..e8425fe8c539e1 --- /dev/null +++ b/src/transformers/models/old_mbart/tokenization_mbart.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +from ...file_utils import add_start_docstrings +from ...tokenization_utils import BatchEncoding +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING +from ...utils import logging +from ..xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer + + +logger = logging.get_logger(__name__) + +_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] +SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model" + +FAIRSEQ_LANGUAGE_CODES = [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", +] + + +class MBartTokenizer(XLMRobertaTokenizer): + """ + Construct an MBART tokenizer. + + :class:`~transformers.MBartTokenizer` is a subclass of :class:`~transformers.XLMRobertaTokenizer` and adds a new + :meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch` + + Refer to superclass :class:`~transformers.XLMRobertaTokenizer` for usage examples and documentation concerning the + initialization parameters and other methods. + + .. warning:: + + ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work + properly. + + The tokenization method is `` `` for source language documents, and `` + ``` for target language documents. + + Examples:: + + >>> from transformers import MBartTokenizer + >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro') + >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" + >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" + >>> batch: dict = tokenizer.prepare_seq2seq_batch( + ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt" + ... 
) + + """ + + vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} + max_model_input_sizes = {m: 1024 for m in _all_mbart_models} + pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} + + prefix_tokens: List[int] = [] + suffix_tokens: List[int] = [] + + def __init__(self, *args, tokenizer_file=None, **kwargs): + super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) + + self.sp_model_size = len(self.sp_model) + self.lang_code_to_id = { + code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES) + } + self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()} + self.cur_lang_code = self.lang_code_to_id["en_XX"] + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + + self.fairseq_tokens_to_ids.update(self.lang_code_to_id) + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + self._additional_special_tokens = list(self.lang_code_to_id.keys()) + self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) + + @property + def vocab_size(self): + return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 # Plus 1 for the mask token + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence: + + - ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` + - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]`` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. 
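+
+        For a rough illustration (the ids ``47`` and ``48`` are made up; ``2`` is the ``eos`` id and ``250004`` is the
+        ``en_XX`` language code), with ``src_lang="en_XX"``::
+
+            >>> tokenizer.build_inputs_with_special_tokens([47, 48])
+            [47, 48, 2, 250004]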
+ + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + src_lang: str = "en_XX", + tgt_texts: Optional[List[str]] = None, + tgt_lang: str = "ro_RO", + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + truncation: bool = True, + padding: str = "longest", + return_tensors: Optional[str] = None, + add_prefix_space: bool = False, # ignored + **kwargs, + ) -> BatchEncoding: + if max_length is None: + max_length = self.model_max_length + self.set_src_lang_special_tokens(src_lang) + model_inputs: BatchEncoding = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + self.set_tgt_lang_special_tokens(tgt_lang) + + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=True, + **kwargs, + )["input_ids"] + model_inputs["labels"] = labels + self.set_src_lang_special_tokens(src_lang) # sets to src_lang + return model_inputs + + def set_src_lang_special_tokens(self, src_lang) -> None: + """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" + self.cur_lang_code = self.lang_code_to_id[src_lang] + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + def set_tgt_lang_special_tokens(self, lang: str) -> None: + """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" + self.cur_lang_code = self.lang_code_to_id[lang] + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] diff --git a/src/transformers/models/old_mbart/tokenization_mbart_fast.py b/src/transformers/models/old_mbart/tokenization_mbart_fast.py new file mode 100644 index 00000000000000..56e7c065f39146 --- /dev/null +++ b/src/transformers/models/old_mbart/tokenization_mbart_fast.py @@ -0,0 +1,248 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Optional + +from tokenizers import processors + +from ...file_utils import add_start_docstrings, is_sentencepiece_available +from ...tokenization_utils import BatchEncoding +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING +from ...utils import logging +from ..xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast + + +if is_sentencepiece_available(): + from .tokenization_mbart import MBartTokenizer +else: + MBartTokenizer = None + + +logger = logging.get_logger(__name__) + +_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] +SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model" +tokenizer_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json" + +FAIRSEQ_LANGUAGE_CODES = [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", +] + + +class MBartTokenizerFast(XLMRobertaTokenizerFast): + """ + Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE + `__. + + :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds + a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`. + + Refer to superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning + the initialization parameters and other methods. + + .. warning:: + ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work + properly. + + The tokenization method is `` `` for source language documents, and `` + ``` for target language documents. + + Examples:: + + >>> from transformers import MBartTokenizerFast + >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro') + >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" + >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" + >>> batch: dict = tokenizer.prepare_seq2seq_batch( + ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt" + ... ) + """ + + vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} + max_model_input_sizes = {m: 1024 for m in _all_mbart_models} + pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} + slow_tokenizer_class = MBartTokenizer + + prefix_tokens: List[int] = [] + suffix_tokens: List[int] = [] + + def __init__(self, *args, tokenizer_file=None, **kwargs): + super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) + + self.cur_lang_code = self.convert_tokens_to_ids("en_XX") + self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) + + self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES}) + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. 
+ token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. The special tokens depend on calling set_lang. + + An MBART sequence has the following format, where ``X`` represents the sequence: + + - ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` + - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]`` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + src_lang: str = "en_XX", + tgt_texts: Optional[List[str]] = None, + tgt_lang: str = "ro_RO", + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + truncation: bool = True, + padding: str = "longest", + return_tensors: str = None, + **kwargs, + ) -> BatchEncoding: + if max_length is None: + max_length = self.model_max_length + self.set_src_lang_special_tokens(src_lang) + model_inputs: BatchEncoding = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + self.set_tgt_lang_special_tokens(tgt_lang) + + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=True, + **kwargs, + )["input_ids"] + model_inputs["labels"] = labels + self.set_src_lang_special_tokens(src_lang) # sets to src_lang + return model_inputs + + def set_src_lang_special_tokens(self, src_lang) -> None: + """Reset the special tokens to the source lang setting. 
No prefix and suffix=[eos, src_lang_code].""" + self.cur_lang_code = self.convert_tokens_to_ids(src_lang) + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + def set_tgt_lang_special_tokens(self, lang: str) -> None: + """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" + self.cur_lang_code = self.convert_tokens_to_ids(lang) + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index 2a43650febbd25..67e6df2ed5d46a 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -1,4 +1,5 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# coding=utf-8 +# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,61 +12,296 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch MBART model. 
""" + +import copy +import tempfile import unittest +import timeout_decorator # noqa + from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device -from .test_modeling_common import ModelTesterMixin +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor if is_torch_available(): import torch from transformers import ( - AutoModelForSeq2SeqLM, - AutoTokenizer, BatchEncoding, MBartConfig, MBartForConditionalGeneration, + MBartForQuestionAnswering, + MBartForSequenceClassification, MBartModel, + MBartTokenizer, ) + from transformers.models.mbart.modeling_mbart import MBartDecoder, MBartEncoder -EN_CODE = 250004 -RO_CODE = 250020 +def prepare_mbart_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + } @require_torch -class ModelTester: - def __init__(self, parent): - self.config = MBartConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, +class MBartModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = MBartConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + 
attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_mbart_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict def prepare_config_and_inputs_for_common(self): - return self.config, {} + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = MBartModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] -@require_torch -class SelectiveCommonTest(unittest.TestCase): - all_model_classes = (MBartForConditionalGeneration, MBartModel) if is_torch_available() else () + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = MBartModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state - test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = MBartEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = MBartDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class MBartModelTest(ModelTesterMixin, 
GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + (MBartModel, MBartForConditionalGeneration, MBartForSequenceClassification, MBartForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = (MBartForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False def setUp(self): - self.model_tester = ModelTester(self) + self.model_tester = MBartModelTester(self) + self.config_tester = ConfigTester(self, config_class=MBartConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # MBartForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (MBartModel, MBartForConditionalGeneration, MBartForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = MBartForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." 
+ else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) @require_torch @@ -77,13 +313,13 @@ class AbstractSeq2SeqIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): - cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) + cls.tokenizer = MBartTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) return cls @cached_property def model(self): """Only load the model if needed.""" - model = AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) + model = MBartForConditionalGeneration.from_pretrained(self.checkpoint_name).to(torch_device) if "cuda" in torch_device: model = model.half() return model @@ -102,7 +338,7 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): "Şeful ONU declară că nu există o soluţie militară în Siria", 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţa şi mizeria pentru milioane de oameni.', ] - expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, EN_CODE] + expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, 250004] @slow def test_enro_generate_one(self): diff --git a/tests/test_modeling_old_mbart.py b/tests/test_modeling_old_mbart.py new file mode 100644 index 00000000000000..2a43650febbd25 --- /dev/null +++ b/tests/test_modeling_old_mbart.py @@ -0,0 +1,192 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_modeling_common import ModelTesterMixin + + +if is_torch_available(): + import torch + + from transformers import ( + AutoModelForSeq2SeqLM, + AutoTokenizer, + BatchEncoding, + MBartConfig, + MBartForConditionalGeneration, + MBartModel, + ) + + +EN_CODE = 250004 +RO_CODE = 250020 + + +@require_torch +class ModelTester: + def __init__(self, parent): + self.config = MBartConfig( + vocab_size=99, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + add_final_layer_norm=True, + ) + + def prepare_config_and_inputs_for_common(self): + return self.config, {} + + +@require_torch +class SelectiveCommonTest(unittest.TestCase): + all_model_classes = (MBartForConditionalGeneration, MBartModel) if is_torch_available() else () + + test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save + + def setUp(self): + self.model_tester = ModelTester(self) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class AbstractSeq2SeqIntegrationTest(unittest.TestCase): + maxDiff = 1000 # longer string compare tracebacks + checkpoint_name = None + + @classmethod + def setUpClass(cls): + cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) + return cls + + @cached_property + def model(self): + """Only load the model if needed.""" + model = AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) + if "cuda" in torch_device: + model = model.half() + return model + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): + checkpoint_name = "facebook/mbart-large-en-ro" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", + ] + tgt_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţa şi mizeria pentru milioane de oameni.', + ] + expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, EN_CODE] + + @slow + def test_enro_generate_one(self): + batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch( + ["UN Chief Says There Is No Military Solution in Syria"], return_tensors="pt" + ).to(torch_device) + translated_tokens = self.model.generate(**batch) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + self.assertEqual(self.tgt_text[0], decoded[0]) + # self.assertEqual(self.tgt_text[1], decoded[1]) + + @slow + def test_enro_generate_batch(self): + batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch(self.src_text, return_tensors="pt").to( + torch_device + ) + translated_tokens = self.model.generate(**batch) + decoded = 
self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + assert self.tgt_text == decoded + + def test_mbart_enro_config(self): + mbart_models = ["facebook/mbart-large-en-ro"] + expected = {"scale_embedding": True, "output_past": True} + for name in mbart_models: + config = MBartConfig.from_pretrained(name) + self.assertTrue(config.is_valid_mbart()) + for k, v in expected.items(): + try: + self.assertEqual(v, getattr(config, k)) + except AssertionError as e: + e.args += (name, k) + raise + + def test_mbart_fast_forward(self): + config = MBartConfig( + vocab_size=99, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + add_final_layer_norm=True, + ) + lm_model = MBartForConditionalGeneration(config).to(torch_device) + context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device) + summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device) + result = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) + expected_shape = (*summary.shape, config.vocab_size) + self.assertEqual(result.logits.shape, expected_shape) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): + checkpoint_name = "facebook/mbart-large-cc25" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + " I ate lunch twice yesterday", + ] + tgt_text = ["Şeful ONU declară că nu există o soluţie militară în Siria", "to be padded"] + + @unittest.skip("This test is broken, still generates english") + def test_cc25_generate(self): + inputs = self.tokenizer.prepare_seq2seq_batch([self.src_text[0]], return_tensors="pt").to(torch_device) + translated_tokens = self.model.generate( + input_ids=inputs["input_ids"].to(torch_device), + decoder_start_token_id=self.tokenizer.lang_code_to_id["ro_RO"], + ) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + self.assertEqual(self.tgt_text[0], decoded[0]) + + @slow + def test_fill_mask(self): + inputs = self.tokenizer.prepare_seq2seq_batch(["One of the best I ever read!"], return_tensors="pt").to( + torch_device + ) + outputs = self.model.generate( + inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"], num_beams=1 + ) + prediction: str = self.tokenizer.batch_decode( + outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True + )[0] + self.assertEqual(prediction, "of the best books I ever read!") From 0c229e01e14d949ac5457c299feed05bca4d7d51 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 30 Dec 2020 10:48:55 +0000 Subject: [PATCH 05/51] delete unnecessary line --- tests/test_modeling_mbart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index 67e6df2ed5d46a..d4ea6f400abad1 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -364,7 +364,6 @@ def test_mbart_enro_config(self): expected = {"scale_embedding": True, "output_past": True} for name in mbart_models: config = MBartConfig.from_pretrained(name) - self.assertTrue(config.is_valid_mbart()) for k, v in expected.items(): try: self.assertEqual(v, getattr(config, k)) From 69eec0cc89a744eb3d76776729b502109332f089 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 30 Dec 2020 12:28:56 +0000 Subject: [PATCH 06/51] init pegasus --- 
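A quick illustration of the key-renaming idea used by convert_pegasus_tf_to_pytorch.py further down in this
patch: TF checkpoint variable names are mapped onto Bart-style state_dict keys by applying an ordered list of
substring replacements (PATTERNS). The snippet below is a sketch only; it uses a subset of the patterns and a
hypothetical TF variable name, not an actual checkpoint key.

    # Sketch of the ordered substring renaming performed by the conversion script.
    # Only a subset of the full PATTERNS list from convert_pegasus_tf_to_pytorch.py is shown.
    PATTERNS_SUBSET = [
        ["/", "."],
        ["r.layer_", "r.layers."],
        ["ffn.dense.", "fc1."],
        ["kernel", "weight"],
    ]

    def rename_state_dict_key(k: str) -> str:
        # replacements are applied in order; later patterns see the result of earlier ones
        for tf_name, hf_name in PATTERNS_SUBSET:
            k = k.replace(tf_name, hf_name)
        return k

    # hypothetical TF variable name, for illustration only
    assert rename_state_dict_key("encoder/layer_0/ffn/dense/kernel") == "encoder.layers.0.fc1.weight"

The real script additionally transposes "dense"/"proj" weights and ties the shared embedding into the encoder
and decoder embedding slots before loading the state dict.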
.../pegasus.rst | 32 +- .../to_replace_pegasus.py | 76 +- .../tokenization_fast_pegasus.py | 39 +- .../tokenization_pegasus.py | 37 +- .../models/old_pegasus/__init__.py | 33 + .../old_pegasus/configuration_pegasus.py | 145 ++ .../convert_pegasus_tf_to_pytorch.py | 132 ++ .../models/old_pegasus/modeling_pegasus.py | 83 + .../models/old_pegasus/modeling_tf_pegasus.py | 41 + .../old_pegasus/tokenization_pegasus.py | 294 ++++ .../old_pegasus/tokenization_pegasus_fast.py | 232 +++ src/transformers/models/pegasus/__init__.py | 22 +- .../models/pegasus/configuration_pegasus.py | 191 ++- .../models/pegasus/modeling_pegasus.py | 1438 ++++++++++++++++- tests/test_modeling_old_pegasus.py | 126 ++ tests/test_modeling_pegasus.py | 426 ++++- 16 files changed, 3026 insertions(+), 321 deletions(-) rename cookiecutter-template-MBART/mbart.rst => cookiecutter-template-PEGASUS/pegasus.rst (79%) rename cookiecutter-template-MBART/to_replace_mbart.py => cookiecutter-template-PEGASUS/to_replace_pegasus.py (71%) rename cookiecutter-template-MBART/tokenization_fast_mbart.py => cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py (50%) rename cookiecutter-template-MBART/tokenization_mbart.py => cookiecutter-template-PEGASUS/tokenization_pegasus.py (53%) create mode 100644 src/transformers/models/old_pegasus/__init__.py create mode 100644 src/transformers/models/old_pegasus/configuration_pegasus.py create mode 100644 src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py create mode 100644 src/transformers/models/old_pegasus/modeling_pegasus.py create mode 100644 src/transformers/models/old_pegasus/modeling_tf_pegasus.py create mode 100644 src/transformers/models/old_pegasus/tokenization_pegasus.py create mode 100644 src/transformers/models/old_pegasus/tokenization_pegasus_fast.py mode change 100644 => 100755 src/transformers/models/pegasus/modeling_pegasus.py create mode 100644 tests/test_modeling_old_pegasus.py diff --git a/cookiecutter-template-MBART/mbart.rst b/cookiecutter-template-PEGASUS/pegasus.rst similarity index 79% rename from cookiecutter-template-MBART/mbart.rst rename to cookiecutter-template-PEGASUS/pegasus.rst index 61fb1da06ac613..434ef1a5df75e8 100644 --- a/cookiecutter-template-MBART/mbart.rst +++ b/cookiecutter-template-PEGASUS/pegasus.rst @@ -10,13 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -MBART +PEGASUS ----------------------------------------------------------------------------------------------------------------------- Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The MBART model was proposed in ` +The PEGASUS model was proposed in ` <>`__ by . The abstract from the paper is the following: @@ -27,54 +27,54 @@ Tips: -MBartConfig +PegasusConfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartConfig +.. autoclass:: transformers.PegasusConfig :members: -MBartTokenizer +PegasusTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartTokenizer +.. 
autoclass:: transformers.PegasusTokenizer :members: build_inputs_with_special_tokens, get_special_tokens_mask, create_token_type_ids_from_sequences, save_vocabulary -MBartTokenizerFast +PegasusTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartTokenizerFast +.. autoclass:: transformers.PegasusTokenizerFast :members: build_inputs_with_special_tokens, get_special_tokens_mask, create_token_type_ids_from_sequences, save_vocabulary -MBartModel +PegasusModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartModel +.. autoclass:: transformers.PegasusModel :members: forward -MBartForConditionalGeneration +PegasusForConditionalGeneration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartForConditionalGeneration +.. autoclass:: transformers.PegasusForConditionalGeneration :members: forward -MBartForSequenceClassification +PegasusForSequenceClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartForSequenceClassification +.. autoclass:: transformers.PegasusForSequenceClassification :members: forward -MBartForQuestionAnswering +PegasusForQuestionAnswering ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MBartForQuestionAnswering +.. autoclass:: transformers.PegasusForQuestionAnswering :members: forward diff --git a/cookiecutter-template-MBART/to_replace_mbart.py b/cookiecutter-template-PEGASUS/to_replace_pegasus.py similarity index 71% rename from cookiecutter-template-MBART/to_replace_mbart.py rename to cookiecutter-template-PEGASUS/to_replace_pegasus.py index e05f0c039a7ee1..0323f62749f407 100644 --- a/cookiecutter-template-MBART/to_replace_mbart.py +++ b/cookiecutter-template-PEGASUS/to_replace_pegasus.py @@ -29,33 +29,33 @@ # Below: "if is_torch_available():" if generating PyTorch # Replace with: - from .models.mbart import ( - MBART_PRETRAINED_MODEL_ARCHIVE_LIST, - MBartForConditionalGeneration, - MBartForQuestionAnswering, - MBartForSequenceClassification, - MBartModel, + from .models.pegasus import ( + PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST, + PegasusForConditionalGeneration, + PegasusForQuestionAnswering, + PegasusForSequenceClassification, + PegasusModel, ) # End. # Below: "if is_tf_available():" if generating TensorFlow # Replace with: - from .models.mbart import ( - TFMBartForConditionalGeneration, - TFMBartModel, - TFMBartPreTrainedModel, + from .models.pegasus import ( + TFPegasusForConditionalGeneration, + TFPegasusModel, + TFPegasusPreTrainedModel, ) # End. # Below: "if is_tokenizers_available():" # Replace with: - from .models.mbart import MBartTokenizerFast + from .models.pegasus import PegasusTokenizerFast # End. # Below: "from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig" # Replace with: -from .models.mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig, MBartTokenizer +from .models.pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer # End. 
@@ -63,22 +63,22 @@ # To replace in: "src/transformers/models/auto/configuration_auto.py" # Below: "# Add configs here" # Replace with: - ("mbart", MBartConfig), + ("pegasus", PegasusConfig), # End. # Below: "# Add archive maps here" # Replace with: - MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, + PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, # End. # Below: "from ..albert.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig", # Replace with: -from ..mbart.configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig +from ..pegasus.configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig # End. # Below: "# Add full (and cased) model names here" # Replace with: - ("mbart", "MBart"), + ("pegasus", "Pegasus"), # End. @@ -86,28 +86,28 @@ # To replace in: "src/transformers/models/auto/modeling_auto.py" if generating PyTorch # Below: "from .configuration_auto import (" # Replace with: - MBartConfig, + PegasusConfig, # End. # Below: "# Add modeling imports here" # Replace with: -from ..mbart.modeling_mbart import ( - MBartForConditionalGeneration, - MBartForQuestionAnswering, - MBartForSequenceClassification, - MBartModel, +from ..pegasus.modeling_pegasus import ( + PegasusForConditionalGeneration, + PegasusForQuestionAnswering, + PegasusForSequenceClassification, + PegasusModel, ) # End. # Below: "# Base model mapping" # Replace with: - (MBartConfig, MBartModel), + (PegasusConfig, PegasusModel), # End. # Below: "# Model with LM heads mapping" # Replace with: - (MBartConfig, MBartForConditionalGeneration), + (PegasusConfig, PegasusForConditionalGeneration), # End. # Below: "# Model for Causal LM mapping" @@ -120,12 +120,12 @@ # Below: "# Model for Sequence Classification mapping" # Replace with: - (MBartConfig, MBartForSequenceClassification), + (PegasusConfig, PegasusForSequenceClassification), # End. # Below: "# Model for Question Answering mapping" # Replace with: - (MBartConfig, MBartForQuestionAnswering), + (PegasusConfig, PegasusForQuestionAnswering), # End. # Below: "# Model for Token Classification mapping" @@ -139,32 +139,32 @@ # Below: "# Model for Seq2Seq Causal LM mapping" # Replace with: - (MBartConfig, MBartForConditionalGeneration), + (PegasusConfig, PegasusForConditionalGeneration), # End. # To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow # Below: "from .configuration_auto import (" # Replace with: - MBartConfig, + PegasusConfig, # End. # Below: "# Add modeling imports here" # Replace with: -from ..mbart.modeling_tf_mbart import ( - TFMBartForConditionalGeneration, - TFMBartModel, +from ..pegasus.modeling_tf_pegasus import ( + TFPegasusForConditionalGeneration, + TFPegasusModel, ) # End. # Below: "# Base model mapping" # Replace with: - (MBartConfig, TFMBartModel), + (PegasusConfig, TFPegasusModel), # End. # Below: "# Model with LM heads mapping" # Replace with: - (MBartConfig, TFMBartForConditionalGeneration), + (PegasusConfig, TFPegasusForConditionalGeneration), # End. # Below: "# Model for Causal LM mapping" @@ -194,19 +194,19 @@ # Below: "# Model for Seq2Seq Causal LM mapping" # Replace with: - (MBartConfig, TFMBartForConditionalGeneration), + (PegasusConfig, TFPegasusForConditionalGeneration), # End. # To replace in: "utils/check_repo.py" if generating PyTorch # Below: "models to ignore for model xxx mapping" # Replace with: -"MBartEncoder", - "MBartDecoder", +"PegasusEncoder", + "PegasusDecoder", # End. 
# Below: "models to ignore for not tested" # Replace with: -"MBartEncoder", # Building part of bigger (tested) model. - "MBartDecoder", # Building part of bigger (tested) model. +"PegasusEncoder", # Building part of bigger (tested) model. + "PegasusDecoder", # Building part of bigger (tested) model. # End. diff --git a/cookiecutter-template-MBART/tokenization_fast_mbart.py b/cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py similarity index 50% rename from cookiecutter-template-MBART/tokenization_fast_mbart.py rename to cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py index c7b1a0f4a7cd7c..9efe75cbf22685 100644 --- a/cookiecutter-template-MBART/tokenization_fast_mbart.py +++ b/cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. +# Copyright Google and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,45 +12,42 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for MBART.""" +"""Tokenization classes for PEGASUS.""" from ...utils import logging -from ..bert.tokenization_bert_fast import BertTokenizerFast -from .tokenization_mbart import MBartTokenizer +from ..bart.tokenization_bart_fast import BartTokenizerFast +from .tokenization_pegasus import PegasusTokenizer logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} - PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/vocab.txt", - } + "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/vocab.json", + }, + "merges_file": { + "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/merges.txt", + }, + "tokenizer_file": { + "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/tokenizer.json", + }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/mbart-large-cc25": 512, -} - - -PRETRAINED_INIT_CONFIGURATION = { - "facebook/mbart-large-cc25": {"do_lower_case": False}, + "google/pegasus-large": 1024, } -class MBartTokenizerFast(BertTokenizerFast): +class PegasusTokenizerFast(BartTokenizerFast): r""" - Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). + Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). - :class:`~transformers.MBartTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + :class:`~transformers.PegasusTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs end-to-end tokenization: punctuation splitting and wordpiece. - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning parameters. 
""" - vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = MBartTokenizer + slow_tokenizer_class = PegasusTokenizer diff --git a/cookiecutter-template-MBART/tokenization_mbart.py b/cookiecutter-template-PEGASUS/tokenization_pegasus.py similarity index 53% rename from cookiecutter-template-MBART/tokenization_mbart.py rename to cookiecutter-template-PEGASUS/tokenization_pegasus.py index c3035cf5cea94e..c19a0a856020c8 100644 --- a/cookiecutter-template-MBART/tokenization_mbart.py +++ b/cookiecutter-template-PEGASUS/tokenization_pegasus.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. +# Copyright Google and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,43 +12,40 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for MBART.""" +"""Tokenization classes for PEGASUS.""" from ...utils import logging -from ..bert.tokenization_bert import BertTokenizer +from ..bart.tokenization_bart import BartTokenizer logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} - PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/vocab.txt", - } + "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/vocab.json", + }, + "merges_file": { + "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/merges.txt", + }, + "tokenizer_file": { + "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/tokenizer.json", + }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/mbart-large-cc25": 512, + "google/pegasus-large": 1024, } -PRETRAINED_INIT_CONFIGURATION = { - "facebook/mbart-large-cc25": {"do_lower_case": False}, -} - - -class MBartTokenizer(BertTokenizer): - r""" - Construct a MBART tokenizer. +class PegasusTokenizer(BartTokenizer): + """ + Construct a PEGASUS tokenizer. - :class:`~transformers.MBartTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end + :class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end tokenization: punctuation splitting and wordpiece. - Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning + Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning parameters. """ - vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION diff --git a/src/transformers/models/old_pegasus/__init__.py b/src/transformers/models/old_pegasus/__init__.py new file mode 100644 index 00000000000000..20d1c3872dc1c3 --- /dev/null +++ b/src/transformers/models/old_pegasus/__init__.py @@ -0,0 +1,33 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' 
imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available +from .configuration_pegasus import PegasusConfig + + +if is_sentencepiece_available(): + from .tokenization_pegasus import PegasusTokenizer + +if is_tokenizers_available(): + from .tokenization_pegasus_fast import PegasusTokenizerFast + +if is_torch_available(): + from .modeling_pegasus import PegasusForConditionalGeneration, PegasusModel + +if is_tf_available(): + from .modeling_tf_pegasus import TFPegasusForConditionalGeneration diff --git a/src/transformers/models/old_pegasus/configuration_pegasus.py b/src/transformers/models/old_pegasus/configuration_pegasus.py new file mode 100644 index 00000000000000..585f06ddb46e6c --- /dev/null +++ b/src/transformers/models/old_pegasus/configuration_pegasus.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PEGASUS model configuration """ + +from ...utils import logging +from ..bart.configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +# These config values do not vary between checkpoints +DEFAULTS = dict( + vocab_size=96103, + max_position_embeddings=512, + d_model=1024, + encoder_ffn_dim=4096, + decoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_attention_heads=16, + encoder_layers=16, + decoder_layers=16, + dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.1, + pad_token_id=0, + eos_token_id=1, + is_encoder_decoder=True, + normalize_before=True, + scale_embedding=True, + normalize_embedding=False, + add_final_layer_norm=True, + static_position_embeddings=True, + num_beams=8, + activation_function="relu", +) +# Config values that vary between checkpoints: for testing and conversion +task_specific_params = { + # These are task specific params for pegasus-large and normal params for finetuned checkpoints + "summarization_xsum": {"length_penalty": 0.6, "max_length": 64, "max_position_embeddings": 512}, + "summarization_cnn_dailymail": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 1024}, + "summarization_newsroom": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 512}, + "summarization_wikihow": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 512}, + "summarization_multi_news": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, + "summarization_reddit_tifu": {"length_penalty": 0.6, "max_length": 128, "max_position_embeddings": 512}, + "summarization_big_patent": {"length_penalty": 0.7, "max_length": 256, "max_position_embeddings": 1024}, + "summarization_arxiv": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, + "summarization_pubmed": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, + "summarization_gigaword": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 128}, + "summarization_aeslc": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 512}, + "summarization_billsum": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 1024}, + # this last entry is useless -- just for consistency + "summarization_large": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, +} + + +class PegasusConfig(BartConfig): + """ + This is the configuration class to store the configuration of a + :class:`~transformers.PegasusForConditionalGeneration`. It is used to instantiate a Pegasus model according to the + specified arguments, defining the model architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 96103): + Vocabulary size of the Pegasus model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusForConditionalGeneration`. + d_model (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 16): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 16): + Number of decoder layers. 
+ encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): + This should be completed, specific to marian. + normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`): + Call layernorm before attention ops. + normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Call layernorm after embeddings. + static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): + Don't learn positional embeddings, use sinusoidal. + add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`): + Why not add another layernorm? + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): + Scale embeddings by diving by sqrt(d_model). + eos_token_id (:obj:`int`, `optional`, defaults to 2) + End of stream token id. + pad_token_id (:obj:`int`, `optional`, defaults to 1) + Padding token id. + bos_token_id (:obj:`int`, `optional`, defaults to 0) + Beginning of stream token id. + encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): + How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart. + is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether this is an encoder/decoder model + force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). 
+ """ + + model_type = "pegasus" + keys_to_ignore_at_inference = ["past_key_values"] + # The implementation of the config object is in BartConfig diff --git a/src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py b/src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py new file mode 100644 index 00000000000000..9254a0ba941100 --- /dev/null +++ b/src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from pathlib import Path +from typing import Dict + +import tensorflow as tf +import torch +from tqdm import tqdm + +from transformers import PegasusConfig, PegasusForConditionalGeneration, PegasusTokenizer +from transformers.models.pegasus.configuration_pegasus import DEFAULTS, task_specific_params + + +PATTERNS = [ + # replace left string with right string to get the relevant state_dict key (identical state dict to bart) + ["memory_attention", "encoder_attn"], + ["attention", "attn"], + ["/", "."], + [".LayerNorm.gamma", "_layer_norm.weight"], + [".LayerNorm.beta", "_layer_norm.bias"], + ["r.layer_", "r.layers."], + ["output_proj", "out_proj"], + ["ffn.dense_1.", "fc2."], + ["ffn.dense.", "fc1."], + ["ffn_layer_norm", "final_layer_norm"], + ["kernel", "weight"], + ["encoder_layer_norm.", "encoder.layer_norm."], + ["decoder_layer_norm.", "decoder.layer_norm."], + ["embeddings.weights", "shared.weight"], +] + + +def rename_state_dict_key(k): + + for pegasus_name, hf_name in PATTERNS: + k = k.replace(pegasus_name, hf_name) + return k + + +# See appendix C of paper for all hyperparams + + +def convert_pegasus(tf_weights: dict, cfg_updates: dict) -> PegasusForConditionalGeneration: + cfg_kwargs = DEFAULTS.copy() + cfg_kwargs.update(cfg_updates) + cfg = PegasusConfig(**cfg_kwargs) + torch_model = PegasusForConditionalGeneration(cfg) + sd = torch_model.model.state_dict() + mapping = {} + for k, v in tf_weights.items(): + new_k = rename_state_dict_key(k) + if new_k not in sd: + raise ValueError(f"could not find new key {new_k} in state dict. 
(converted from {k})") + + if "dense" in k or "proj" in new_k: + v = v.T + mapping[new_k] = torch.tensor(v, dtype=sd[new_k].dtype) + assert v.shape == sd[new_k].shape, f"{new_k}, {k}, {v.shape}, {sd[new_k].shape}" + # make sure embedding.padding_idx is respected + mapping["shared.weight"][cfg.pad_token_id] = torch.zeros_like(mapping["shared.weight"][cfg.pad_token_id + 1]) + mapping["encoder.embed_tokens.weight"] = mapping["shared.weight"] + mapping["decoder.embed_tokens.weight"] = mapping["shared.weight"] + empty_biases = {k: torch.zeros_like(v) for k, v in sd.items() if k.endswith("bias") and k not in mapping} + mapping.update(**empty_biases) + missing, extra = torch_model.model.load_state_dict(mapping, strict=False) + unexpected_missing = [ + k for k in missing if k not in ["encoder.embed_positions.weight", "decoder.embed_positions.weight"] + ] + assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}" + assert extra == [], f"no matches found for the following tf keys {extra}" + return torch_model + + +def get_tf_weights_as_numpy(path="./ckpt/aeslc/model.ckpt-32000") -> Dict: + init_vars = tf.train.list_variables(path) + tf_weights = {} + ignore_name = ["Adafactor", "global_step"] + for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"): + skip_key = any([pat in name for pat in ignore_name]) + if skip_key: + continue + array = tf.train.load_variable(path, name) + tf_weights[name] = array + return tf_weights + + +def convert_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str): + # save tokenizer first + dataset = Path(ckpt_path).parent.name + desired_max_model_length = task_specific_params[f"summarization_{dataset}"]["max_position_embeddings"] + tok = PegasusTokenizer.from_pretrained("sshleifer/pegasus", model_max_length=desired_max_model_length) + assert tok.model_max_length == desired_max_model_length + tok.save_pretrained(save_dir) + + # convert model + tf_weights = get_tf_weights_as_numpy(ckpt_path) + cfg_updates = task_specific_params[f"summarization_{dataset}"] + if dataset == "large": + cfg_updates["task_specific_params"] = task_specific_params + torch_model = convert_pegasus(tf_weights, cfg_updates) + torch_model.save_pretrained(save_dir) + sd = torch_model.state_dict() + sd.pop("model.decoder.embed_positions.weight") + sd.pop("model.encoder.embed_positions.weight") + torch.save(sd, Path(save_dir) / "pytorch_model.bin") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument("tf_ckpt_path", type=str, help="passed to tf.train.list_variables") + parser.add_argument("save_dir", default=None, type=str, help="Path to the output PyTorch model.") + args = parser.parse_args() + if args.save_dir is None: + dataset = Path(args.tf_ckpt_path).parent.name + args.save_dir = os.path.join("pegasus", dataset) + convert_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir) diff --git a/src/transformers/models/old_pegasus/modeling_pegasus.py b/src/transformers/models/old_pegasus/modeling_pegasus.py new file mode 100644 index 00000000000000..c7fde4164330db --- /dev/null +++ b/src/transformers/models/old_pegasus/modeling_pegasus.py @@ -0,0 +1,83 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Pegasus model, ported from https://github.com/google-research/pegasus""" + + +from ...file_utils import add_start_docstrings +from ..bart.modeling_bart import BART_START_DOCSTRING, BartForConditionalGeneration, BartModel +from .configuration_pegasus import PegasusConfig + + +@add_start_docstrings( + "The bare Pegasus Model transformer outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class PegasusModel(BartModel): + r""" + This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = PegasusConfig + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + "encoder.embed_positions", + "decoder.embed_positions", + ] + _keys_to_ignore_on_save = [ + "encoder.embed_positions.weight", + "decoder.embed_positions.weight", + ] + + +@add_start_docstrings("The Pegasus Model for summarization ", BART_START_DOCSTRING) +class PegasusForConditionalGeneration(BartForConditionalGeneration): + r""" + Pytorch version of google's pegasus model for summarization. Available models are listed `here + `__. + + This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the + appropriate documentation alongside usage examples. + + Examples:: + + >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration + >>> from typing import List + >>> PGE_ARTICLE = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + >>> mname = "google/pegasus-xsum" + + >>> model = PegasusForConditionalGeneration.from_pretrained(mname) + >>> tok = PegasusTokenizer.from_pretrained(mname) + >>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE], return_tensors="pt") # don't need tgt_text for inference + >>> gen = model.generate(**batch) # for forward pass: model(**batch) + >>> summary: List[str] = tok.batch_decode(gen, skip_special_tokens=True) + >>> assert summary == "California's largest electricity provider has turned off power to tens of thousands of customers." 
+ + """ + # All the code is in src/transformers/models/bart/modeling_bart.py + config_class = PegasusConfig + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + "model.encoder.embed_positions", + "model.decoder.embed_positions", + ] + _keys_to_ignore_on_save = [ + "model.encoder.embed_positions.weight", + "model.decoder.embed_positions.weight", + ] diff --git a/src/transformers/models/old_pegasus/modeling_tf_pegasus.py b/src/transformers/models/old_pegasus/modeling_tf_pegasus.py new file mode 100644 index 00000000000000..bec856575d191a --- /dev/null +++ b/src/transformers/models/old_pegasus/modeling_tf_pegasus.py @@ -0,0 +1,41 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF Pegasus model, ported from the fairseq repo.""" +from ...file_utils import add_start_docstrings +from ...utils import logging +from ..bart.modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration +from .configuration_pegasus import PegasusConfig + + +_CONFIG_FOR_DOC = "PegasusConfig" + +START_DOCSTRING = BART_START_DOCSTRING.replace( + "inherits from :class:`~transformers.TFPreTrainedModel`", + "inherits from :class:`~transformers.TFBartForConditionalGeneration`", +).replace("BartConfig", _CONFIG_FOR_DOC) + + +logger = logging.get_logger(__name__) + + +@add_start_docstrings("Pegasus model for summarization", START_DOCSTRING) +class TFPegasusForConditionalGeneration(TFBartForConditionalGeneration): + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"model.encoder.embed_positions.weight", + r"model.decoder.embed_positions.weight", + ] + config_class = PegasusConfig + # All the code is in src/transformers/models/bart/modeling_tf_bart.py diff --git a/src/transformers/models/old_pegasus/tokenization_pegasus.py b/src/transformers/models/old_pegasus/tokenization_pegasus.py new file mode 100644 index 00000000000000..099bdf3e7b3136 --- /dev/null +++ b/src/transformers/models/old_pegasus/tokenization_pegasus.py @@ -0,0 +1,294 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from shutil import copyfile +from typing import Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...file_utils import add_start_docstrings +from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding +from ...utils import logging + + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"} +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "google/pegasus-xsum": 512, +} + + +logger = logging.get_logger(__name__) + + +class PegasusTokenizer(PreTrainedTokenizer): + r""" + Construct a PEGASUS tokenizer. Based on `SentencePiece `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking single token values. This is the token used when training this model with masked + language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. + It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive + Summarization `__. + mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking whole target sentences. This is the token used when training this model with gap + sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during + pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for + Abstractive Summarization `__. + additional_special_tokens (:obj:`List[str]`, `optional`): + Additional special tokens used by the tokenizer. 
If no additional_special_tokens are provided and + are used as additional special tokens corresponding to the `original PEGASUS + tokenizer + `__ + that uses the tokens 2 - 104 only for pretraining + """ + vocab_files_names = VOCAB_FILES_NAMES + + offset = 103 # entries 2 - 104 are only used for pretraining + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + + def __init__( + self, + vocab_file, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + **kwargs + ): + if additional_special_tokens is not None: + assert isinstance( + additional_special_tokens, list + ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}" + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens + else additional_special_tokens + ) + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): + raise ValueError( + f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [mask_token_sent] + additional_special_tokens += [f"" for i in range(2, self.offset)] + + super().__init__( + eos_token=eos_token, + unk_token=unk_token, + mask_token=mask_token, + pad_token=pad_token, + mask_token_sent=mask_token_sent, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(vocab_file) + self.mask_token_sent = mask_token_sent + + # add special tokens to encoder dict + self.encoder: Dict[int, str] = { + 0: self.pad_token, + 1: self.eos_token, + 2: self.mask_token_sent, + 3: self.mask_token, + } + # entries 2-104 are only used for pretraining and called , , unk_2, ...unk_102 + # mask_token_sent is already added to list -> so start at 1 + self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)}) + self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()} + + @property + def vocab_size(self) -> int: + return len(self.sp_model) + self.offset + + def get_vocab(self) -> Dict[str, int]: + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text, sample=False): + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + if not sample: + pieces = self.sp_model.EncodeAsPieces(text) + else: + pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + return pieces + + def _convert_token_to_id(self, token: str) -> int: + """ Converts a token (str) to an id using the vocab. 
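        Reserved tokens (pad, eos, the two mask tokens and the unused pretraining tokens) keep
        their fixed ids 0-104; any other piece is looked up in the underlying SentencePiece model
        and shifted by :obj:`self.offset` (103). Illustrative example, assuming the
        ``google/pegasus-xsum`` checkpoint can be downloaded::

            >>> tok = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
            >>> tok._convert_token_to_id(tok.pad_token)
            0
            >>> sp_id = tok.sp_model.piece_to_id("▁the")
            >>> tok._convert_token_to_id("▁the") == sp_id + tok.offset
            True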
""" + if token in self.decoder: + return self.decoder[token] + elif token in self.added_tokens_decoder: + return self.added_tokens_decoder[token] + sp_id = self.sp_model.piece_to_id(token) + return sp_id + self.offset + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) to a token (str) using the vocab.""" + if index in self.encoder: + return self.encoder[index] + elif index in self.added_tokens_encoder: + return self.added_tokens_encoder[index] + else: + token = self.sp_model.IdToPiece(index - self.offset) + return token + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = self.sp_model.decode_pieces(tokens) + return out_string + + def num_special_tokens_to_add(self, pair=False): + """Just EOS""" + return 1 + + def _special_token_mask(self, seq): + all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp + all_special_ids.remove(self.unk_token_id) # is only sometimes special + + assert all_special_ids == set( + range(len(self.additional_special_tokens) + 3) + ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" + + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [1] + else: + return self._special_token_mask(token_ids_0 + token_ids_1) + [1] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating + and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence: + + - single sequence: ``X `` + - pair of sequences: ``A B `` (not intended use) + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + # We don't expect to process pairs, but leave the pair logic for API consistency + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + return_tensors: str = None, + truncation=True, + padding="longest", + **unused, + ) -> BatchEncoding: + if "" in src_texts: + raise ValueError(f"found empty string in src_texts: {src_texts}") + tokenizer_kwargs = dict( + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + truncation=truncation, + padding=padding, + ) + model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) + if tgt_texts is None: + return model_inputs + if max_target_length is not None: + tokenizer_kwargs["max_length"] = max_target_length + labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] + model_inputs["labels"] = labels + return model_inputs + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/models/old_pegasus/tokenization_pegasus_fast.py b/src/transformers/models/old_pegasus/tokenization_pegasus_fast.py new file mode 100644 index 00000000000000..c9b0d076314057 --- /dev/null +++ b/src/transformers/models/old_pegasus/tokenization_pegasus_fast.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
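The ``prepare_seq2seq_batch`` helper defined above tokenizes source texts and, optionally, target texts in a single call. A short usage sketch, assuming the ``google/pegasus-xsum`` checkpoint is available locally or can be downloaded:

    from transformers import PegasusTokenizer

    tok = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
    batch = tok.prepare_seq2seq_batch(
        src_texts=["PG&E scheduled the blackouts in response to forecasts for high winds."],
        tgt_texts=["California's largest electricity provider has cut power."],
        return_tensors="pt",
    )
    # batch["input_ids"] / batch["attention_mask"] encode the source text, while
    # batch["labels"] holds the tokenized target and can be passed straight to the model.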
+""" Tokenization class for model PEGASUS.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from ...file_utils import add_start_docstrings, is_sentencepiece_available +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging + + +if is_sentencepiece_available(): + from .tokenization_pegasus import PegasusTokenizer +else: + PegasusTokenizer = None + + +logger = logging.get_logger(__name__) + + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}, + "tokenizer_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/tokenizer.json"}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "google/pegasus-xsum": 512, +} + + +class PegasusTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram + `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end of + sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking single token values. This is the token used when training this model with masked + language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. + It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive + Summarization `__. + mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking whole target sentences. This is the token used when training this model with gap + sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during + pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for + Abstractive Summarization `__. + additional_special_tokens (:obj:`List[str]`, `optional`): + Additional special tokens used by the tokenizer. 
If no additional_special_tokens are provided and + are used as additional special tokens corresponding to the `original PEGASUS + tokenizer + `__ + that uses the tokens 2 - 104 only for pretraining + """ + offset = 103 # entries 2-104 are only used for pretraining + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = PegasusTokenizer + model_input_names = ["attention_mask"] + + def __init__( + self, + vocab_file, + tokenizer_file=None, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + **kwargs + ): + if additional_special_tokens is not None: + assert isinstance( + additional_special_tokens, list + ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}" + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens + else additional_special_tokens + ) + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): + raise ValueError( + f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [mask_token_sent] + additional_special_tokens += [f"" for i in range(2, self.offset)] + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + pad_token=pad_token, + eos_token=eos_token, + unk_token=unk_token, + mask_token=mask_token, + mask_token_sent=mask_token_sent, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.vocab_file = vocab_file + + def _special_token_mask(self, seq): + all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp + all_special_ids.remove(self.unk_token_id) # is only sometimes special + + assert all_special_ids == set( + range(len(self.additional_special_tokens) + 3) + ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" + + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [1] + else: + return self._special_token_mask(token_ids_0 + token_ids_1) + [1] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: + """ + Build model inputs from a sequence by adding eos to the end. no bos token is added to the front. + + - single sequence: ``X `` + - pair of sequences: ``A B `` (not intended use) + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. 
+ + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + # We don't expect to process pairs, but leave the pair logic for API consistency + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + return_tensors: str = None, + truncation=True, + padding="longest", + **unused, + ) -> BatchEncoding: + if "" in src_texts: + raise ValueError(f"found empty string in src_texts: {src_texts}") + tokenizer_kwargs = dict( + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + truncation=truncation, + padding=padding, + ) + model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) + if tgt_texts is None: + return model_inputs + if max_target_length is not None: + tokenizer_kwargs["max_length"] = max_target_length + labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] + model_inputs["labels"] = labels + return model_inputs + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/models/pegasus/__init__.py b/src/transformers/models/pegasus/__init__.py index 20d1c3872dc1c3..ebb3152a34abf4 100644 --- a/src/transformers/models/pegasus/__init__.py +++ b/src/transformers/models/pegasus/__init__.py @@ -15,19 +15,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
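Like its slow counterpart, the fast tokenizer above flags only the reserved ids (eos, pad, the mask tokens and the unused pretraining tokens) as special. A small illustrative check, assuming the ``google/pegasus-xsum`` checkpoint:

    from transformers import PegasusTokenizerFast

    tok = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")
    ids = tok("My friends are cool but they eat too many carbs.")["input_ids"]
    mask = tok.get_special_tokens_mask(ids, already_has_special_tokens=True)
    # the trailing eos token is flagged as special; ordinary word-piece ids are not
    assert mask[-1] == 1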
- -from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available -from .configuration_pegasus import PegasusConfig - - -if is_sentencepiece_available(): - from .tokenization_pegasus import PegasusTokenizer +from ...file_utils import is_torch_available, is_tokenizers_available +from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig +from .tokenization_pegasus import PegasusTokenizer if is_tokenizers_available(): from .tokenization_pegasus_fast import PegasusTokenizerFast if is_torch_available(): - from .modeling_pegasus import PegasusForConditionalGeneration, PegasusModel + from .modeling_pegasus import ( + PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST, + PegasusForConditionalGeneration, + PegasusForQuestionAnswering, + PegasusForSequenceClassification, + PegasusModel, + PegasusPreTrainedModel, + ) + -if is_tf_available(): - from .modeling_tf_pegasus import TFPegasusForConditionalGeneration diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 585f06ddb46e6c..aab4d1987ddd58 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 Google and The HuggingFace Inc. team. +# Copyright Google and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,84 +14,49 @@ # limitations under the License. """ PEGASUS model configuration """ +from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..bart.configuration_bart import BartConfig logger = logging.get_logger(__name__) -# These config values do not vary between checkpoints -DEFAULTS = dict( - vocab_size=96103, - max_position_embeddings=512, - d_model=1024, - encoder_ffn_dim=4096, - decoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_attention_heads=16, - encoder_layers=16, - decoder_layers=16, - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.1, - pad_token_id=0, - eos_token_id=1, - is_encoder_decoder=True, - normalize_before=True, - scale_embedding=True, - normalize_embedding=False, - add_final_layer_norm=True, - static_position_embeddings=True, - num_beams=8, - activation_function="relu", -) -# Config values that vary between checkpoints: for testing and conversion -task_specific_params = { - # These are task specific params for pegasus-large and normal params for finetuned checkpoints - "summarization_xsum": {"length_penalty": 0.6, "max_length": 64, "max_position_embeddings": 512}, - "summarization_cnn_dailymail": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 1024}, - "summarization_newsroom": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 512}, - "summarization_wikihow": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 512}, - "summarization_multi_news": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, - "summarization_reddit_tifu": {"length_penalty": 0.6, "max_length": 128, "max_position_embeddings": 512}, - "summarization_big_patent": {"length_penalty": 0.7, "max_length": 256, "max_position_embeddings": 1024}, - "summarization_arxiv": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, - "summarization_pubmed": {"length_penalty": 0.8, "max_length": 256, 
"max_position_embeddings": 1024}, - "summarization_gigaword": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 128}, - "summarization_aeslc": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 512}, - "summarization_billsum": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 1024}, - # this last entry is useless -- just for consistency - "summarization_large": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, +PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/config.json", + # See all PEGASUS models at https://huggingface.co/models?filter=pegasus } -class PegasusConfig(BartConfig): - """ - This is the configuration class to store the configuration of a - :class:`~transformers.PegasusForConditionalGeneration`. It is used to instantiate a Pegasus model according to the - specified arguments, defining the model architecture. +class PegasusConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.PegasusModel`. + It is used to instantiate an PEGASUS model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the PEGASUS `google/pegasus-large `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: - vocab_size (:obj:`int`, `optional`, defaults to 96103): - Vocabulary size of the Pegasus model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusForConditionalGeneration`. + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusModel` or + :class:`~transformers.TFPegasusModel`. d_model (:obj:`int`, `optional`, defaults to 1024): Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 16): + encoder_layers (:obj:`int`, `optional`, defaults to 12): Number of encoder layers. - decoder_layers (:obj:`int`, `optional`, defaults to 16): + decoder_layers (:obj:`int`, `optional`, defaults to 12): Number of decoder layers. encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer decoder. decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. 
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. @@ -108,38 +73,94 @@ class PegasusConfig(BartConfig): just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm after embeddings. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`): - Why not add another layernorm? - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart. - is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model - force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). - """ + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ Example:: + + >>> from transformers import PegasusModel, PegasusConfig + + >>> # Initializing a PEGASUS google/pegasus-large style configuration + >>> configuration = PegasusConfig() + + >>> # Initializing a model from the google/pegasus-large style configuration + >>> model = PegasusModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + """ model_type = "pegasus" - keys_to_ignore_at_inference = ["past_key_values"] - # The implementation of the config object is in BartConfig + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + classifier_dropout=0.0, + scale_embedding=False, + gradient_checkpointing=False, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.gradient_checkpointing = gradient_checkpointing + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py old mode 100644 new mode 100755 index c7fde4164330db..7fa0ee66c3c0e2 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 Google and The HuggingFace Inc. team. +# Copyright Google and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,72 +12,1412 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch Pegasus model, ported from https://github.com/google-research/pegasus""" +""" PyTorch PEGASUS model. 
""" -from ...file_utils import add_start_docstrings -from ..bart.modeling_bart import BART_START_DOCSTRING, BartForConditionalGeneration, BartModel -from .configuration_pegasus import PegasusConfig +import random +from typing import Optional, Tuple +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss -@add_start_docstrings( - "The bare Pegasus Model transformer outputting raw hidden-states without any specific head on top.", - BART_START_DOCSTRING, +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, ) -class PegasusModel(BartModel): - r""" - This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate - documentation alongside usage examples. +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_pegasus import PegasusConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "PegasusConfig" +_TOKENIZER_FOR_DOC = "PegasusTokenizer" + + +PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/pegasus-large", + # See all PEGASUS models at https://huggingface.co/models?filter=pegasus +] + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id - config_class = PegasusConfig - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder\.version", - r"decoder\.version", - "encoder.embed_positions", - "decoder.embed_positions", - ] - _keys_to_ignore_on_save = [ - "encoder.embed_positions.weight", - "decoder.embed_positions.weight", - ] + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask( + mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None +): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + inverted_mask = 1.0 - expanded_mask -@add_start_docstrings("The Pegasus Model for summarization ", BART_START_DOCSTRING) -class PegasusForConditionalGeneration(BartForConditionalGeneration): - r""" - Pytorch version of google's pegasus model for summarization. Available models are listed `here - `__. + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. - Examples:: +def PegasusLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): + if torch.cuda.is_available(): + try: + from apex.normalization import FusedLayerNorm - >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration - >>> from typing import List - >>> PGE_ARTICLE = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." - >>> mname = "google/pegasus-xsum" + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - >>> model = PegasusForConditionalGeneration.from_pretrained(mname) - >>> tok = PegasusTokenizer.from_pretrained(mname) - >>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE], return_tensors="pt") # don't need tgt_text for inference - >>> gen = model.generate(**batch) # for forward pass: model(**batch) - >>> summary: List[str] = tok.batch_decode(gen, skip_special_tokens=True) - >>> assert summary == "California's largest electricity provider has turned off power to tens of thousands of customers." +class PegasusLearnedPositionalEmbedding(nn.Embedding): """ - # All the code is in src/transformers/models/bart/modeling_bart.py + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): + assert padding_idx is not None, "`padding_idx` should not be None, but of type int" + num_embeddings + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +class PegasusAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
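        # Illustrative shape bookkeeping: with the pegasus-large values embed_dim=1024 and
        # num_heads=16, head_dim is 64 and the q/k/v projections are reshaped in forward()
        # to (bsz * num_heads, seq_len, head_dim) before the batched matmuls.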
+ self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class PegasusEncoderLayer(nn.Module): + def __init__(self, config: PegasusConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = PegasusAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = PegasusLayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = PegasusLayerNorm(self.embed_dim) + + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
+ """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class PegasusDecoderLayer(nn.Module): + def __init__(self, config: PegasusConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = PegasusAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = PegasusLayerNorm(self.embed_dim) + self.encoder_attn = PegasusAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = PegasusLayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = PegasusLayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
+ """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class PegasusClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class PegasusPreTrainedModel(PreTrainedModel): config_class = PegasusConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = 
torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +PEGASUS_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.PegasusConfig`): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +PEGASUS_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration, PegasusConfig + + >>> model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large') + >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) +""" + +PEGASUS_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the :obj:`input_ids` to the right, following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_pegasus._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. 
+ encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class PegasusEncoder(PegasusPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`PegasusEncoderLayer`. 
+ + Args: + config: PegasusConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = PegasusLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) + self.layers = nn.ModuleList([PegasusEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = PegasusLayerNorm(embed_dim) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
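            Example (illustrative; :class:`PegasusEncoder` is an internal module and the tiny
            hyper-parameters below are made-up values, not a released checkpoint)::

                >>> import torch
                >>> from transformers import PegasusConfig
                >>> config = PegasusConfig(vocab_size=128, d_model=16, encoder_layers=2, decoder_layers=2,
                ...                        encoder_attention_heads=2, decoder_attention_heads=2,
                ...                        encoder_ffn_dim=32, decoder_ffn_dim=32, max_position_embeddings=64)
                >>> encoder = PegasusEncoder(config)
                >>> outputs = encoder(input_ids=torch.tensor([[5, 6, 7, 1]]))
                >>> outputs.last_hidden_state.shape
                torch.Size([1, 4, 16])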
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class PegasusDecoder(PegasusPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`PegasusDecoderLayer`
+
+    Args:
+        config: PegasusConfig
+        embed_tokens (torch.nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            self.embed_tokens = embed_tokens
+        else:
+            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+
+        self.embed_positions = PegasusLearnedPositionalEmbedding(
+            config.max_position_embeddings,
+            config.d_model,
+            self.padding_idx,
+        )
+        self.layers = nn.ModuleList([PegasusDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.layernorm_embedding = PegasusLayerNorm(config.d_model)
+
+        self.init_weights()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See
+                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
+                for details.
+
+                `What are input IDs? <../glossary.html#input-ids>`__
+            attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`):
+                Mask to avoid performing cross-attention on padding token indices of encoder input_ids. Mask values
+                selected in ``[0, 1]``:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                `What are attention masks? <../glossary.html#attention-mask>`__
+            past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
+                decoding.
+
+                If :obj:`past_key_values` are used, the user can optionally input only the last
+                :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of
+                shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size,
+                sequence_length)`.
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_decoder_cache = () if use_cache else None + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += 
(hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False): + if use_cache: + raise ValueError( + "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`." + ) + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare PEGASUS Model outputting raw hidden-states without any specific head on top.", + PEGASUS_START_DOCSTRING, +) +class PegasusModel(PegasusPreTrainedModel): + def __init__(self, config: PegasusConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = PegasusEncoder(config, self.shared) + self.decoder = PegasusDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/pegasus-large", + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + 
use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # decoder outputs consist of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    "The PEGASUS Model with a language modeling head. 
Can be used for summarization.", PEGASUS_START_DOCSTRING +) +class PegasusForConditionalGeneration(PegasusPreTrainedModel): + base_model_prefix = "model" _keys_to_ignore_on_load_missing = [ r"final_logits_bias", r"encoder\.version", r"decoder\.version", - "model.encoder.embed_positions", - "model.decoder.embed_positions", - ] - _keys_to_ignore_on_save = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", + r"lm_head\.weight", ] + + def __init__(self, config: PegasusConfig): + super().__init__(config) + self.model = PegasusModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(PEGASUS_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. + + Returns: + + Conditional generation example:: + + >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration + >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large') + >>> TXT = "My friends are but they eat too many carbs." 
+ + >>> model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + Pegasus model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
+ """, + PEGASUS_START_DOCSTRING, +) +class PegasusForSequenceClassification(PegasusPreTrainedModel): + def __init__(self, config: PegasusConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = PegasusModel(config) + self.classification_head = PegasusClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + + @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/pegasus-large", + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + PEGASUS Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + PEGASUS_START_DOCSTRING, +) +class PegasusForQuestionAnswering(PegasusPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = PegasusModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.model._init_weights(self.qa_outputs) + + @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="google/pegasus-large", + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + 
decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) diff --git a/tests/test_modeling_old_pegasus.py b/tests/test_modeling_old_pegasus.py new file mode 100644 index 00000000000000..dc9fdf522547cf --- /dev/null +++ b/tests/test_modeling_old_pegasus.py @@ -0,0 +1,126 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import AutoConfig, AutoTokenizer, is_torch_available +from transformers.file_utils import cached_property +from transformers.models.pegasus.configuration_pegasus import task_specific_params +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device +from transformers.utils.logging import ERROR, set_verbosity + +from .test_modeling_bart import PGE_ARTICLE +from .test_modeling_common import ModelTesterMixin +from .test_modeling_mbart import AbstractSeq2SeqIntegrationTest + + +if is_torch_available(): + from transformers import AutoModelForSeq2SeqLM, PegasusConfig, PegasusForConditionalGeneration, PegasusModel + +XSUM_ENTRY_LONGER = """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. 
We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """ + +set_verbosity(ERROR) + + +@require_torch +class ModelTester: + def __init__(self, parent): + self.config = PegasusConfig( + vocab_size=99, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + add_final_layer_norm=True, + ) + + def prepare_config_and_inputs_for_common(self): + return self.config, {} + + +@require_torch +class SelectiveCommonTest(unittest.TestCase): + all_model_classes = (PegasusForConditionalGeneration, PegasusModel) if is_torch_available() else () + + test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save + + def setUp(self): + self.model_tester = ModelTester(self) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class PegasusXSUMIntegrationTest(AbstractSeq2SeqIntegrationTest): + checkpoint_name = "google/pegasus-xsum" + src_text = [PGE_ARTICLE, XSUM_ENTRY_LONGER] + tgt_text = [ + "California's largest electricity provider has turned off power to hundreds of thousands of customers.", + "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.", + ] + + @cached_property + def model(self): + return AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) + + @slow + def test_pegasus_xsum_summary(self): + assert self.tokenizer.model_max_length == 512 + inputs = self.tokenizer(self.src_text, return_tensors="pt", truncation=True, max_length=512, padding=True).to( + torch_device + ) + assert inputs.input_ids.shape == (2, 421) + translated_tokens = self.model.generate(**inputs, num_beams=2) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + assert self.tgt_text == decoded + + if "cuda" not in torch_device: + return + # Demonstrate fp16 issue, Contributions welcome! 
+ self.model.half() + translated_tokens_fp16 = self.model.generate(**inputs, max_length=10) + decoded_fp16 = self.tokenizer.batch_decode(translated_tokens_fp16, skip_special_tokens=True) + assert decoded_fp16 == [ + "California's largest electricity provider has begun", + "N-Dubz have revealed they were", + ] + + +class PegasusConfigTests(unittest.TestCase): + @slow + def test_task_specific_params(self): + """Test that task_specific params['summarization_xsum'] == config['pegasus_xsum'] """ + failures = [] + pegasus_prefix = "google/pegasus" + n_prefix_chars = len("summarization_") + for task, desired_settings in task_specific_params.items(): + dataset = task[n_prefix_chars:] + mname = f"{pegasus_prefix}-{dataset}" + cfg = AutoConfig.from_pretrained(mname) + for k, v in desired_settings.items(): + actual_value = getattr(cfg, k) + if actual_value != v: + failures.append(f"config for {mname} had {k}: {actual_value}, expected {v}") + tokenizer = AutoTokenizer.from_pretrained(mname) + n_pos_embeds = desired_settings["max_position_embeddings"] + if n_pos_embeds != tokenizer.model_max_length: + failures.append(f"tokenizer.model_max_length {tokenizer.model_max_length} expected {n_pos_embeds}") + + # error + all_fails = "\n".join(failures) + assert not failures, f"The following configs have unexpected settings: {all_fails}" diff --git a/tests/test_modeling_pegasus.py b/tests/test_modeling_pegasus.py index dc9fdf522547cf..835fb148544f4f 100644 --- a/tests/test_modeling_pegasus.py +++ b/tests/test_modeling_pegasus.py @@ -1,4 +1,5 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# coding=utf-8 +# Copyright Google and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,116 +12,377 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch PEGASUS model. """ + +import copy +import tempfile import unittest -from transformers import AutoConfig, AutoTokenizer, is_torch_available +import timeout_decorator # noqa + +from transformers import is_torch_available from transformers.file_utils import cached_property -from transformers.models.pegasus.configuration_pegasus import task_specific_params from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device -from transformers.utils.logging import ERROR, set_verbosity -from .test_modeling_bart import PGE_ARTICLE -from .test_modeling_common import ModelTesterMixin -from .test_modeling_mbart import AbstractSeq2SeqIntegrationTest +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor if is_torch_available(): - from transformers import AutoModelForSeq2SeqLM, PegasusConfig, PegasusForConditionalGeneration, PegasusModel + import torch -XSUM_ENTRY_LONGER = """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. 
And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """ + from transformers import ( + PegasusConfig, + PegasusForConditionalGeneration, + PegasusForQuestionAnswering, + PegasusForSequenceClassification, + PegasusModel, + PegasusTokenizer, + ) + from transformers.models.pegasus.modeling_pegasus import ( + PegasusDecoder, + PegasusEncoder, + ) -set_verbosity(ERROR) + +def prepare_pegasus_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + } @require_torch -class ModelTester: - def __init__(self, parent): - self.config = PegasusConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, +class PegasusModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + 
self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = PegasusConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_pegasus_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict def prepare_config_and_inputs_for_common(self): - return self.config, {} + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = PegasusModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] -@require_torch -class SelectiveCommonTest(unittest.TestCase): - all_model_classes = (PegasusForConditionalGeneration, PegasusModel) if is_torch_available() else () + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) - test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = PegasusModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = PegasusEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = 
encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = PegasusDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class PegasusModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + (PegasusModel, PegasusForConditionalGeneration, PegasusForSequenceClassification, PegasusForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = (PegasusForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False def setUp(self): - self.model_tester = ModelTester(self) + self.model_tester = PegasusModelTester(self) + self.config_tester = ConfigTester(self, config_class=PegasusConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # PegasusForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (PegasusModel, PegasusForConditionalGeneration, PegasusForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = 
PegasusForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +TOLERANCE = 1e-4 @require_torch @require_sentencepiece @require_tokenizers -class PegasusXSUMIntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "google/pegasus-xsum" - src_text = [PGE_ARTICLE, XSUM_ENTRY_LONGER] - tgt_text = [ - "California's largest electricity provider has turned off power to hundreds of thousands of customers.", - "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.", - ] - +@slow +class PegasusModelIntegrationTests(unittest.TestCase): @cached_property - def model(self): - return AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) - - @slow - def test_pegasus_xsum_summary(self): - assert self.tokenizer.model_max_length == 512 - inputs = self.tokenizer(self.src_text, return_tensors="pt", truncation=True, max_length=512, padding=True).to( - torch_device + def default_tokenizer(self): + return PegasusTokenizer.from_pretrained('google/pegasus-large') + + def test_inference_no_head(self): + model = PegasusModel.from_pretrained('google/pegasus-large').to(torch_device) + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) + inputs_dict = prepare_pegasus_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, 1024)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device ) - assert inputs.input_ids.shape == (2, 421) - translated_tokens = self.model.generate(**inputs, num_beams=2) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - assert self.tgt_text == decoded - - if "cuda" not in torch_device: - return - # Demonstrate fp16 issue, Contributions welcome! 
- self.model.half() - translated_tokens_fp16 = self.model.generate(**inputs, max_length=10) - decoded_fp16 = self.tokenizer.batch_decode(translated_tokens_fp16, skip_special_tokens=True) - assert decoded_fp16 == [ - "California's largest electricity provider has begun", - "N-Dubz have revealed they were", + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_inference_head(self): + model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large').to(torch_device) + + # change to intended input + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + inputs_dict = prepare_pegasus_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, model.config.vocab_size)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_seq_to_seq_generation(self): + hf = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large').to(torch_device) + tok = PegasusTokenizer.from_pretrained('google/pegasus-large') + + batch_input = [ + # string 1, + # string 2, + # string 3, + # string 4, ] + # The below article tests that we don't add any hypotheses outside of the top n_beams + dct = tok.batch_encode_plus( + batch_input, + max_length=512, + padding="max_length", + truncation_strategy="only_first", + truncation=True, + return_tensors="pt", + ) + + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=2, + ) -class PegasusConfigTests(unittest.TestCase): - @slow - def test_task_specific_params(self): - """Test that task_specific params['summarization_xsum'] == config['pegasus_xsum'] """ - failures = [] - pegasus_prefix = "google/pegasus" - n_prefix_chars = len("summarization_") - for task, desired_settings in task_specific_params.items(): - dataset = task[n_prefix_chars:] - mname = f"{pegasus_prefix}-{dataset}" - cfg = AutoConfig.from_pretrained(mname) - for k, v in desired_settings.items(): - actual_value = getattr(cfg, k) - if actual_value != v: - failures.append(f"config for {mname} had {k}: {actual_value}, expected {v}") - tokenizer = AutoTokenizer.from_pretrained(mname) - n_pos_embeds = desired_settings["max_position_embeddings"] - if n_pos_embeds != tokenizer.model_max_length: - failures.append(f"tokenizer.model_max_length {tokenizer.model_max_length} expected {n_pos_embeds}") - - # error - all_fails = "\n".join(failures) - assert not failures, f"The following configs have unexpected settings: {all_fails}" + EXPECTED = [ + # here expected 1, + # here expected 2, + # here expected 3, + # here expected 4, + ] + + generated = tok.batch_decode( + hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True + ) + assert generated == EXPECTED From 47a5d9a7694a10f215aee3d998a00e24e3bc103e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 30 Dec 2020 12:47:41 +0000 Subject: [PATCH 07/51] save intermediate --- src/transformers/models/pegasus/__init__.py | 5 +- .../models/pegasus/modeling_pegasus.py | 59 +++++++++++++------ 2 files 
changed, 43 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/pegasus/__init__.py b/src/transformers/models/pegasus/__init__.py index ebb3152a34abf4..b0825881c9a537 100644 --- a/src/transformers/models/pegasus/__init__.py +++ b/src/transformers/models/pegasus/__init__.py @@ -15,7 +15,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...file_utils import is_torch_available, is_tokenizers_available +from ...file_utils import is_torch_available, is_tokenizers_available, is_tf_available from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig from .tokenization_pegasus import PegasusTokenizer @@ -32,4 +32,5 @@ PegasusPreTrainedModel, ) - +if is_tf_available(): + from .modeling_tf_pegasus import TFPegasusForConditionalGeneration diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 7fa0ee66c3c0e2..c7dfd956e00cd5 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -21,6 +21,8 @@ import torch import torch.nn.functional as F from torch import nn +import numpy as np +import math from torch.nn import CrossEntropyLoss from ...activations import ACT2FN @@ -113,16 +115,31 @@ def PegasusLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwis return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) -class PegasusLearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. - """ +class PegasusSinusoidalPositionalEmbedding(nn.Embedding): + """This module produces sinusoidal positional embeddings of any length.""" - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): - assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings - super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__(num_positions, embedding_dim) + self.weight = self._init_weight(self.weight) + @staticmethod + def _init_weight(out: nn.Parameter): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. [dim // 2:] + """ + n_pos, dim = out.shape + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + out.requires_grad = False # set early to avoid an error in pytorch-1.8+ + sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 + out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + return out + + @torch.no_grad() def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): """`input_ids_shape` is expected to be [bsz x seqlen].""" bsz, seq_len = input_ids_shape[:2] @@ -293,20 +310,20 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out This requires the attentions tensor to be reshaped in this function. 
""" residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) hidden_states, attn_weights, _ = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions ) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): clamp_value = torch.finfo(hidden_states.dtype).max - 1000 @@ -370,6 +387,7 @@ def forward( This requires the attentions tensor to be reshaped in this function. """ residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) # Self Attention # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 @@ -383,13 +401,13 @@ def forward( ) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.self_attn_layer_norm(hidden_states) # Cross-Attention Block cross_attn_present_key_value = None cross_attn_weights = None if encoder_hidden_states is not None: residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None @@ -402,19 +420,18 @@ def forward( ) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.encoder_attn_layer_norm(hidden_states) # add cross-attn to positions 3,4 of present_key_value tuple present_key_value = present_key_value + cross_attn_present_key_value # Fully Connected residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states - hidden_states = self.final_layer_norm(hidden_states) outputs = (hidden_states,) @@ -461,6 +478,8 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, PegasusSinusoidalPositionalEmbedding): + pass elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: @@ -601,13 +620,13 @@ def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = else: self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) - self.embed_positions = PegasusLearnedPositionalEmbedding( + self.embed_positions = PegasusSinusoidalPositionalEmbedding( config.max_position_embeddings, embed_dim, self.padding_idx, ) self.layers = nn.ModuleList([PegasusEncoderLayer(config) for _ in range(config.encoder_layers)]) - 
self.layernorm_embedding = PegasusLayerNorm(embed_dim) + self.layer_norm = PegasusLayerNorm(config.d_model) self.init_weights() @@ -674,7 +693,6 @@ def forward( embed_pos = self.embed_positions(input_shape) hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask @@ -713,6 +731,8 @@ def custom_forward(*inputs): if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: encoder_states = encoder_states + (hidden_states,) @@ -745,13 +765,13 @@ def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = else: self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) - self.embed_positions = PegasusLearnedPositionalEmbedding( + self.embed_positions = PegasusSinusoidalPositionalEmbedding( config.max_position_embeddings, config.d_model, self.padding_idx, ) self.layers = nn.ModuleList([PegasusDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layernorm_embedding = PegasusLayerNorm(config.d_model) + self.layer_norm = PegasusLayerNorm(config.d_model) self.init_weights() @@ -865,7 +885,6 @@ def forward( positions = self.embed_positions(input_shape, past_key_values_length) hidden_states = inputs_embeds + positions - hidden_states = self.layernorm_embedding(hidden_states) hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) @@ -925,6 +944,8 @@ def custom_forward(*inputs): all_self_attns += (layer_outputs[1],) all_cross_attentions += (layer_outputs[2],) + hidden_states = self.layer_norm(hidden_states) + # add hidden states from the last decoder layer if output_hidden_states: all_hidden_states += (hidden_states,) From 047f6c9b6d57b0eb8d52c96cfd83e67f97b4426d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 15:45:50 +0000 Subject: [PATCH 08/51] correct pegasus --- src/transformers/__init__.py | 7 +- src/transformers/models/auto/modeling_auto.py | 9 +- .../models/mbart/modeling_mbart.py | 10 ++ src/transformers/models/pegasus/__init__.py | 3 +- .../models/pegasus/configuration_pegasus.py | 19 ++- .../models/pegasus/modeling_pegasus.py | 34 +++-- tests/test_modeling_mbart.py | 4 +- tests/test_modeling_pegasus.py | 125 ++++++------------ 8 files changed, 102 insertions(+), 109 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d4b766fffdb982..3e1e55f6c8ebca 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -569,7 +569,12 @@ OpenAIGPTPreTrainedModel, load_tf_weights_in_openai_gpt, ) - from .models.pegasus import PegasusForConditionalGeneration, PegasusModel + from .models.pegasus import ( + PegasusForConditionalGeneration, + PegasusForQuestionAnswering, + PegasusForSequenceClassification, + PegasusModel, + ) from .models.prophetnet import ( PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, ProphetNetDecoder, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index bb1a6258e6d873..a5e4e01040e64f 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -137,7 +137,12 @@ ) from ..mt5.modeling_mt5 import MT5ForConditionalGeneration, MT5Model from ..openai.modeling_openai import OpenAIGPTForSequenceClassification, OpenAIGPTLMHeadModel, OpenAIGPTModel -from ..pegasus.modeling_pegasus import 
PegasusForConditionalGeneration, PegasusModel +from ..pegasus.modeling_pegasus import ( + PegasusForConditionalGeneration, + PegasusForQuestionAnswering, + PegasusForSequenceClassification, + PegasusModel, +) from ..prophetnet.modeling_prophetnet import ProphetNetForCausalLM, ProphetNetForConditionalGeneration, ProphetNetModel from ..rag.modeling_rag import ( # noqa: F401 - need to import all RagModels to be in globals() function RagModel, @@ -453,6 +458,7 @@ (TransfoXLConfig, TransfoXLForSequenceClassification), (MPNetConfig, MPNetForSequenceClassification), (TapasConfig, TapasForSequenceClassification), + (PegasusConfig, PegasusForSequenceClassification), ] ) @@ -478,6 +484,7 @@ (FunnelConfig, FunnelForQuestionAnswering), (LxmertConfig, LxmertForQuestionAnswering), (MPNetConfig, MPNetForQuestionAnswering), + (PegasusConfig, PegasusForQuestionAnswering), ] ) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index b4ebcf269a5601..af2b2fa9d807b7 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1207,6 +1207,16 @@ def prepare_inputs_for_generation( "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_id_to_be_generated(logits, self.config.eos_token_id) + return logits + + @staticmethod + def _force_token_id_to_be_generated(scores, token_id) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" + scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") + @staticmethod def _reorder_cache(past, beam_idx): reordered_past = () diff --git a/src/transformers/models/pegasus/__init__.py b/src/transformers/models/pegasus/__init__.py index b0825881c9a537..2ea286ab201ee9 100644 --- a/src/transformers/models/pegasus/__init__.py +++ b/src/transformers/models/pegasus/__init__.py @@ -15,10 +15,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...file_utils import is_torch_available, is_tokenizers_available, is_tf_available +from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig from .tokenization_pegasus import PegasusTokenizer + if is_tokenizers_available(): from .tokenization_pegasus_fast import PegasusTokenizerFast diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index aab4d1987ddd58..745a66d11d81cf 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -28,14 +28,13 @@ class PegasusConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.PegasusModel`. - It is used to instantiate an PEGASUS model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of - the PEGASUS `google/pegasus-large `__ architecture. + This is the configuration class to store the configuration of a :class:`~transformers.PegasusModel`. 
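For illustration only, not part of the patch: the modeling hunks above move each layer norm in front of its sub-layer and add a final layer norm after the stack, i.e. Pegasus uses a pre-layer-norm block where the previous BART-derived code normalized after the residual add. A minimal sketch of that ordering, with hypothetical module and dimension names:

import torch
from torch import nn

class PreNormFeedForwardBlock(nn.Module):
    # Simplified illustration: normalize first, run the sub-layer, then add the residual.
    def __init__(self, d_model=16, d_ff=32):
        super().__init__()
        self.final_layer_norm = nn.LayerNorm(d_model)
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, hidden_states):
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)  # pre-norm: applied before the sub-layer
        hidden_states = self.fc2(torch.relu(self.fc1(hidden_states)))
        return residual + hidden_states  # residual added afterwards, no layer norm here

block = PreNormFeedForwardBlock()
print(block(torch.randn(2, 5, 16)).shape)  # torch.Size([2, 5, 16])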
It is used to + instantiate an PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the PEGASUS `google/pegasus-large + `__ architecture. - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used - to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` - for more information. + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. Args: @@ -80,7 +79,8 @@ class PegasusConfig(PretrainedConfig): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not the model should return the last key/values attentions (not used by all models). + Whether or not the model should return the last key/values attentions (not used by all models) + Example:: >>> from transformers import PegasusModel, PegasusConfig @@ -131,7 +131,7 @@ def __init__( eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, decoder_start_token_id=decoder_start_token_id, - **kwargs + **kwargs, ) self.vocab_size = vocab_size @@ -156,7 +156,6 @@ def __init__( self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - @property def num_attention_heads(self) -> int: return self.encoder_attention_heads diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index c7dfd956e00cd5..7a06b68e916761 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -15,14 +15,14 @@ """ PyTorch PEGASUS model. """ +import math import random from typing import Optional, Tuple +import numpy as np import torch import torch.nn.functional as F from torch import nn -import numpy as np -import math from torch.nn import CrossEntropyLoss from ...activations import ACT2FN @@ -88,9 +88,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) -def _expand_mask( - mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None -): +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ @@ -507,10 +505,9 @@ def dummy_inputs(self): Parameters: config (:class:`~transformers.PegasusConfig`): - Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model - weights. + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ PEGASUS_GENERATION_EXAMPLE = r""" @@ -554,8 +551,8 @@ def dummy_inputs(self): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. 
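For illustration only, not part of the patch: when a caller passes labels without decoder_input_ids, the forward pass further below derives the decoder inputs by shifting the labels one position to the right and prepending decoder_start_token_id, which is what the shift_tokens_right helper does. A tiny worked example with made-up token ids:

import torch

def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    # Same idea as the helper in the patch: prepend the start token, drop the last token,
    # and replace any -100 (ignored label positions) with the pad token id.
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 1:] = input_ids[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    shifted.masked_fill_(shifted == -100, pad_token_id)
    return shifted

labels = torch.tensor([[31414, 232, 328, 1, -100]])  # made-up ids; 1 stands in for eos, -100 is ignored
print(shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=0))
# tensor([[    0, 31414,   232,   328,     1]])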
Causal mask will also be used by default. - If you want to change padding behavior, you should read :func:`modeling_pegasus._prepare_decoder_inputs` and - modify to your needs. See diagram 1 in `the paper `__ for more + If you want to change padding behavior, you should read :func:`modeling_pegasus._prepare_decoder_inputs` + and modify to your needs. See diagram 1 in `the paper `__ for more information on the default strategy. encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: @@ -1081,6 +1078,7 @@ class PegasusForConditionalGeneration(PegasusPreTrainedModel): r"encoder\.version", r"decoder\.version", r"lm_head\.weight", + r"embed_positions\.weight", ] def __init__(self, config: PegasusConfig): @@ -1164,7 +1162,9 @@ def forward( if labels is not None: if decoder_input_ids is None: - decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) outputs = self.model( input_ids, @@ -1219,6 +1219,16 @@ def prepare_inputs_for_generation( "use_cache": use_cache, # change this to avoid caching (presumably for debugging) } + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_id_to_be_generated(logits, self.config.eos_token_id) + return logits + + @staticmethod + def _force_token_id_to_be_generated(scores, token_id) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" + scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") + @staticmethod def _reorder_cache(past, beam_idx): reordered_past = () diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index d4ea6f400abad1..6dc3a3bac19a32 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -34,13 +34,13 @@ import torch from transformers import ( + AutoTokenizer, BatchEncoding, MBartConfig, MBartForConditionalGeneration, MBartForQuestionAnswering, MBartForSequenceClassification, MBartModel, - MBartTokenizer, ) from transformers.models.mbart.modeling_mbart import MBartDecoder, MBartEncoder @@ -313,7 +313,7 @@ class AbstractSeq2SeqIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): - cls.tokenizer = MBartTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) + cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) return cls @cached_property diff --git a/tests/test_modeling_pegasus.py b/tests/test_modeling_pegasus.py index 835fb148544f4f..ae34e0b1b3a375 100644 --- a/tests/test_modeling_pegasus.py +++ b/tests/test_modeling_pegasus.py @@ -28,23 +28,21 @@ from .test_configuration_common import ConfigTester from .test_generation_utils import GenerationTesterMixin from .test_modeling_common import ModelTesterMixin, ids_tensor +from .test_modeling_mbart import AbstractSeq2SeqIntegrationTest if is_torch_available(): import torch from transformers import ( + AutoModelForSeq2SeqLM, PegasusConfig, PegasusForConditionalGeneration, PegasusForQuestionAnswering, PegasusForSequenceClassification, PegasusModel, - PegasusTokenizer, - ) - from transformers.models.pegasus.modeling_pegasus import ( - PegasusDecoder, - PegasusEncoder, ) + from transformers.models.pegasus.modeling_pegasus import 
PegasusDecoder, PegasusEncoder def prepare_pegasus_inputs_dict( @@ -157,7 +155,9 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] - output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] # select random slice random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() @@ -304,85 +304,46 @@ def _long_tensor(tok_lst): return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) -TOLERANCE = 1e-4 - - @require_torch @require_sentencepiece @require_tokenizers -@slow -class PegasusModelIntegrationTests(unittest.TestCase): - @cached_property - def default_tokenizer(self): - return PegasusTokenizer.from_pretrained('google/pegasus-large') - - def test_inference_no_head(self): - model = PegasusModel.from_pretrained('google/pegasus-large').to(torch_device) - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) - inputs_dict = prepare_pegasus_inputs_dict(model.config, input_ids, decoder_input_ids) - with torch.no_grad(): - output = model(**inputs_dict)[0] - expected_shape = torch.Size((1, 11, 1024)) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = torch.tensor( - [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device - ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_inference_head(self): - model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large').to(torch_device) - - # change to intended input - input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - inputs_dict = prepare_pegasus_inputs_dict(model.config, input_ids, decoder_input_ids) - with torch.no_grad(): - output = model(**inputs_dict)[0] - expected_shape = torch.Size((1, 11, model.config.vocab_size)) - self.assertEqual(output.shape, expected_shape) - # change to expected output here - expected_slice = torch.tensor( - [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device - ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) - - def test_seq_to_seq_generation(self): - hf = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large').to(torch_device) - tok = PegasusTokenizer.from_pretrained('google/pegasus-large') +class PegasusXSUMIntegrationTest(AbstractSeq2SeqIntegrationTest): + checkpoint_name = "google/pegasus-xsum" + src_text = [ + """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. 
Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""", + """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """, + ] + + tgt_text = [ + "California's largest electricity provider has turned off power to hundreds of thousands of customers.", + "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.", + ] - batch_input = [ - # string 1, - # string 2, - # string 3, - # string 4, - ] - - # The below article tests that we don't add any hypotheses outside of the top n_beams - dct = tok.batch_encode_plus( - batch_input, - max_length=512, - padding="max_length", - truncation_strategy="only_first", - truncation=True, - return_tensors="pt", - ) - - hypotheses_batch = hf.generate( - input_ids=dct["input_ids"].to(torch_device), - attention_mask=dct["attention_mask"].to(torch_device), - num_beams=2, + @cached_property + def model(self): + return AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) + + @slow + def test_pegasus_xsum_summary(self): + assert self.tokenizer.model_max_length == 512 + inputs = self.tokenizer(self.src_text, return_tensors="pt", truncation=True, max_length=512, padding=True).to( + torch_device ) - - EXPECTED = [ - # here expected 1, - # here expected 2, - # here expected 3, - # here expected 4, + assert inputs.input_ids.shape == (2, 421) + translated_tokens = self.model.generate(**inputs, num_beams=2) + decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) + import ipdb + + ipdb.set_trace() + assert self.tgt_text == decoded + + if "cuda" not in torch_device: + return + # Demonstrate fp16 issue, Contributions welcome! 
+ self.model.half() + translated_tokens_fp16 = self.model.generate(**inputs, max_length=10) + decoded_fp16 = self.tokenizer.batch_decode(translated_tokens_fp16, skip_special_tokens=True) + assert decoded_fp16 == [ + "California's largest electricity provider has begun", + "N-Dubz have revealed they were", ] - - generated = tok.batch_decode( - hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True - ) - assert generated == EXPECTED From e4988c35c6b25ddb116abe25c6f58afe98e174ef Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 16:01:21 +0000 Subject: [PATCH 09/51] finish pegasus --- src/transformers/models/pegasus/configuration_pegasus.py | 2 +- src/transformers/models/pegasus/modeling_pegasus.py | 1 + tests/test_modeling_pegasus.py | 3 --- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 745a66d11d81cf..2bfbc40d55b2f2 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -116,7 +116,7 @@ def __init__( attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - decoder_start_token_id=2, + decoder_start_token_id=0, classifier_dropout=0.0, scale_embedding=False, gradient_checkpointing=False, diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 7a06b68e916761..bd72de47c15ad2 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -690,6 +690,7 @@ def forward( embed_pos = self.embed_positions(input_shape) hidden_states = inputs_embeds + embed_pos + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask diff --git a/tests/test_modeling_pegasus.py b/tests/test_modeling_pegasus.py index ae34e0b1b3a375..7f152fa1e84404 100644 --- a/tests/test_modeling_pegasus.py +++ b/tests/test_modeling_pegasus.py @@ -332,9 +332,6 @@ def test_pegasus_xsum_summary(self): assert inputs.input_ids.shape == (2, 421) translated_tokens = self.model.generate(**inputs, num_beams=2) decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - import ipdb - - ipdb.set_trace() assert self.tgt_text == decoded if "cuda" not in torch_device: From 62692917fa5eba0258eeb8b90753f8b8b1a61334 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 16:01:52 +0000 Subject: [PATCH 10/51] remove cookie cutter leftover --- cookiecutter-template-PEGASUS/pegasus.rst | 80 ------- .../to_replace_pegasus.py | 212 ------------------ .../tokenization_fast_pegasus.py | 53 ----- .../tokenization_pegasus.py | 51 ----- 4 files changed, 396 deletions(-) delete mode 100644 cookiecutter-template-PEGASUS/pegasus.rst delete mode 100644 cookiecutter-template-PEGASUS/to_replace_pegasus.py delete mode 100644 cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py delete mode 100644 cookiecutter-template-PEGASUS/tokenization_pegasus.py diff --git a/cookiecutter-template-PEGASUS/pegasus.rst b/cookiecutter-template-PEGASUS/pegasus.rst deleted file mode 100644 index 434ef1a5df75e8..00000000000000 --- a/cookiecutter-template-PEGASUS/pegasus.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. - Copyright 2020 The HuggingFace Team. All rights reserved. 
- - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the - specific language governing permissions and limitations under the License. - -PEGASUS ------------------------------------------------------------------------------------------------------------------------ - -Overview -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The PEGASUS model was proposed in ` -<>`__ by . - -The abstract from the paper is the following: - -** - -Tips: - - - -PegasusConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusConfig - :members: - - -PegasusTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -PegasusTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusTokenizerFast - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary - - -PegasusModel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusModel - :members: forward - - -PegasusForConditionalGeneration -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusForConditionalGeneration - :members: forward - - -PegasusForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusForSequenceClassification - :members: forward - - -PegasusForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusForQuestionAnswering - :members: forward - - diff --git a/cookiecutter-template-PEGASUS/to_replace_pegasus.py b/cookiecutter-template-PEGASUS/to_replace_pegasus.py deleted file mode 100644 index 0323f62749f407..00000000000000 --- a/cookiecutter-template-PEGASUS/to_replace_pegasus.py +++ /dev/null @@ -1,212 +0,0 @@ -## Copyright 2020 The HuggingFace Team. All rights reserved. -## -## Licensed under the Apache License, Version 2.0 (the "License"); -## you may not use this file except in compliance with the License. -## You may obtain a copy of the License at -## -## http://www.apache.org/licenses/LICENSE-2.0 -## -## Unless required by applicable law or agreed to in writing, software -## distributed under the License is distributed on an "AS IS" BASIS, -## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-## See the License for the specific language governing permissions and -## limitations under the License. - -## This file is made so that specific statements may be copied inside existing files. This is useful to copy -## import statements in __init__.py, or to complete model lists in the AUTO files. -## -## It is to be used as such: -## Put '# To replace in: "FILE_PATH"' in order to indicate the contents will be copied in the file at path FILE_PATH -## Put '# Below: "STATEMENT"' in order to copy the contents below **the first occurence** of that line in the file at FILE_PATH -## Put '# Replace with:' followed by the lines containing the content to define the content -## End a statement with '# End.'. If starting a new statement without redefining the FILE_PATH, it will continue pasting -## content in that file. -## -## Put '## COMMENT' to comment on the file. - - -# To replace in: "src/transformers/__init__.py" -# Below: "if is_torch_available():" if generating PyTorch -# Replace with: - - from .models.pegasus import ( - PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST, - PegasusForConditionalGeneration, - PegasusForQuestionAnswering, - PegasusForSequenceClassification, - PegasusModel, - ) -# End. - -# Below: "if is_tf_available():" if generating TensorFlow -# Replace with: - - from .models.pegasus import ( - TFPegasusForConditionalGeneration, - TFPegasusModel, - TFPegasusPreTrainedModel, - ) -# End. - -# Below: "if is_tokenizers_available():" -# Replace with: - from .models.pegasus import PegasusTokenizerFast -# End. - -# Below: "from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig" -# Replace with: -from .models.pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer -# End. - - - -# To replace in: "src/transformers/models/auto/configuration_auto.py" -# Below: "# Add configs here" -# Replace with: - ("pegasus", PegasusConfig), -# End. - -# Below: "# Add archive maps here" -# Replace with: - PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, -# End. - -# Below: "from ..albert.configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig", -# Replace with: -from ..pegasus.configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig -# End. - -# Below: "# Add full (and cased) model names here" -# Replace with: - ("pegasus", "Pegasus"), -# End. - - - -# To replace in: "src/transformers/models/auto/modeling_auto.py" if generating PyTorch -# Below: "from .configuration_auto import (" -# Replace with: - PegasusConfig, -# End. - -# Below: "# Add modeling imports here" -# Replace with: -from ..pegasus.modeling_pegasus import ( - PegasusForConditionalGeneration, - PegasusForQuestionAnswering, - PegasusForSequenceClassification, - PegasusModel, -) -# End. - -# Below: "# Base model mapping" -# Replace with: - (PegasusConfig, PegasusModel), -# End. - -# Below: "# Model with LM heads mapping" -# Replace with: - - (PegasusConfig, PegasusForConditionalGeneration), -# End. - -# Below: "# Model for Causal LM mapping" -# Replace with: -# End. - -# Below: "# Model for Masked LM mapping" -# Replace with: -# End. - -# Below: "# Model for Sequence Classification mapping" -# Replace with: - (PegasusConfig, PegasusForSequenceClassification), -# End. - -# Below: "# Model for Question Answering mapping" -# Replace with: - (PegasusConfig, PegasusForQuestionAnswering), -# End. - -# Below: "# Model for Token Classification mapping" -# Replace with: -# End. - -# Below: "# Model for Multiple Choice mapping" -# Replace with: -# End. 
- -# Below: "# Model for Seq2Seq Causal LM mapping" -# Replace with: - - (PegasusConfig, PegasusForConditionalGeneration), -# End. - -# To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow -# Below: "from .configuration_auto import (" -# Replace with: - PegasusConfig, -# End. - -# Below: "# Add modeling imports here" -# Replace with: -from ..pegasus.modeling_tf_pegasus import ( - TFPegasusForConditionalGeneration, - TFPegasusModel, -) -# End. - -# Below: "# Base model mapping" -# Replace with: - (PegasusConfig, TFPegasusModel), -# End. - -# Below: "# Model with LM heads mapping" -# Replace with: - - (PegasusConfig, TFPegasusForConditionalGeneration), -# End. - -# Below: "# Model for Causal LM mapping" -# Replace with: -# End. - -# Below: "# Model for Masked LM mapping" -# Replace with: -# End. - -# Below: "# Model for Sequence Classification mapping" -# Replace with: -# End. - -# Below: "# Model for Question Answering mapping" -# Replace with: -# End. - -# Below: "# Model for Token Classification mapping" -# Replace with: -# End. - -# Below: "# Model for Multiple Choice mapping" -# Replace with: -# End. - -# Below: "# Model for Seq2Seq Causal LM mapping" -# Replace with: - - (PegasusConfig, TFPegasusForConditionalGeneration), -# End. - -# To replace in: "utils/check_repo.py" if generating PyTorch - -# Below: "models to ignore for model xxx mapping" -# Replace with: -"PegasusEncoder", - "PegasusDecoder", -# End. - -# Below: "models to ignore for not tested" -# Replace with: -"PegasusEncoder", # Building part of bigger (tested) model. - "PegasusDecoder", # Building part of bigger (tested) model. -# End. diff --git a/cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py b/cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py deleted file mode 100644 index 9efe75cbf22685..00000000000000 --- a/cookiecutter-template-PEGASUS/tokenization_fast_pegasus.py +++ /dev/null @@ -1,53 +0,0 @@ -# coding=utf-8 -# Copyright Google and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for PEGASUS.""" -from ...utils import logging -from ..bart.tokenization_bart_fast import BartTokenizerFast -from .tokenization_pegasus import PegasusTokenizer - - -logger = logging.get_logger(__name__) - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/vocab.json", - }, - "merges_file": { - "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/merges.txt", - }, - "tokenizer_file": { - "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/tokenizer.json", - }, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "google/pegasus-large": 1024, -} - - -class PegasusTokenizerFast(BartTokenizerFast): - r""" - Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). 
- - :class:`~transformers.PegasusTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = PegasusTokenizer diff --git a/cookiecutter-template-PEGASUS/tokenization_pegasus.py b/cookiecutter-template-PEGASUS/tokenization_pegasus.py deleted file mode 100644 index c19a0a856020c8..00000000000000 --- a/cookiecutter-template-PEGASUS/tokenization_pegasus.py +++ /dev/null @@ -1,51 +0,0 @@ -# coding=utf-8 -# Copyright Google and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for PEGASUS.""" -from ...utils import logging -from ..bart.tokenization_bart import BartTokenizer - - -logger = logging.get_logger(__name__) - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/vocab.json", - }, - "merges_file": { - "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/merges.txt", - }, - "tokenizer_file": { - "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/tokenizer.json", - }, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "google/pegasus-large": 1024, -} - - -class PegasusTokenizer(BartTokenizer): - """ - Construct a PEGASUS tokenizer. - - :class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end - tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning - parameters. 
- """ - - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES From 88792faa7556e6c0b0c76b669cd85fcbc7f130f0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 17:04:00 +0000 Subject: [PATCH 11/51] add marian --- src/transformers/__init__.py | 2 +- src/transformers/models/auto/modeling_auto.py | 3 +- src/transformers/models/marian/__init__.py | 16 +- .../models/marian/configuration_marian.py | 155 +- .../models/marian/modeling_marian.py | 1220 +++++++++++++++- .../models/old_marian/__init__.py | 30 + .../models/old_marian/configuration_marian.py | 100 ++ .../convert_marian_tatoeba_to_pytorch.py | 1268 +++++++++++++++++ .../old_marian/convert_marian_to_pytorch.py | 632 ++++++++ .../models/old_marian/modeling_marian.py | 63 + .../models/old_marian/modeling_tf_marian.py | 52 + .../models/old_marian/tokenization_marian.py | 291 ++++ tests/test_modeling_marian.py | 271 +++- tests/test_modeling_old_marian.py | 334 +++++ 14 files changed, 4332 insertions(+), 105 deletions(-) mode change 100644 => 100755 src/transformers/models/marian/modeling_marian.py create mode 100644 src/transformers/models/old_marian/__init__.py create mode 100644 src/transformers/models/old_marian/configuration_marian.py create mode 100644 src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py create mode 100644 src/transformers/models/old_marian/convert_marian_to_pytorch.py create mode 100644 src/transformers/models/old_marian/modeling_marian.py create mode 100644 src/transformers/models/old_marian/modeling_tf_marian.py create mode 100644 src/transformers/models/old_marian/tokenization_marian.py create mode 100644 tests/test_modeling_old_marian.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3e1e55f6c8ebca..8121682297f3f8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -526,7 +526,7 @@ LxmertVisualFeatureEncoder, LxmertXLayer, ) - from .models.marian import MarianMTModel + from .models.marian import MarianModel, MarianMTModel from .models.mbart import ( MBartForConditionalGeneration, MBartForQuestionAnswering, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a5e4e01040e64f..79042008815a05 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -110,7 +110,7 @@ LongformerModel, ) from ..lxmert.modeling_lxmert import LxmertForPreTraining, LxmertForQuestionAnswering, LxmertModel -from ..marian.modeling_marian import MarianMTModel +from ..marian.modeling_marian import MarianModel, MarianMTModel from ..mbart.modeling_mbart import ( MBartForConditionalGeneration, MBartForQuestionAnswering, @@ -299,6 +299,7 @@ (ProphetNetConfig, ProphetNetModel), (MPNetConfig, MPNetModel), (TapasConfig, TapasModel), + (MarianConfig, MarianModel), ] ) diff --git a/src/transformers/models/marian/__init__.py b/src/transformers/models/marian/__init__.py index bf7be4bd92579c..e9e2ee4cc7d112 100644 --- a/src/transformers/models/marian/__init__.py +++ b/src/transformers/models/marian/__init__.py @@ -15,16 +15,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available +from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig +from .tokenization_marian import MarianTokenizer -from ...file_utils import is_sentencepiece_available, is_tf_available, is_torch_available -from .configuration_marian import MarianConfig - - -if is_sentencepiece_available(): - from .tokenization_marian import MarianTokenizer if is_torch_available(): - from .modeling_marian import MarianMTModel + from .modeling_marian import ( + MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST, + MarianModel, + MarianMTModel, + MarianPreTrainedModel, + ) if is_tf_available(): from .modeling_tf_marian import TFMarianMTModel diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index a17531bb2f4d8f..885ef5e8a67a85 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team. +# Copyright The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,40 +14,48 @@ # limitations under the License. """ Marian model configuration """ -from ..bart.configuration_bart import BartConfig +from ...configuration_utils import PretrainedConfig +from ...utils import logging -PRETRAINED_CONFIG_ARCHIVE_MAP = { +logger = logging.get_logger(__name__) + +MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP = { "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/config.json", + # See all Marian models at https://huggingface.co/models?filter=marian } -class MarianConfig(BartConfig): - """ - This is the configuration class to store the configuration of a :class:`~transformers.MarianMTModel`. It is used to - instantiate a Marian model according to the specified arguments, defining the model architecture. +class MarianConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.MarianModel`. It is used to + instantiate an Marian model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Marian + `Helsinki-NLP/opus-mt-en-de `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + Args: - vocab_size (:obj:`int`, `optional`, defaults to 58101): + vocab_size (:obj:`int`, `optional`, defaults to 50265): Vocabulary size of the Marian model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.MarianMTModel`. - d_model (:obj:`int`, `optional`, defaults to 512): + :obj:`inputs_ids` passed when calling :class:`~transformers.MarianModel` or + :class:`~transformers.TFMarianModel`. + d_model (:obj:`int`, `optional`, defaults to 1024): Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 6): + encoder_layers (:obj:`int`, `optional`, defaults to 12): Number of encoder layers. 
- decoder_layers (:obj:`int`, `optional`, defaults to 6): + decoder_layers (:obj:`int`, `optional`, defaults to 12): Number of decoder layers. - encoder_attention_heads (:obj:`int`, `optional`, defaults to 8): + encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (:obj:`int`, `optional`, defaults to 8): + decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. - encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. @@ -59,42 +67,99 @@ class MarianConfig(BartConfig): The dropout ratio for activations inside the fully connected layer. classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): The dropout ratio for classifier. - max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm after embeddings. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): - Why not add another layernorm? - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. 
- is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model - force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). - """ + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models) + + Example:: + + >>> from transformers import MarianModel, MarianConfig + + >>> # Initializing a Marian Helsinki-NLP/opus-mt-en-de style configuration + >>> configuration = MarianConfig() + >>> # Initializing a model from the Helsinki-NLP/opus-mt-en-de style configuration + >>> model = MarianModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ model_type = "marian" - keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=58100, + classifier_dropout=0.0, + scale_embedding=False, + gradient_checkpointing=False, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.gradient_checkpointing = gradient_checkpointing + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py old mode 100644 new mode 100755 index 25d3dc1ea969c5..d79db98a7c4b3f --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. +# Copyright The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
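For illustration only, not part of the patch: the rewritten modeling file below keeps MarianMTModel as the OPUS-NMT translation head, so end-to-end usage stays along these lines. A minimal sketch against a published OPUS-MT checkpoint; the exact decoded string depends on the checkpoint revision:

from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-fr-en"  # published French-to-English OPUS-MT checkpoint
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

batch = tokenizer(["où est l'arrêt de bus ?"], return_tensors="pt", padding=True)
generated = model.generate(**batch)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
# expected along the lines of: ["Where is the bus stop ?"]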
@@ -15,49 +15,1223 @@ """PyTorch MarianMTModel model, ported from the Marian C++ repo.""" -from ..bart.modeling_bart import BartForConditionalGeneration +import math +import random +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging from .configuration_marian import MarianConfig -# See all Marian models at https://huggingface.co/models?search=Helsinki-NLP +logger = logging.get_logger(__name__) +_CONFIG_FOR_DOC = "MarianConfig" +_TOKENIZER_FOR_DOC = "MarianTokenizer" -class MarianMTModel(BartForConditionalGeneration): - r""" - Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Available - models are listed `here `__. - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. +MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Helsinki-NLP/opus-mt-en-de", + # See all Marian models at https://huggingface.co/models?filter=marian +] - Examples:: - >>> from transformers import MarianTokenizer, MarianMTModel - >>> from typing import List - >>> src = 'fr' # source language - >>> trg = 'en' # target language - >>> sample_text = "où est l'arrêt de bus ?" - >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id - >>> model = MarianMTModel.from_pretrained(mname) - >>> tok = MarianTokenizer.from_pretrained(mname) - >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text], return_tensors="pt") # don't need tgt_text for inference - >>> gen = model.generate(**batch) # for forward pass: model(**batch) - >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?" + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. 
""" + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +def MarianLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): + if torch.cuda.is_available(): + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) + + +class MarianSinusoidalPositionalEmbedding(nn.Embedding): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__(num_positions, embedding_dim) + self.weight = self._init_weight(self.weight) + + @staticmethod + def _init_weight(out: nn.Parameter): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. [dim // 2:] + """ + n_pos, dim = out.shape + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + out.requires_grad = False # set early to avoid an error in pytorch-1.8+ + sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 + out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + return out + + @torch.no_grad() + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +class MarianAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
+ self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class MarianEncoderLayer(nn.Module): + def __init__(self, config: MarianConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = MarianAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = MarianLayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = MarianLayerNorm(self.embed_dim) + + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
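+
+        Example (editorial sketch, not part of the original patch; the first dimension is the batch,
+        matching the attention implementation above)::
+
+            >>> config = MarianConfig(d_model=16, encoder_attention_heads=2, encoder_ffn_dim=32)
+            >>> layer = MarianEncoderLayer(config)
+            >>> hidden_states = torch.randn(2, 5, 16)  # (batch, seq_len, embed_dim)
+            >>> layer(hidden_states, attention_mask=None)[0].shape
+            torch.Size([2, 5, 16])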
+ """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class MarianDecoderLayer(nn.Module): + def __init__(self, config: MarianConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = MarianAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = MarianLayerNorm(self.embed_dim) + self.encoder_attn = MarianAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = MarianLayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = MarianLayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
+ """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class MarianClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class MarianPreTrainedModel(PreTrainedModel): config_class = MarianConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, MarianSinusoidalPositionalEmbedding): + pass + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def 
dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +MARIAN_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.MarianConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +MARIAN_GENERATION_EXAMPLE = r""" + Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. + Available models are listed `here `__. + + Examples:: + + >>> from transformers import MarianTokenizer, MarianMTModel + >>> from typing import List + >>> src = 'fr' # source language + >>> trg = 'en' # target language + >>> sample_text = "où est l'arrêt de bus ?" + >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' + + >>> model = MarianMTModel.from_pretrained(mname) + >>> tok = MarianTokenizer.from_pretrained(mname) + >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text], return_tensors="pt") # don't need tgt_text for inference + >>> gen = model.generate(**batch) + >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?" +""" + +MARIAN_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the :obj:`input_ids` to the right, following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_marian._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. 
+ encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class MarianEncoder(MarianPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`MarianEncoderLayer`. 
+ + Args: + config: MarianConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = MarianSinusoidalPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) + self.layers = nn.ModuleList([MarianEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
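+
+        Example (editorial sketch, not part of the original patch)::
+
+            >>> config = MarianConfig(vocab_size=100, d_model=16, encoder_layers=2, encoder_attention_heads=2, encoder_ffn_dim=32, max_position_embeddings=32)
+            >>> encoder = MarianEncoder(config)
+            >>> input_ids = torch.tensor([[5, 6, 7, 1]])
+            >>> outputs = encoder(input_ids=input_ids, output_hidden_states=True, return_dict=True)
+            >>> outputs.last_hidden_state.shape
+            torch.Size([1, 4, 16])
+            >>> len(outputs.hidden_states)  # embedding output plus one state per encoder layer
+            3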
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class MarianDecoder(MarianPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`MarianDecoderLayer` + + Args: + config: MarianConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = MarianSinusoidalPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + ) + self.layers = nn.ModuleList([MarianDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. 
This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_decoder_cache = () if use_cache else None + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False): + if 
use_cache: + raise ValueError( + "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`." + ) + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare Marian Model outputting raw hidden-states without any specific head on top.", + MARIAN_START_DOCSTRING, +) +class MarianModel(MarianPreTrainedModel): + def __init__(self, config: MarianConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = MarianEncoder(config, self.shared) + self.decoder = MarianDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(MARIAN_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="Helsinki-NLP/opus-mt-en-de", + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + 
inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The Marian Model with a language modeling head. Can be used for summarization.", MARIAN_START_DOCSTRING +) +class MarianMTModel(MarianPreTrainedModel): + base_model_prefix = "model" _keys_to_ignore_on_load_missing = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + r"embed_positions", ] + _keys_to_ignore_on_save = [ "model.encoder.embed_positions.weight", "model.decoder.embed_positions.weight", ] + def __init__(self, config: MarianConfig): + super().__init__(config) + self.model = MarianModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(MARIAN_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + 
@add_end_docstrings(MARIAN_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. + + Returns: + + Conditional generation example:: + + >>> from transformers import MarianTokenizer, MarianMTModel + >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + def adjust_logits_during_generation(self, logits, cur_len, max_length): logits[:, self.config.pad_token_id] = float("-inf") # never predict pad token. if cur_len == max_length - 1 and self.config.eos_token_id is not None: self._force_token_id_to_be_generated(logits, self.config.eos_token_id) return logits + + @staticmethod + def _force_token_id_to_be_generated(scores, token_id) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" + scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/src/transformers/models/old_marian/__init__.py b/src/transformers/models/old_marian/__init__.py new file mode 100644 index 00000000000000..bf7be4bd92579c --- /dev/null +++ b/src/transformers/models/old_marian/__init__.py @@ -0,0 +1,30 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...file_utils import is_sentencepiece_available, is_tf_available, is_torch_available +from .configuration_marian import MarianConfig + + +if is_sentencepiece_available(): + from .tokenization_marian import MarianTokenizer + +if is_torch_available(): + from .modeling_marian import MarianMTModel + +if is_tf_available(): + from .modeling_tf_marian import TFMarianMTModel diff --git a/src/transformers/models/old_marian/configuration_marian.py b/src/transformers/models/old_marian/configuration_marian.py new file mode 100644 index 00000000000000..a17531bb2f4d8f --- /dev/null +++ b/src/transformers/models/old_marian/configuration_marian.py @@ -0,0 +1,100 @@ +# coding=utf-8 +# Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Marian model configuration """ + +from ..bart.configuration_bart import BartConfig + + +PRETRAINED_CONFIG_ARCHIVE_MAP = { + "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/config.json", +} + + +class MarianConfig(BartConfig): + """ + This is the configuration class to store the configuration of a :class:`~transformers.MarianMTModel`. It is used to + instantiate a Marian model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 58101): + Vocabulary size of the Marian model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.MarianMTModel`. + d_model (:obj:`int`, `optional`, defaults to 512): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 6): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 6): + Number of decoder layers. + encoder_attention_heads (:obj:`int`, `optional`, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 8): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): + This should be completed, specific to marian. + normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`): + Call layernorm before attention ops. + normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Call layernorm after embeddings. + static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): + Don't learn positional embeddings, use sinusoidal. 
+ add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): + Why not add another layernorm? + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). + eos_token_id (:obj:`int`, `optional`, defaults to 2) + End of stream token id. + pad_token_id (:obj:`int`, `optional`, defaults to 1) + Padding token id. + bos_token_id (:obj:`int`, `optional`, defaults to 0) + Beginning of stream token id. + encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): + How many extra learned positional embeddings to use. + is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether this is an encoder/decoder model + force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). + """ + + model_type = "marian" + keys_to_ignore_at_inference = ["past_key_values"] diff --git a/src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py b/src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py new file mode 100644 index 00000000000000..0ab653e9a23a0b --- /dev/null +++ b/src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py @@ -0,0 +1,1268 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from pathlib import Path +from typing import List, Tuple + +from transformers.models.marian.convert_marian_to_pytorch import ( + FRONT_MATTER_TEMPLATE, + _parse_readme, + convert_all_sentencepiece_models, + get_system_metadata, + remove_prefix, + remove_suffix, +) + + +try: + import pandas as pd +except ImportError: + pass + +DEFAULT_REPO = "Tatoeba-Challenge" +DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models") +LANG_CODE_URL = "https://datahub.io/core/language-codes/r/language-codes-3b2.csv" +ISO_URL = "https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv" +ISO_PATH = "lang_code_data/iso-639-3.csv" +LANG_CODE_PATH = "lang_code_data/language-codes-3b2.csv" + + +class TatoebaConverter: + """ + Convert Tatoeba-Challenge models to huggingface format. + + Steps: + + 1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion). + 2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique + one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en + 3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group + members. 
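+
+    Illustrative usage (editorial sketch, not part of the original patch; assumes the
+    Tatoeba-Challenge repository has already been cloned as required by ``__init__``)::
+
+        converter = TatoebaConverter(save_dir="marian_converted")
+        converter.convert_models(["heb-eng"], dry_run=True)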
+ """ + + def __init__(self, save_dir="marian_converted"): + assert Path(DEFAULT_REPO).exists(), "need git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git" + reg = self.make_tatoeba_registry() + self.download_metadata() + self.registry = reg + reg_df = pd.DataFrame(reg, columns=["id", "prepro", "url_model", "url_test_set"]) + assert reg_df.id.value_counts().max() == 1 + reg_df = reg_df.set_index("id") + reg_df["src"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[0]).values + reg_df["tgt"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[1]).values + + released_cols = [ + "url_base", + "pair", # (ISO639-3/ISO639-5 codes), + "short_pair", # (reduced codes), + "chrF2_score", + "bleu", + "brevity_penalty", + "ref_len", + "src_name", + "tgt_name", + ] + + released = pd.read_csv("Tatoeba-Challenge/models/released-models.txt", sep="\t", header=None).iloc[:-1] + released.columns = released_cols + released["fname"] = released["url_base"].apply( + lambda x: remove_suffix(remove_prefix(x, "https://object.pouta.csc.fi/Tatoeba-Challenge/opus"), ".zip") + ) + + released["2m"] = released.fname.str.startswith("2m") + released["date"] = pd.to_datetime( + released["fname"].apply(lambda x: remove_prefix(remove_prefix(x, "2m-"), "-")) + ) + + released["base_ext"] = released.url_base.apply(lambda x: Path(x).name) + reg_df["base_ext"] = reg_df.url_model.apply(lambda x: Path(x).name) + + metadata_new = reg_df.reset_index().merge(released.rename(columns={"pair": "id"}), on=["base_ext", "id"]) + + metadata_renamer = {"src": "src_alpha3", "tgt": "tgt_alpha3", "id": "long_pair", "date": "train_date"} + metadata_new = metadata_new.rename(columns=metadata_renamer) + + metadata_new["src_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[0]) + metadata_new["tgt_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[1]) + DROP_COLS_BOTH = ["url_base", "base_ext", "fname"] + + metadata_new = metadata_new.drop(DROP_COLS_BOTH, 1) + metadata_new["prefer_old"] = metadata_new.long_pair.isin([]) + self.metadata = metadata_new + assert self.metadata.short_pair.value_counts().max() == 1, "Multiple metadata entries for a short pair" + self.metadata = self.metadata.set_index("short_pair") + + # wget.download(LANG_CODE_URL) + mapper = pd.read_csv(LANG_CODE_PATH) + mapper.columns = ["a3", "a2", "ref"] + self.iso_table = pd.read_csv(ISO_PATH, sep="\t").rename(columns=lambda x: x.lower()) + more_3_to_2 = self.iso_table.set_index("id").part1.dropna().to_dict() + more_3_to_2.update(mapper.set_index("a3").a2.to_dict()) + self.alpha3_to_alpha2 = more_3_to_2 + self.model_card_dir = Path(save_dir) + self.constituents = GROUP_MEMBERS + + def convert_models(self, tatoeba_ids, dry_run=False): + entries_to_convert = [x for x in self.registry if x[0] in tatoeba_ids] + converted_paths = convert_all_sentencepiece_models(entries_to_convert, dest_dir=self.model_card_dir) + + for path in converted_paths: + long_pair = remove_prefix(path.name, "opus-mt-").split("-") # eg. 
heb-eng + assert len(long_pair) == 2 + new_p_src = self.get_two_letter_code(long_pair[0]) + new_p_tgt = self.get_two_letter_code(long_pair[1]) + hf_model_id = f"opus-mt-{new_p_src}-{new_p_tgt}" + new_path = path.parent.joinpath(hf_model_id) # opus-mt-he-en + os.rename(str(path), str(new_path)) + self.write_model_card(hf_model_id, dry_run=dry_run) + + def get_two_letter_code(self, three_letter_code): + return self.alpha3_to_alpha2.get(three_letter_code, three_letter_code) + + def expand_group_to_two_letter_codes(self, grp_name): + return [self.get_two_letter_code(x) for x in self.constituents[grp_name]] + + def get_tags(self, code, ref_name): + if len(code) == 2: + assert "languages" not in ref_name, f"{code}: {ref_name}" + return [code], False + elif "languages" in ref_name or len(self.constituents.get(code, [])) > 1: + group = self.expand_group_to_two_letter_codes(code) + group.append(code) + return group, True + else: # zho-> zh + print(f"Three letter monolingual code: {code}") + return [code], False + + def resolve_lang_code(self, r) -> Tuple[List[str], str, str]: + """R is a row in ported""" + short_pair = r.short_pair + src, tgt = short_pair.split("-") + src_tags, src_multilingual = self.get_tags(src, r.src_name) + assert isinstance(src_tags, list) + tgt_tags, tgt_multilingual = self.get_tags(tgt, r.tgt_name) + assert isinstance(tgt_tags, list) + + return dedup(src_tags + tgt_tags), src_multilingual, tgt_multilingual + + def write_model_card( + self, + hf_model_id: str, + repo_root=DEFAULT_REPO, + dry_run=False, + ) -> str: + """ + Copy the most recent model's readme section from opus, and add metadata. upload command: aws s3 sync + model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun + """ + short_pair = remove_prefix(hf_model_id, "opus-mt-") + extra_metadata = self.metadata.loc[short_pair].drop("2m") + extra_metadata["short_pair"] = short_pair + lang_tags, src_multilingual, tgt_multilingual = self.resolve_lang_code(extra_metadata) + opus_name = f"{extra_metadata.src_alpha3}-{extra_metadata.tgt_alpha3}" + # opus_name: str = self.convert_hf_name_to_opus_name(hf_model_name) + + assert repo_root in ("OPUS-MT-train", "Tatoeba-Challenge") + opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md") + assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found" + + opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")] + + readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md" + + s, t = ",".join(opus_src), ",".join(opus_tgt) + + metadata = { + "hf_name": short_pair, + "source_languages": s, + "target_languages": t, + "opus_readme_url": readme_url, + "original_repo": repo_root, + "tags": ["translation"], + "languages": lang_tags, + } + lang_tags = l2front_matter(lang_tags) + metadata["src_constituents"] = self.constituents[s] + metadata["tgt_constituents"] = self.constituents[t] + metadata["src_multilingual"] = src_multilingual + metadata["tgt_multilingual"] = tgt_multilingual + + metadata.update(extra_metadata) + metadata.update(get_system_metadata(repo_root)) + + # combine with Tatoeba markdown + + extra_markdown = f"### {short_pair}\n\n* source group: {metadata['src_name']} \n* target group: {metadata['tgt_name']} \n* OPUS readme: [{opus_name}]({readme_url})\n" + + content = opus_readme_path.open().read() + content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model. 
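+        # e.g. a Tatoeba README of the (hypothetical) form "intro\n# old-pair\n...\n# heb-eng\n..." is reduced by
+        # the split above to its newest section, "heb-eng\n...".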
+ splat = content.split("*")[2:] + + content = "*".join(splat) + # BETTER FRONT MATTER LOGIC + + content = ( + FRONT_MATTER_TEMPLATE.format(lang_tags) + + extra_markdown + + "\n* " + + content.replace("download", "download original " "weights") + ) + + items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()]) + sec3 = "\n### System Info: \n" + items + content += sec3 + if dry_run: + return content, metadata + sub_dir = self.model_card_dir / hf_model_id + sub_dir.mkdir(exist_ok=True) + dest = sub_dir / "README.md" + dest.open("w").write(content) + pd.Series(metadata).to_json(sub_dir / "metadata.json") + return content, metadata + + def download_metadata(self): + Path(LANG_CODE_PATH).parent.mkdir(exist_ok=True) + import wget + + if not os.path.exists(ISO_PATH): + wget.download(ISO_URL, ISO_PATH) + if not os.path.exists(LANG_CODE_PATH): + wget.download(LANG_CODE_URL, LANG_CODE_PATH) + + @staticmethod + def make_tatoeba_registry(repo_path=DEFAULT_MODEL_DIR): + if not (Path(repo_path) / "zho-eng" / "README.md").exists(): + raise ValueError( + f"repo_path:{repo_path} does not exist: " + "You must run: git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git before calling." + ) + results = {} + for p in Path(repo_path).iterdir(): + if len(p.name) != 7: + continue + lns = list(open(p / "README.md").readlines()) + results[p.name] = _parse_readme(lns) + return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()] + + +GROUP_MEMBERS = { + # three letter code -> (group/language name, {constituents...} + # if this language is on the target side the constituents can be used as target language codes. + # if the language is on the source side they are supported natively without special codes. + "aav": ("Austro-Asiatic languages", {"hoc", "hoc_Latn", "kha", "khm", "khm_Latn", "mnw", "vie", "vie_Hani"}), + "afa": ( + "Afro-Asiatic languages", + { + "acm", + "afb", + "amh", + "apc", + "ara", + "arq", + "ary", + "arz", + "hau_Latn", + "heb", + "kab", + "mlt", + "rif_Latn", + "shy_Latn", + "som", + "thv", + "tir", + }, + ), + "afr": ("Afrikaans", {"afr"}), + "alv": ( + "Atlantic-Congo languages", + { + "ewe", + "fuc", + "fuv", + "ibo", + "kin", + "lin", + "lug", + "nya", + "run", + "sag", + "sna", + "swh", + "toi_Latn", + "tso", + "umb", + "wol", + "xho", + "yor", + "zul", + }, + ), + "ara": ("Arabic", {"afb", "apc", "apc_Latn", "ara", "ara_Latn", "arq", "arq_Latn", "arz"}), + "art": ( + "Artificial languages", + { + "afh_Latn", + "avk_Latn", + "dws_Latn", + "epo", + "ido", + "ido_Latn", + "ile_Latn", + "ina_Latn", + "jbo", + "jbo_Cyrl", + "jbo_Latn", + "ldn_Latn", + "lfn_Cyrl", + "lfn_Latn", + "nov_Latn", + "qya", + "qya_Latn", + "sjn_Latn", + "tlh_Latn", + "tzl", + "tzl_Latn", + "vol_Latn", + }, + ), + "aze": ("Azerbaijani", {"aze_Latn"}), + "bat": ("Baltic languages", {"lit", "lav", "prg_Latn", "ltg", "sgs"}), + "bel": ("Belarusian", {"bel", "bel_Latn"}), + "ben": ("Bengali", {"ben"}), + "bnt": ( + "Bantu languages", + {"kin", "lin", "lug", "nya", "run", "sna", "swh", "toi_Latn", "tso", "umb", "xho", "zul"}, + ), + "bul": ("Bulgarian", {"bul", "bul_Latn"}), + "cat": ("Catalan", {"cat"}), + "cau": ("Caucasian languages", {"abk", "kat", "che", "ady"}), + "ccs": ("South Caucasian languages", {"kat"}), + "ceb": ("Cebuano", {"ceb"}), + "cel": ("Celtic languages", {"gla", "gle", "bre", "cor", "glv", "cym"}), + "ces": ("Czech", {"ces"}), + "cpf": ("Creoles and pidgins, French‑based", {"gcf_Latn", "hat", "mfe"}), + "cpp": ( + "Creoles and pidgins, 
Portuguese-based", + {"zsm_Latn", "ind", "pap", "min", "tmw_Latn", "max_Latn", "zlm_Latn"}, + ), + "cus": ("Cushitic languages", {"som"}), + "dan": ("Danish", {"dan"}), + "deu": ("German", {"deu"}), + "dra": ("Dravidian languages", {"tam", "kan", "mal", "tel"}), + "ell": ("Modern Greek (1453-)", {"ell"}), + "eng": ("English", {"eng"}), + "epo": ("Esperanto", {"epo"}), + "est": ("Estonian", {"est"}), + "euq": ("Basque (family)", {"eus"}), + "eus": ("Basque", {"eus"}), + "fin": ("Finnish", {"fin"}), + "fiu": ( + "Finno-Ugrian languages", + { + "est", + "fin", + "fkv_Latn", + "hun", + "izh", + "kpv", + "krl", + "liv_Latn", + "mdf", + "mhr", + "myv", + "sma", + "sme", + "udm", + "vep", + "vro", + }, + ), + "fra": ("French", {"fra"}), + "gem": ( + "Germanic languages", + { + "afr", + "ang_Latn", + "dan", + "deu", + "eng", + "enm_Latn", + "fao", + "frr", + "fry", + "gos", + "got_Goth", + "gsw", + "isl", + "ksh", + "ltz", + "nds", + "nld", + "nno", + "nob", + "nob_Hebr", + "non_Latn", + "pdc", + "sco", + "stq", + "swe", + "swg", + "yid", + }, + ), + "gle": ("Irish", {"gle"}), + "glg": ("Galician", {"glg"}), + "gmq": ("North Germanic languages", {"dan", "nob", "nob_Hebr", "swe", "isl", "nno", "non_Latn", "fao"}), + "gmw": ( + "West Germanic languages", + { + "afr", + "ang_Latn", + "deu", + "eng", + "enm_Latn", + "frr", + "fry", + "gos", + "gsw", + "ksh", + "ltz", + "nds", + "nld", + "pdc", + "sco", + "stq", + "swg", + "yid", + }, + ), + "grk": ("Greek languages", {"grc_Grek", "ell"}), + "hbs": ("Serbo-Croatian", {"hrv", "srp_Cyrl", "bos_Latn", "srp_Latn"}), + "heb": ("Hebrew", {"heb"}), + "hin": ("Hindi", {"hin"}), + "hun": ("Hungarian", {"hun"}), + "hye": ("Armenian", {"hye", "hye_Latn"}), + "iir": ( + "Indo-Iranian languages", + { + "asm", + "awa", + "ben", + "bho", + "gom", + "guj", + "hif_Latn", + "hin", + "jdt_Cyrl", + "kur_Arab", + "kur_Latn", + "mai", + "mar", + "npi", + "ori", + "oss", + "pan_Guru", + "pes", + "pes_Latn", + "pes_Thaa", + "pnb", + "pus", + "rom", + "san_Deva", + "sin", + "snd_Arab", + "tgk_Cyrl", + "tly_Latn", + "urd", + "zza", + }, + ), + "ilo": ("Iloko", {"ilo"}), + "inc": ( + "Indic languages", + { + "asm", + "awa", + "ben", + "bho", + "gom", + "guj", + "hif_Latn", + "hin", + "mai", + "mar", + "npi", + "ori", + "pan_Guru", + "pnb", + "rom", + "san_Deva", + "sin", + "snd_Arab", + "urd", + }, + ), + "ine": ( + "Indo-European languages", + { + "afr", + "afr_Arab", + "aln", + "ang_Latn", + "arg", + "asm", + "ast", + "awa", + "bel", + "bel_Latn", + "ben", + "bho", + "bjn", + "bos_Latn", + "bre", + "bul", + "bul_Latn", + "cat", + "ces", + "cor", + "cos", + "csb_Latn", + "cym", + "dan", + "deu", + "dsb", + "egl", + "ell", + "eng", + "enm_Latn", + "ext", + "fao", + "fra", + "frm_Latn", + "frr", + "fry", + "gcf_Latn", + "gla", + "gle", + "glg", + "glv", + "gom", + "gos", + "got_Goth", + "grc_Grek", + "gsw", + "guj", + "hat", + "hif_Latn", + "hin", + "hrv", + "hsb", + "hye", + "hye_Latn", + "ind", + "isl", + "ita", + "jdt_Cyrl", + "ksh", + "kur_Arab", + "kur_Latn", + "lad", + "lad_Latn", + "lat_Grek", + "lat_Latn", + "lav", + "lij", + "lit", + "lld_Latn", + "lmo", + "ltg", + "ltz", + "mai", + "mar", + "max_Latn", + "mfe", + "min", + "mkd", + "mwl", + "nds", + "nld", + "nno", + "nob", + "nob_Hebr", + "non_Latn", + "npi", + "oci", + "ori", + "orv_Cyrl", + "oss", + "pan_Guru", + "pap", + "pcd", + "pdc", + "pes", + "pes_Latn", + "pes_Thaa", + "pms", + "pnb", + "pol", + "por", + "prg_Latn", + "pus", + "roh", + "rom", + "ron", + "rue", + "rus", + "rus_Latn", + "san_Deva", + "scn", + 
"sco", + "sgs", + "sin", + "slv", + "snd_Arab", + "spa", + "sqi", + "srd", + "srp_Cyrl", + "srp_Latn", + "stq", + "swe", + "swg", + "tgk_Cyrl", + "tly_Latn", + "tmw_Latn", + "ukr", + "urd", + "vec", + "wln", + "yid", + "zlm_Latn", + "zsm_Latn", + "zza", + }, + ), + "isl": ("Icelandic", {"isl"}), + "ita": ("Italian", {"ita"}), + "itc": ( + "Italic languages", + { + "arg", + "ast", + "bjn", + "cat", + "cos", + "egl", + "ext", + "fra", + "frm_Latn", + "gcf_Latn", + "glg", + "hat", + "ind", + "ita", + "lad", + "lad_Latn", + "lat_Grek", + "lat_Latn", + "lij", + "lld_Latn", + "lmo", + "max_Latn", + "mfe", + "min", + "mwl", + "oci", + "pap", + "pcd", + "pms", + "por", + "roh", + "ron", + "scn", + "spa", + "srd", + "tmw_Latn", + "vec", + "wln", + "zlm_Latn", + "zsm_Latn", + }, + ), + "jpn": ("Japanese", {"jpn", "jpn_Bopo", "jpn_Hang", "jpn_Hani", "jpn_Hira", "jpn_Kana", "jpn_Latn", "jpn_Yiii"}), + "jpx": ("Japanese (family)", {"jpn"}), + "kat": ("Georgian", {"kat"}), + "kor": ("Korean", {"kor_Hani", "kor_Hang", "kor_Latn", "kor"}), + "lav": ("Latvian", {"lav"}), + "lit": ("Lithuanian", {"lit"}), + "mkd": ("Macedonian", {"mkd"}), + "mkh": ("Mon-Khmer languages", {"vie_Hani", "mnw", "vie", "kha", "khm_Latn", "khm"}), + "msa": ("Malay (macrolanguage)", {"zsm_Latn", "ind", "max_Latn", "zlm_Latn", "min"}), + "mul": ( + "Multiple languages", + { + "abk", + "acm", + "ady", + "afb", + "afh_Latn", + "afr", + "akl_Latn", + "aln", + "amh", + "ang_Latn", + "apc", + "ara", + "arg", + "arq", + "ary", + "arz", + "asm", + "ast", + "avk_Latn", + "awa", + "aze_Latn", + "bak", + "bam_Latn", + "bel", + "bel_Latn", + "ben", + "bho", + "bod", + "bos_Latn", + "bre", + "brx", + "brx_Latn", + "bul", + "bul_Latn", + "cat", + "ceb", + "ces", + "cha", + "che", + "chr", + "chv", + "cjy_Hans", + "cjy_Hant", + "cmn", + "cmn_Hans", + "cmn_Hant", + "cor", + "cos", + "crh", + "crh_Latn", + "csb_Latn", + "cym", + "dan", + "deu", + "dsb", + "dtp", + "dws_Latn", + "egl", + "ell", + "enm_Latn", + "epo", + "est", + "eus", + "ewe", + "ext", + "fao", + "fij", + "fin", + "fkv_Latn", + "fra", + "frm_Latn", + "frr", + "fry", + "fuc", + "fuv", + "gan", + "gcf_Latn", + "gil", + "gla", + "gle", + "glg", + "glv", + "gom", + "gos", + "got_Goth", + "grc_Grek", + "grn", + "gsw", + "guj", + "hat", + "hau_Latn", + "haw", + "heb", + "hif_Latn", + "hil", + "hin", + "hnj_Latn", + "hoc", + "hoc_Latn", + "hrv", + "hsb", + "hun", + "hye", + "iba", + "ibo", + "ido", + "ido_Latn", + "ike_Latn", + "ile_Latn", + "ilo", + "ina_Latn", + "ind", + "isl", + "ita", + "izh", + "jav", + "jav_Java", + "jbo", + "jbo_Cyrl", + "jbo_Latn", + "jdt_Cyrl", + "jpn", + "kab", + "kal", + "kan", + "kat", + "kaz_Cyrl", + "kaz_Latn", + "kek_Latn", + "kha", + "khm", + "khm_Latn", + "kin", + "kir_Cyrl", + "kjh", + "kpv", + "krl", + "ksh", + "kum", + "kur_Arab", + "kur_Latn", + "lad", + "lad_Latn", + "lao", + "lat_Latn", + "lav", + "ldn_Latn", + "lfn_Cyrl", + "lfn_Latn", + "lij", + "lin", + "lit", + "liv_Latn", + "lkt", + "lld_Latn", + "lmo", + "ltg", + "ltz", + "lug", + "lzh", + "lzh_Hans", + "mad", + "mah", + "mai", + "mal", + "mar", + "max_Latn", + "mdf", + "mfe", + "mhr", + "mic", + "min", + "mkd", + "mlg", + "mlt", + "mnw", + "moh", + "mon", + "mri", + "mwl", + "mww", + "mya", + "myv", + "nan", + "nau", + "nav", + "nds", + "niu", + "nld", + "nno", + "nob", + "nob_Hebr", + "nog", + "non_Latn", + "nov_Latn", + "npi", + "nya", + "oci", + "ori", + "orv_Cyrl", + "oss", + "ota_Arab", + "ota_Latn", + "pag", + "pan_Guru", + "pap", + "pau", + "pdc", + "pes", + "pes_Latn", + "pes_Thaa", + 
"pms", + "pnb", + "pol", + "por", + "ppl_Latn", + "prg_Latn", + "pus", + "quc", + "qya", + "qya_Latn", + "rap", + "rif_Latn", + "roh", + "rom", + "ron", + "rue", + "run", + "rus", + "sag", + "sah", + "san_Deva", + "scn", + "sco", + "sgs", + "shs_Latn", + "shy_Latn", + "sin", + "sjn_Latn", + "slv", + "sma", + "sme", + "smo", + "sna", + "snd_Arab", + "som", + "spa", + "sqi", + "srp_Cyrl", + "srp_Latn", + "stq", + "sun", + "swe", + "swg", + "swh", + "tah", + "tam", + "tat", + "tat_Arab", + "tat_Latn", + "tel", + "tet", + "tgk_Cyrl", + "tha", + "tir", + "tlh_Latn", + "tly_Latn", + "tmw_Latn", + "toi_Latn", + "ton", + "tpw_Latn", + "tso", + "tuk", + "tuk_Latn", + "tur", + "tvl", + "tyv", + "tzl", + "tzl_Latn", + "udm", + "uig_Arab", + "uig_Cyrl", + "ukr", + "umb", + "urd", + "uzb_Cyrl", + "uzb_Latn", + "vec", + "vie", + "vie_Hani", + "vol_Latn", + "vro", + "war", + "wln", + "wol", + "wuu", + "xal", + "xho", + "yid", + "yor", + "yue", + "yue_Hans", + "yue_Hant", + "zho", + "zho_Hans", + "zho_Hant", + "zlm_Latn", + "zsm_Latn", + "zul", + "zza", + }, + ), + "nic": ( + "Niger-Kordofanian languages", + { + "bam_Latn", + "ewe", + "fuc", + "fuv", + "ibo", + "kin", + "lin", + "lug", + "nya", + "run", + "sag", + "sna", + "swh", + "toi_Latn", + "tso", + "umb", + "wol", + "xho", + "yor", + "zul", + }, + ), + "nld": ("Dutch", {"nld"}), + "nor": ("Norwegian", {"nob", "nno"}), + "phi": ("Philippine languages", {"ilo", "akl_Latn", "war", "hil", "pag", "ceb"}), + "pol": ("Polish", {"pol"}), + "por": ("Portuguese", {"por"}), + "pqe": ( + "Eastern Malayo-Polynesian languages", + {"fij", "gil", "haw", "mah", "mri", "nau", "niu", "rap", "smo", "tah", "ton", "tvl"}, + ), + "roa": ( + "Romance languages", + { + "arg", + "ast", + "cat", + "cos", + "egl", + "ext", + "fra", + "frm_Latn", + "gcf_Latn", + "glg", + "hat", + "ind", + "ita", + "lad", + "lad_Latn", + "lij", + "lld_Latn", + "lmo", + "max_Latn", + "mfe", + "min", + "mwl", + "oci", + "pap", + "pms", + "por", + "roh", + "ron", + "scn", + "spa", + "tmw_Latn", + "vec", + "wln", + "zlm_Latn", + "zsm_Latn", + }, + ), + "ron": ("Romanian", {"ron"}), + "run": ("Rundi", {"run"}), + "rus": ("Russian", {"rus"}), + "sal": ("Salishan languages", {"shs_Latn"}), + "sem": ("Semitic languages", {"acm", "afb", "amh", "apc", "ara", "arq", "ary", "arz", "heb", "mlt", "tir"}), + "sla": ( + "Slavic languages", + { + "bel", + "bel_Latn", + "bos_Latn", + "bul", + "bul_Latn", + "ces", + "csb_Latn", + "dsb", + "hrv", + "hsb", + "mkd", + "orv_Cyrl", + "pol", + "rue", + "rus", + "slv", + "srp_Cyrl", + "srp_Latn", + "ukr", + }, + ), + "slv": ("Slovenian", {"slv"}), + "spa": ("Spanish", {"spa"}), + "swe": ("Swedish", {"swe"}), + "taw": ("Tai", {"lao", "tha"}), + "tgl": ("Tagalog", {"tgl_Latn"}), + "tha": ("Thai", {"tha"}), + "trk": ( + "Turkic languages", + { + "aze_Latn", + "bak", + "chv", + "crh", + "crh_Latn", + "kaz_Cyrl", + "kaz_Latn", + "kir_Cyrl", + "kjh", + "kum", + "ota_Arab", + "ota_Latn", + "sah", + "tat", + "tat_Arab", + "tat_Latn", + "tuk", + "tuk_Latn", + "tur", + "tyv", + "uig_Arab", + "uig_Cyrl", + "uzb_Cyrl", + "uzb_Latn", + }, + ), + "tur": ("Turkish", {"tur"}), + "ukr": ("Ukrainian", {"ukr"}), + "urd": ("Urdu", {"urd"}), + "urj": ( + "Uralic languages", + { + "est", + "fin", + "fkv_Latn", + "hun", + "izh", + "kpv", + "krl", + "liv_Latn", + "mdf", + "mhr", + "myv", + "sma", + "sme", + "udm", + "vep", + "vro", + }, + ), + "vie": ("Vietnamese", {"vie", "vie_Hani"}), + "war": ("Waray (Philippines)", {"war"}), + "zho": ( + "Chinese", + { + "cjy_Hans", + "cjy_Hant", + "cmn", + 
"cmn_Bopo", + "cmn_Hang", + "cmn_Hani", + "cmn_Hans", + "cmn_Hant", + "cmn_Hira", + "cmn_Kana", + "cmn_Latn", + "cmn_Yiii", + "gan", + "hak_Hani", + "lzh", + "lzh_Bopo", + "lzh_Hang", + "lzh_Hani", + "lzh_Hans", + "lzh_Hira", + "lzh_Kana", + "lzh_Yiii", + "nan", + "nan_Hani", + "wuu", + "wuu_Bopo", + "wuu_Hani", + "wuu_Latn", + "yue", + "yue_Bopo", + "yue_Hang", + "yue_Hani", + "yue_Hans", + "yue_Hant", + "yue_Hira", + "yue_Kana", + "zho", + "zho_Hans", + "zho_Hant", + }, + ), + "zle": ("East Slavic languages", {"bel", "orv_Cyrl", "bel_Latn", "rus", "ukr", "rue"}), + "zls": ("South Slavic languages", {"bos_Latn", "bul", "bul_Latn", "hrv", "mkd", "slv", "srp_Cyrl", "srp_Latn"}), + "zlw": ("West Slavic languages", {"csb_Latn", "dsb", "hsb", "pol", "ces"}), +} + + +def l2front_matter(langs): + return "".join(f"- {l}\n" for l in langs) + + +def dedup(lst): + """Preservers order""" + new_lst = [] + for item in lst: + if not item: + continue + elif item in new_lst: + continue + else: + new_lst.append(item) + return new_lst + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", "--models", action="append", help=" Set flag", required=True, nargs="+", dest="models" + ) + parser.add_argument("-save_dir", "--save_dir", default="marian_converted", help="where to save converted models") + args = parser.parse_args() + resolver = TatoebaConverter(save_dir=args.save_dir) + resolver.convert_models(args.models[0]) diff --git a/src/transformers/models/old_marian/convert_marian_to_pytorch.py b/src/transformers/models/old_marian/convert_marian_to_pytorch.py new file mode 100644 index 00000000000000..a7faef942e97e3 --- /dev/null +++ b/src/transformers/models/old_marian/convert_marian_to_pytorch.py @@ -0,0 +1,632 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import os +import socket +import time +import warnings +from pathlib import Path +from typing import Dict, List, Union +from zipfile import ZipFile + +import numpy as np +import torch +from tqdm import tqdm + +from transformers import MarianConfig, MarianMTModel, MarianTokenizer +from transformers.hf_api import HfApi + + +def remove_suffix(text: str, suffix: str): + if text.endswith(suffix): + return text[: -len(suffix)] + return text # or whatever + + +def remove_prefix(text: str, prefix: str): + if text.startswith(prefix): + return text[len(prefix) :] + return text # or whatever + + +def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict): + sd = {} + for k in opus_dict: + if not k.startswith(layer_prefix): + continue + stripped = remove_prefix(k, layer_prefix) + v = opus_dict[k].T # besides embeddings, everything must be transposed. 
+ sd[converter[stripped]] = torch.tensor(v).squeeze() + return sd + + +def load_layers_(layer_lst: torch.nn.ModuleList, opus_state: dict, converter, is_decoder=False): + for i, layer in enumerate(layer_lst): + layer_tag = f"decoder_l{i + 1}_" if is_decoder else f"encoder_l{i + 1}_" + sd = convert_encoder_layer(opus_state, layer_tag, converter) + layer.load_state_dict(sd, strict=True) + + +def find_pretrained_model(src_lang: str, tgt_lang: str) -> List[str]: + """Find models that can accept src_lang as input and return tgt_lang as output.""" + prefix = "Helsinki-NLP/opus-mt-" + api = HfApi() + model_list = api.model_list() + model_ids = [x.modelId for x in model_list if x.modelId.startswith("Helsinki-NLP")] + src_and_targ = [ + remove_prefix(m, prefix).lower().split("-") for m in model_ids if "+" not in m + ] # + cant be loaded. + matching = [f"{prefix}{a}-{b}" for (a, b) in src_and_targ if src_lang in a and tgt_lang in b] + return matching + + +def add_emb_entries(wemb, final_bias, n_special_tokens=1): + vsize, d_model = wemb.shape + embs_to_add = np.zeros((n_special_tokens, d_model)) + new_embs = np.concatenate([wemb, embs_to_add]) + bias_to_add = np.zeros((n_special_tokens, 1)) + new_bias = np.concatenate((final_bias, bias_to_add), axis=1) + return new_embs, new_bias + + +def _cast_yaml_str(v): + bool_dct = {"true": True, "false": False} + if not isinstance(v, str): + return v + elif v in bool_dct: + return bool_dct[v] + try: + return int(v) + except (TypeError, ValueError): + return v + + +def cast_marian_config(raw_cfg: Dict[str, str]) -> Dict: + return {k: _cast_yaml_str(v) for k, v in raw_cfg.items()} + + +CONFIG_KEY = "special:model.yml" + + +def load_config_from_state_dict(opus_dict): + import yaml + + cfg_str = "".join([chr(x) for x in opus_dict[CONFIG_KEY]]) + yaml_cfg = yaml.load(cfg_str[:-1], Loader=yaml.BaseLoader) + return cast_marian_config(yaml_cfg) + + +def find_model_file(dest_dir): # this one better + model_files = list(Path(dest_dir).glob("*.npz")) + assert len(model_files) == 1, model_files + model_file = model_files[0] + return model_file + + +# Group Names Logic: change long opus model names to something shorter, like opus-mt-en-ROMANCE +ROM_GROUP = ( + "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT" + "+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co" + "+nap+scn+vec+sc+ro+la" +) +GROUPS = [ + ("cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "ZH"), + (ROM_GROUP, "ROMANCE"), + ("de+nl+fy+af+da+fo+is+no+nb+nn+sv", "NORTH_EU"), + ("da+fo+is+no+nb+nn+sv", "SCANDINAVIA"), + ("se+sma+smj+smn+sms", "SAMI"), + ("nb_NO+nb+nn_NO+nn+nog+no_nb+no", "NORWAY"), + ("ga+cy+br+gd+kw+gv", "CELTIC"), # https://en.wikipedia.org/wiki/Insular_Celtic_languages +] +GROUP_TO_OPUS_NAME = { + "opus-mt-ZH-de": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-de", + "opus-mt-ZH-fi": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", + "opus-mt-ZH-sv": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-sv", + "opus-mt-SCANDINAVIA-SCANDINAVIA": "da+fo+is+no+nb+nn+sv-da+fo+is+no+nb+nn+sv", + "opus-mt-NORTH_EU-NORTH_EU": "de+nl+fy+af+da+fo+is+no+nb+nn+sv-de+nl+fy+af+da+fo+is+no+nb+nn+sv", + "opus-mt-de-ZH": "de-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", + "opus-mt-en_el_es_fi-en_el_es_fi": "en+el+es+fi-en+el+es+fi", + "opus-mt-en-ROMANCE": 
"en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" + "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" + "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la", + "opus-mt-en-CELTIC": "en-ga+cy+br+gd+kw+gv", + "opus-mt-es-NORWAY": "es-nb_NO+nb+nn_NO+nn+nog+no_nb+no", + "opus-mt-fi_nb_no_nn_ru_sv_en-SAMI": "fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms", + "opus-mt-fi-ZH": "fi-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", + "opus-mt-fi-NORWAY": "fi-nb_NO+nb+nn_NO+nn+nog+no_nb+no", + "opus-mt-ROMANCE-en": "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" + "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" + "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en", + "opus-mt-CELTIC-en": "ga+cy+br+gd+kw+gv-en", + "opus-mt-sv-ZH": "sv-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", + "opus-mt-sv-NORWAY": "sv-nb_NO+nb+nn_NO+nn+nog+no_nb+no", +} +OPUS_GITHUB_URL = "https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/" +ORG_NAME = "Helsinki-NLP/" + + +def convert_opus_name_to_hf_name(x): + """For OPUS-MT-Train/ DEPRECATED""" + for substr, grp_name in GROUPS: + x = x.replace(substr, grp_name) + return x.replace("+", "_") + + +def convert_hf_name_to_opus_name(hf_model_name): + """ + Relies on the assumption that there are no language codes like pt_br in models that are not in GROUP_TO_OPUS_NAME. + """ + hf_model_name = remove_prefix(hf_model_name, ORG_NAME) + if hf_model_name in GROUP_TO_OPUS_NAME: + opus_w_prefix = GROUP_TO_OPUS_NAME[hf_model_name] + else: + opus_w_prefix = hf_model_name.replace("_", "+") + return remove_prefix(opus_w_prefix, "opus-mt-") + + +def get_system_metadata(repo_root): + import git + + return dict( + helsinki_git_sha=git.Repo(path=repo_root, search_parent_directories=True).head.object.hexsha, + transformers_git_sha=git.Repo(path=".", search_parent_directories=True).head.object.hexsha, + port_machine=socket.gethostname(), + port_time=time.strftime("%Y-%m-%d-%H:%M"), + ) + + +# docstyle-ignore +FRONT_MATTER_TEMPLATE = """--- +language: +{} +tags: +- translation + +license: apache-2.0 +--- +""" +DEFAULT_REPO = "Tatoeba-Challenge" +DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models") + + +def write_model_card( + hf_model_name: str, + repo_root=DEFAULT_REPO, + save_dir=Path("marian_converted"), + dry_run=False, + extra_metadata={}, +) -> str: + """ + Copy the most recent model's readme section from opus, and add metadata. 
upload command: aws s3 sync model_card_dir + s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun + """ + import pandas as pd + + hf_model_name = remove_prefix(hf_model_name, ORG_NAME) + opus_name: str = convert_hf_name_to_opus_name(hf_model_name) + assert repo_root in ("OPUS-MT-train", "Tatoeba-Challenge") + opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md") + assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found" + + opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")] + + readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md" + + s, t = ",".join(opus_src), ",".join(opus_tgt) + metadata = { + "hf_name": hf_model_name, + "source_languages": s, + "target_languages": t, + "opus_readme_url": readme_url, + "original_repo": repo_root, + "tags": ["translation"], + } + metadata.update(extra_metadata) + metadata.update(get_system_metadata(repo_root)) + + # combine with opus markdown + + extra_markdown = ( + f"### {hf_model_name}\n\n* source group: {metadata['src_name']} \n* target group: " + f"{metadata['tgt_name']} \n* OPUS readme: [{opus_name}]({readme_url})\n" + ) + + content = opus_readme_path.open().read() + content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model. + splat = content.split("*")[2:] + print(splat[3]) + content = "*".join(splat) + content = ( + FRONT_MATTER_TEMPLATE.format(metadata["src_alpha2"]) + + extra_markdown + + "\n* " + + content.replace("download", "download original weights") + ) + + items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()]) + sec3 = "\n### System Info: \n" + items + content += sec3 + if dry_run: + return content, metadata + sub_dir = save_dir / f"opus-mt-{hf_model_name}" + sub_dir.mkdir(exist_ok=True) + dest = sub_dir / "README.md" + dest.open("w").write(content) + pd.Series(metadata).to_json(sub_dir / "metadata.json") + + # if dry_run: + return content, metadata + + +def make_registry(repo_path="Opus-MT-train/models"): + if not (Path(repo_path) / "fr-en" / "README.md").exists(): + raise ValueError( + f"repo_path:{repo_path} does not exist: " + "You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling." + ) + results = {} + for p in Path(repo_path).iterdir(): + n_dash = p.name.count("-") + if n_dash == 0: + continue + else: + lns = list(open(p / "README.md").readlines()) + results[p.name] = _parse_readme(lns) + return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()] + + +def convert_all_sentencepiece_models(model_list=None, repo_path=None, dest_dir=Path("marian_converted")): + """Requires 300GB""" + save_dir = Path("marian_ckpt") + dest_dir = Path(dest_dir) + dest_dir.mkdir(exist_ok=True) + save_paths = [] + if model_list is None: + model_list: list = make_registry(repo_path=repo_path) + for k, prepro, download, test_set_url in tqdm(model_list): + if "SentencePiece" not in prepro: # dont convert BPE models. 
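+            # "prepro" is the pre-processing field of the OPUS model card, e.g.
+            # "normalization + SentencePiece (spm32k,spm32k)" (illustrative value); BPE-only models are skipped.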
+ continue + if not os.path.exists(save_dir / k): + download_and_unzip(download, save_dir / k) + pair_name = convert_opus_name_to_hf_name(k) + convert(save_dir / k, dest_dir / f"opus-mt-{pair_name}") + + save_paths.append(dest_dir / f"opus-mt-{pair_name}") + return save_paths + + +def lmap(f, x) -> List: + return list(map(f, x)) + + +def fetch_test_set(test_set_url): + import wget + + fname = wget.download(test_set_url, "opus_test.txt") + lns = Path(fname).open().readlines() + src = lmap(str.strip, lns[::4]) + gold = lmap(str.strip, lns[1::4]) + mar_model = lmap(str.strip, lns[2::4]) + assert ( + len(gold) == len(mar_model) == len(src) + ), f"Gold, marian and source lengths {len(gold)}, {len(mar_model)}, {len(src)} mismatched" + os.remove(fname) + return src, mar_model, gold + + +def convert_whole_dir(path=Path("marian_ckpt/")): + for subdir in tqdm(list(path.ls())): + dest_dir = f"marian_converted/{subdir.name}" + if (dest_dir / "pytorch_model.bin").exists(): + continue + convert(source_dir, dest_dir) + + +def _parse_readme(lns): + """Get link and metadata from opus model card equivalent.""" + subres = {} + for ln in [x.strip() for x in lns]: + if not ln.startswith("*"): + continue + ln = ln[1:].strip() + + for k in ["download", "dataset", "models", "model", "pre-processing"]: + if ln.startswith(k): + break + else: + continue + if k in ["dataset", "model", "pre-processing"]: + splat = ln.split(":") + _, v = splat + subres[k] = v + elif k == "download": + v = ln.split("(")[-1][:-1] + subres[k] = v + return subres + + +def save_tokenizer_config(dest_dir: Path): + dname = dest_dir.name.split("-") + dct = dict(target_lang=dname[-1], source_lang="-".join(dname[:-1])) + save_json(dct, dest_dir / "tokenizer_config.json") + + +def add_to_vocab_(vocab: Dict[str, int], special_tokens: List[str]): + start = max(vocab.values()) + 1 + added = 0 + for tok in special_tokens: + if tok in vocab: + continue + vocab[tok] = start + added + added += 1 + return added + + +def find_vocab_file(model_dir): + return list(model_dir.glob("*vocab.yml"))[0] + + +def add_special_tokens_to_vocab(model_dir: Path) -> None: + vocab = load_yaml(find_vocab_file(model_dir)) + vocab = {k: int(v) for k, v in vocab.items()} + num_added = add_to_vocab_(vocab, [""]) + print(f"added {num_added} tokens to vocab") + save_json(vocab, model_dir / "vocab.json") + save_tokenizer_config(model_dir) + + +def check_equal(marian_cfg, k1, k2): + v1, v2 = marian_cfg[k1], marian_cfg[k2] + assert v1 == v2, f"hparams {k1},{k2} differ: {v1} != {v2}" + + +def check_marian_cfg_assumptions(marian_cfg): + assumed_settings = { + "tied-embeddings-all": True, + "layer-normalization": False, + "right-left": False, + "transformer-ffn-depth": 2, + "transformer-aan-depth": 2, + "transformer-no-projection": False, + "transformer-postprocess-emb": "d", + "transformer-postprocess": "dan", # Dropout, add, normalize + "transformer-preprocess": "", + "type": "transformer", + "ulr-dim-emb": 0, + "dec-cell-base-depth": 2, + "dec-cell-high-depth": 1, + "transformer-aan-nogate": False, + } + for k, v in assumed_settings.items(): + actual = marian_cfg[k] + assert actual == v, f"Unexpected config value for {k} expected {v} got {actual}" + check_equal(marian_cfg, "transformer-ffn-activation", "transformer-aan-activation") + check_equal(marian_cfg, "transformer-ffn-depth", "transformer-aan-depth") + check_equal(marian_cfg, "transformer-dim-ffn", "transformer-dim-aan") + + +BIAS_KEY = "decoder_ff_logit_out_b" +BART_CONVERTER = { # for each encoder and decoder layer + 
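+    # Maps Marian/OPUS parameter name suffixes (left) to HF submodule parameter names (right).
+    # The "self_*" and "ffn_*" entries apply to both encoder and decoder layers; the "context_*"
+    # entries below only occur in decoder layers (cross-attention).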
"self_Wq": "self_attn.q_proj.weight", + "self_Wk": "self_attn.k_proj.weight", + "self_Wv": "self_attn.v_proj.weight", + "self_Wo": "self_attn.out_proj.weight", + "self_bq": "self_attn.q_proj.bias", + "self_bk": "self_attn.k_proj.bias", + "self_bv": "self_attn.v_proj.bias", + "self_bo": "self_attn.out_proj.bias", + "self_Wo_ln_scale": "self_attn_layer_norm.weight", + "self_Wo_ln_bias": "self_attn_layer_norm.bias", + "ffn_W1": "fc1.weight", + "ffn_b1": "fc1.bias", + "ffn_W2": "fc2.weight", + "ffn_b2": "fc2.bias", + "ffn_ffn_ln_scale": "final_layer_norm.weight", + "ffn_ffn_ln_bias": "final_layer_norm.bias", + # Decoder Cross Attention + "context_Wk": "encoder_attn.k_proj.weight", + "context_Wo": "encoder_attn.out_proj.weight", + "context_Wq": "encoder_attn.q_proj.weight", + "context_Wv": "encoder_attn.v_proj.weight", + "context_bk": "encoder_attn.k_proj.bias", + "context_bo": "encoder_attn.out_proj.bias", + "context_bq": "encoder_attn.q_proj.bias", + "context_bv": "encoder_attn.v_proj.bias", + "context_Wo_ln_scale": "encoder_attn_layer_norm.weight", + "context_Wo_ln_bias": "encoder_attn_layer_norm.bias", +} + + +class OpusState: + def __init__(self, source_dir): + npz_path = find_model_file(source_dir) + self.state_dict = np.load(npz_path) + cfg = load_config_from_state_dict(self.state_dict) + assert cfg["dim-vocabs"][0] == cfg["dim-vocabs"][1] + assert "Wpos" not in self.state_dict, "Wpos key in state dictionary" + self.state_dict = dict(self.state_dict) + self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1) + self.pad_token_id = self.wemb.shape[0] - 1 + cfg["vocab_size"] = self.pad_token_id + 1 + # self.state_dict['Wemb'].sha + self.state_keys = list(self.state_dict.keys()) + assert "Wtype" not in self.state_dict, "Wtype key in state dictionary" + self._check_layer_entries() + self.source_dir = source_dir + self.cfg = cfg + hidden_size, intermediate_shape = self.state_dict["encoder_l1_ffn_W1"].shape + assert ( + hidden_size == cfg["dim-emb"] == 512 + ), f"Hidden size {hidden_size} and configured size {cfg['dim_emb']} mismatched or not 512" + + # Process decoder.yml + decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml")) + check_marian_cfg_assumptions(cfg) + self.hf_config = MarianConfig( + vocab_size=cfg["vocab_size"], + decoder_layers=cfg["dec-depth"], + encoder_layers=cfg["enc-depth"], + decoder_attention_heads=cfg["transformer-heads"], + encoder_attention_heads=cfg["transformer-heads"], + decoder_ffn_dim=cfg["transformer-dim-ffn"], + encoder_ffn_dim=cfg["transformer-dim-ffn"], + d_model=cfg["dim-emb"], + activation_function=cfg["transformer-aan-activation"], + pad_token_id=self.pad_token_id, + eos_token_id=0, + bos_token_id=0, + max_position_embeddings=cfg["dim-emb"], + scale_embedding=True, + normalize_embedding="n" in cfg["transformer-preprocess"], + static_position_embeddings=not cfg["transformer-train-position-embeddings"], + dropout=0.1, # see opus-mt-train repo/transformer-dropout param. 
+ # default: add_final_layer_norm=False, + num_beams=decoder_yml["beam-size"], + decoder_start_token_id=self.pad_token_id, + bad_words_ids=[[self.pad_token_id]], + max_length=512, + ) + + def _check_layer_entries(self): + self.encoder_l1 = self.sub_keys("encoder_l1") + self.decoder_l1 = self.sub_keys("decoder_l1") + self.decoder_l2 = self.sub_keys("decoder_l2") + if len(self.encoder_l1) != 16: + warnings.warn(f"Expected 16 keys for each encoder layer, got {len(self.encoder_l1)}") + if len(self.decoder_l1) != 26: + warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") + if len(self.decoder_l2) != 26: + warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") + + @property + def extra_keys(self): + extra = [] + for k in self.state_keys: + if ( + k.startswith("encoder_l") + or k.startswith("decoder_l") + or k in [CONFIG_KEY, "Wemb", "Wpos", "decoder_ff_logit_out_b"] + ): + continue + else: + extra.append(k) + return extra + + def sub_keys(self, layer_prefix): + return [remove_prefix(k, layer_prefix) for k in self.state_dict if k.startswith(layer_prefix)] + + def load_marian_model(self) -> MarianMTModel: + state_dict, cfg = self.state_dict, self.hf_config + + assert cfg.static_position_embeddings, "config.static_position_embeddings should be True" + model = MarianMTModel(cfg) + + assert "hidden_size" not in cfg.to_dict() + load_layers_( + model.model.encoder.layers, + state_dict, + BART_CONVERTER, + ) + load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True) + + # handle tensors not associated with layers + wemb_tensor = torch.nn.Parameter(torch.FloatTensor(self.wemb)) + bias_tensor = torch.nn.Parameter(torch.FloatTensor(self.final_bias)) + model.model.shared.weight = wemb_tensor + model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared + + model.final_logits_bias = bias_tensor + + if "Wpos" in state_dict: + print("Unexpected: got Wpos") + wpos_tensor = torch.tensor(state_dict["Wpos"]) + model.model.encoder.embed_positions.weight = wpos_tensor + model.model.decoder.embed_positions.weight = wpos_tensor + + if cfg.normalize_embedding: + assert "encoder_emb_ln_scale_pre" in state_dict + raise NotImplementedError("Need to convert layernorm_embedding") + + assert not self.extra_keys, f"Failed to convert {self.extra_keys}" + assert ( + model.model.shared.padding_idx == self.pad_token_id + ), f"Padding tokens {model.model.shared.padding_idx} and {self.pad_token_id} mismatched" + return model + + +def download_and_unzip(url, dest_dir): + try: + import wget + except ImportError: + raise ImportError("you must pip install wget") + + filename = wget.download(url) + unzip(filename, dest_dir) + os.remove(filename) + + +def convert(source_dir: Path, dest_dir): + dest_dir = Path(dest_dir) + dest_dir.mkdir(exist_ok=True) + + add_special_tokens_to_vocab(source_dir) + tokenizer = MarianTokenizer.from_pretrained(str(source_dir)) + tokenizer.save_pretrained(dest_dir) + + opus_state = OpusState(source_dir) + assert opus_state.cfg["vocab_size"] == len( + tokenizer.encoder + ), f"Original vocab size {opus_state.cfg['vocab_size']} and new vocab size {len(tokenizer.encoder)} mismatched" + # save_json(opus_state.cfg, dest_dir / "marian_original_config.json") + # ^^ Uncomment to save human readable marian config for debugging + + model = opus_state.load_marian_model() + model = model.half() + model.save_pretrained(dest_dir) + model.from_pretrained(dest_dir) # sanity check + + +def load_yaml(path): + 
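+    """Load a Marian YAML file (e.g. vocab.yml or decoder.yml); BaseLoader keeps every scalar as a string."""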
import yaml + + with open(path) as f: + return yaml.load(f, Loader=yaml.BaseLoader) + + +def save_json(content: Union[Dict, List], path: str) -> None: + with open(path, "w") as f: + json.dump(content, f) + + +def unzip(zip_path: str, dest_dir: str) -> None: + with ZipFile(zip_path, "r") as zipObj: + zipObj.extractall(dest_dir) + + +if __name__ == "__main__": + """ + Tatoeba conversion instructions in scripts/tatoeba/README.md + """ + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument("--src", type=str, help="path to marian model sub dir", default="en-de") + parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.") + args = parser.parse_args() + + source_dir = Path(args.src) + assert source_dir.exists(), f"Source directory {source_dir} not found" + dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest + convert(source_dir, dest_dir) diff --git a/src/transformers/models/old_marian/modeling_marian.py b/src/transformers/models/old_marian/modeling_marian.py new file mode 100644 index 00000000000000..25d3dc1ea969c5 --- /dev/null +++ b/src/transformers/models/old_marian/modeling_marian.py @@ -0,0 +1,63 @@ +# coding=utf-8 +# Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch MarianMTModel model, ported from the Marian C++ repo.""" + + +from ..bart.modeling_bart import BartForConditionalGeneration +from .configuration_marian import MarianConfig + + +# See all Marian models at https://huggingface.co/models?search=Helsinki-NLP + + +class MarianMTModel(BartForConditionalGeneration): + r""" + Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Available + models are listed `here `__. + + This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the + appropriate documentation alongside usage examples. + + Examples:: + + >>> from transformers import MarianTokenizer, MarianMTModel + >>> from typing import List + >>> src = 'fr' # source language + >>> trg = 'en' # target language + >>> sample_text = "où est l'arrêt de bus ?" + >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' + + >>> model = MarianMTModel.from_pretrained(mname) + >>> tok = MarianTokenizer.from_pretrained(mname) + >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text], return_tensors="pt") # don't need tgt_text for inference + >>> gen = model.generate(**batch) # for forward pass: model(**batch) + >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?" 
+ + """ + config_class = MarianConfig + _keys_to_ignore_on_load_missing = [ + "model.encoder.embed_positions.weight", + "model.decoder.embed_positions.weight", + ] + _keys_to_ignore_on_save = [ + "model.encoder.embed_positions.weight", + "model.decoder.embed_positions.weight", + ] + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + logits[:, self.config.pad_token_id] = float("-inf") # never predict pad token. + if cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_id_to_be_generated(logits, self.config.eos_token_id) + return logits diff --git a/src/transformers/models/old_marian/modeling_tf_marian.py b/src/transformers/models/old_marian/modeling_tf_marian.py new file mode 100644 index 00000000000000..f17182306eee18 --- /dev/null +++ b/src/transformers/models/old_marian/modeling_tf_marian.py @@ -0,0 +1,52 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF Marian model, ported from the fairseq repo.""" + +from ...file_utils import add_start_docstrings, is_tf_available +from ...utils import logging +from ..bart.modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration +from .configuration_marian import MarianConfig + + +if is_tf_available(): + import tensorflow as tf + + +_CONFIG_FOR_DOC = "MarianConfig" + +START_DOCSTRING = BART_START_DOCSTRING.replace( + "inherits from :class:`~transformers.TFPreTrainedModel`", + "inherits from :class:`~transformers.TFBartForConditionalGeneration`", +).replace("BartConfig", _CONFIG_FOR_DOC) + + +logger = logging.get_logger(__name__) + + +@add_start_docstrings("Marian model for machine translation", START_DOCSTRING) +class TFMarianMTModel(TFBartForConditionalGeneration): + _keys_to_ignore_on_load_missing = [ + r"model.encoder.embed_positions.weight", + r"model.decoder.embed_positions.weight", + ] + config_class = MarianConfig + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + """Never predict pad_token_id. Predict when max_length is reached.""" + vocab_range = tf.constant(range(self.config.vocab_size)) + logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits) + if cur_len == max_length - 1: + logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) + return logits diff --git a/src/transformers/models/old_marian/tokenization_marian.py b/src/transformers/models/old_marian/tokenization_marian.py new file mode 100644 index 00000000000000..3b4ede693f0baa --- /dev/null +++ b/src/transformers/models/old_marian/tokenization_marian.py @@ -0,0 +1,291 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import re +import warnings +from pathlib import Path +from shutil import copyfile +from typing import Dict, List, Optional, Tuple, Union + +import sentencepiece + +from ...file_utils import add_start_docstrings +from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer +from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING + + +vocab_files_names = { + "source_spm": "source.spm", + "target_spm": "target.spm", + "vocab": "vocab.json", + "tokenizer_config_file": "tokenizer_config.json", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"}, + "target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"}, + "vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"}, + "tokenizer_config_file": { + "Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json" + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"Helsinki-NLP/opus-mt-en-de": 512} +PRETRAINED_INIT_CONFIGURATION = {} + +# Example URL https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.json + + +class MarianTokenizer(PreTrainedTokenizer): + r""" + Construct a Marian tokenizer. Based on `SentencePiece `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + source_spm (:obj:`str`): + `SentencePiece `__ file (generally has a .spm extension) that + contains the vocabulary for the source language. + target_spm (:obj:`str`): + `SentencePiece `__ file (generally has a .spm extension) that + contains the vocabulary for the target language. + source_lang (:obj:`str`, `optional`): + A string representing the source language. + target_lang (:obj:`str`, `optional`): + A string representing the target language. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + model_max_length (:obj:`int`, `optional`, defaults to 512): + The maximum sentence length the model accepts. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): + Additional special tokens used by the tokenizer. 
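+
+    Note: multilingual checkpoints expect a target-language token such as ``>>fra<<`` at the start of the source
+    text; these codes are part of the vocabulary (see ``supported_language_codes``).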
+ + Examples:: + + >>> from transformers import MarianTokenizer + >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') + >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."] + >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional + >>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt") + >>> # keys [input_ids, attention_mask, labels]. + >>> # model(**batch) should work + """ + + vocab_files_names = vocab_files_names + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + language_code_re = re.compile(">>.+<<") # type: re.Pattern + + def __init__( + self, + vocab, + source_spm, + target_spm, + source_lang=None, + target_lang=None, + unk_token="", + eos_token="", + pad_token="", + model_max_length=512, + **kwargs + ): + super().__init__( + # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id + source_lang=source_lang, + target_lang=target_lang, + unk_token=unk_token, + eos_token=eos_token, + pad_token=pad_token, + model_max_length=model_max_length, + **kwargs, + ) + assert Path(source_spm).exists(), f"cannot find spm source {source_spm}" + self.encoder = load_json(vocab) + if self.unk_token not in self.encoder: + raise KeyError(" token must be in vocab") + assert self.pad_token in self.encoder + self.decoder = {v: k for k, v in self.encoder.items()} + + self.source_lang = source_lang + self.target_lang = target_lang + self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")] + self.spm_files = [source_spm, target_spm] + + # load SentencePiece model for pre-processing + self.spm_source = load_spm(source_spm) + self.spm_target = load_spm(target_spm) + self.current_spm = self.spm_source + + # Multilingual target side: default to using first supported language code. + + self._setup_normalizer() + + def _setup_normalizer(self): + try: + from sacremoses import MosesPunctNormalizer + + self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize + except (ImportError, FileNotFoundError): + warnings.warn("Recommended: pip install sacremoses.") + self.punc_normalizer = lambda x: x + + def normalize(self, x: str) -> str: + """Cover moses empty string edge case. 
They return empty list for '' input!""" + return self.punc_normalizer(x) if x else "" + + def _convert_token_to_id(self, token): + return self.encoder.get(token, self.encoder[self.unk_token]) + + def remove_language_code(self, text: str): + """Remove language codes like <> before sentencepiece""" + match = self.language_code_re.match(text) + code: list = [match.group(0)] if match else [] + return code, self.language_code_re.sub("", text) + + def _tokenize(self, text: str) -> List[str]: + code, text = self.remove_language_code(text) + pieces = self.current_spm.EncodeAsPieces(text) + return code + pieces + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the encoder.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """Uses target language sentencepiece model""" + return self.spm_target.DecodePieces(tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: + """Build model inputs from a sequence by appending eos_token_id.""" + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + # We don't expect to process pairs, but leave the pair logic for API consistency + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + return_tensors: Optional[str] = None, + truncation=True, + padding="longest", + **unused, + ) -> BatchEncoding: + if "" in src_texts: + raise ValueError(f"found empty string in src_texts: {src_texts}") + self.current_spm = self.spm_source + src_texts = [self.normalize(t) for t in src_texts] # this does not appear to do much + tokenizer_kwargs = dict( + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + truncation=truncation, + padding=padding, + ) + model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) + + if tgt_texts is None: + return model_inputs + if max_target_length is not None: + tokenizer_kwargs["max_length"] = max_target_length + + self.current_spm = self.spm_target + model_inputs["labels"] = self(tgt_texts, **tokenizer_kwargs)["input_ids"] + self.current_spm = self.spm_source + return model_inputs + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + save_dir = Path(save_directory) + assert save_dir.is_dir(), f"{save_directory} should be a directory" + save_json( + self.encoder, + save_dir / ((filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab"]), + ) + + for orig, f in zip(["source.spm", "target.spm"], self.spm_files): + dest_path = save_dir / ((filename_prefix + "-" if filename_prefix else "") + Path(f).name) + if not dest_path.exists(): + copyfile(f, save_dir / orig) + + return tuple( + save_dir / ((filename_prefix + "-" if filename_prefix else "") + f) for f in self.vocab_files_names + ) + + def get_vocab(self) -> Dict: + vocab = self.encoder.copy() + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self) -> Dict: + state = self.__dict__.copy() + state.update({k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer"]}) + return state + + def __setstate__(self, d: Dict) -> None: + self.__dict__ = d + 
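+        # __getstate__ set the SentencePiece processors and the punctuation normalizer to None (they do not
+        # pickle cleanly), so rebuild them from the stored file paths.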
self.spm_source, self.spm_target = (load_spm(f) for f in self.spm_files) + self.current_spm = self.spm_source + self._setup_normalizer() + + def num_special_tokens_to_add(self, **unused): + """Just EOS""" + return 1 + + def _special_token_mask(self, seq): + all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp + all_special_ids.remove(self.unk_token_id) # is only sometimes special + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [1] + else: + return self._special_token_mask(token_ids_0 + token_ids_1) + [1] + + +def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: + spm = sentencepiece.SentencePieceProcessor() + spm.Load(path) + return spm + + +def save_json(data, path: str) -> None: + with open(path, "w") as f: + json.dump(data, f, indent=2) + + +def load_json(path: str) -> Union[Dict, List]: + with open(path, "r") as f: + return json.load(f) diff --git a/tests/test_modeling_marian.py b/tests/test_modeling_marian.py index 3fc3338fec6f49..e877817394dac6 100644 --- a/tests/test_modeling_marian.py +++ b/tests/test_modeling_marian.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 HuggingFace Inc. team. +# Copyright The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,58 +12,271 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch Marian model. 
""" +import tempfile import unittest -from transformers import AutoConfig, AutoTokenizer, MarianConfig, MarianTokenizer, is_torch_available +import timeout_decorator # noqa + +from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.hf_api import HfApi from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device -from .test_modeling_common import ModelTesterMixin +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor if is_torch_available(): import torch - from transformers import AutoModelWithLMHead, MarianMTModel - from transformers.models.bart.modeling_bart import shift_tokens_right + from transformers import ( + AutoConfig, + AutoModelWithLMHead, + AutoTokenizer, + MarianConfig, + MarianModel, + MarianMTModel, + MarianTokenizer, + TranslationPipeline, + ) from transformers.models.marian.convert_marian_to_pytorch import ( ORG_NAME, convert_hf_name_to_opus_name, convert_opus_name_to_hf_name, ) - from transformers.pipelines import TranslationPipeline - - -class ModelTester: - def __init__(self, parent): - self.config = MarianConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, + from transformers.models.marian.modeling_marian import MarianDecoder, MarianEncoder, shift_tokens_right + + +def prepare_marian_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + } + + +@require_torch +class MarianModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + decoder_start_token_id=3, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.decoder_start_token_id = decoder_start_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + 
decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = MarianConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, ) + inputs_dict = prepare_marian_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict def prepare_config_and_inputs_for_common(self): - return self.config, {} + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = MarianModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] -@require_torch -class SelectiveCommonTest(unittest.TestCase): - all_model_classes = (MarianMTModel,) if is_torch_available() else () + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = MarianModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) - test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = MarianEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + 
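            # As with the encoder above, the decoder is saved and reloaded on its own, then fed the
            # encoder's hidden states, to check that the standalone module reproduces the full model's
            # decoder outputs within tolerance.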
decoder.save_pretrained(tmpdirname) + decoder = MarianDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (MarianModel, MarianMTModel) if is_torch_available() else () + all_generative_model_classes = (MarianMTModel,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False def setUp(self): - self.model_tester = ModelTester(self) + self.model_tester = MarianModelTester(self) + self.config_tester = ConfigTester(self, config_class=MarianConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = MarianMTModel(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." 
+ else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) class ModelManagementTests(unittest.TestCase): @@ -163,7 +376,9 @@ def test_forward(self): "labels", } self.assertSetEqual(desired_keys, set(model_inputs.keys())) - model_inputs["decoder_input_ids"] = shift_tokens_right(model_inputs.labels, self.tokenizer.pad_token_id) + model_inputs["decoder_input_ids"] = shift_tokens_right( + model_inputs.labels, self.tokenizer.pad_token_id, self.model.config.decoder_start_token_id + ) model_inputs["return_dict"] = True model_inputs["use_cache"] = False with torch.no_grad(): diff --git a/tests/test_modeling_old_marian.py b/tests/test_modeling_old_marian.py new file mode 100644 index 00000000000000..3fc3338fec6f49 --- /dev/null +++ b/tests/test_modeling_old_marian.py @@ -0,0 +1,334 @@ +# coding=utf-8 +# Copyright 2020 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import AutoConfig, AutoTokenizer, MarianConfig, MarianTokenizer, is_torch_available +from transformers.file_utils import cached_property +from transformers.hf_api import HfApi +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_modeling_common import ModelTesterMixin + + +if is_torch_available(): + import torch + + from transformers import AutoModelWithLMHead, MarianMTModel + from transformers.models.bart.modeling_bart import shift_tokens_right + from transformers.models.marian.convert_marian_to_pytorch import ( + ORG_NAME, + convert_hf_name_to_opus_name, + convert_opus_name_to_hf_name, + ) + from transformers.pipelines import TranslationPipeline + + +class ModelTester: + def __init__(self, parent): + self.config = MarianConfig( + vocab_size=99, + d_model=24, + encoder_layers=2, + decoder_layers=2, + encoder_attention_heads=2, + decoder_attention_heads=2, + encoder_ffn_dim=32, + decoder_ffn_dim=32, + max_position_embeddings=48, + add_final_layer_norm=True, + ) + + def prepare_config_and_inputs_for_common(self): + return self.config, {} + + +@require_torch +class SelectiveCommonTest(unittest.TestCase): + all_model_classes = (MarianMTModel,) if is_torch_available() else () + + test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save + + def setUp(self): + self.model_tester = ModelTester(self) + + +class ModelManagementTests(unittest.TestCase): + @slow + @require_torch + def test_model_names(self): + model_list = HfApi().model_list() + model_ids = [x.modelId for x in model_list if x.modelId.startswith(ORG_NAME)] + bad_model_ids = [mid for mid in model_ids if "+" in model_ids] + self.assertListEqual([], bad_model_ids) + self.assertGreater(len(model_ids), 500) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MarianIntegrationTest(unittest.TestCase): + src = "en" + tgt = "de" + src_text = [ + "I am a 
small frog.", + "Now I can forget the 100 words of german that I know.", + "Tom asked his teacher for advice.", + "That's how I would do it.", + "Tom really admired Mary's courage.", + "Turn around and close your eyes.", + ] + expected_text = [ + "Ich bin ein kleiner Frosch.", + "Jetzt kann ich die 100 Wörter des Deutschen vergessen, die ich kenne.", + "Tom bat seinen Lehrer um Rat.", + "So würde ich das machen.", + "Tom bewunderte Marias Mut wirklich.", + "Drehen Sie sich um und schließen Sie die Augen.", + ] + # ^^ actual C++ output differs slightly: (1) des Deutschen removed, (2) ""-> "O", (3) tun -> machen + + @classmethod + def setUpClass(cls) -> None: + cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}" + return cls + + @cached_property + def tokenizer(self) -> MarianTokenizer: + return AutoTokenizer.from_pretrained(self.model_name) + + @property + def eos_token_id(self) -> int: + return self.tokenizer.eos_token_id + + @cached_property + def model(self): + model: MarianMTModel = AutoModelWithLMHead.from_pretrained(self.model_name).to(torch_device) + c = model.config + self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]]) + self.assertEqual(c.max_length, 512) + self.assertEqual(c.decoder_start_token_id, c.pad_token_id) + + if torch_device == "cuda": + return model.half() + else: + return model + + def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs): + generated_words = self.translate_src_text(**tokenizer_kwargs) + self.assertListEqual(self.expected_text, generated_words) + + def translate_src_text(self, **tokenizer_kwargs): + model_inputs = self.tokenizer.prepare_seq2seq_batch( + src_texts=self.src_text, return_tensors="pt", **tokenizer_kwargs + ).to(torch_device) + self.assertEqual(self.model.device, model_inputs.input_ids.device) + generated_ids = self.model.generate( + model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128 + ) + generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + return generated_words + + +@require_sentencepiece +@require_tokenizers +class TestMarian_EN_DE_More(MarianIntegrationTest): + @slow + def test_forward(self): + src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."] + expected_ids = [38, 121, 14, 697, 38848, 0] + + model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt, return_tensors="pt").to( + torch_device + ) + + self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist()) + + desired_keys = { + "input_ids", + "attention_mask", + "labels", + } + self.assertSetEqual(desired_keys, set(model_inputs.keys())) + model_inputs["decoder_input_ids"] = shift_tokens_right(model_inputs.labels, self.tokenizer.pad_token_id) + model_inputs["return_dict"] = True + model_inputs["use_cache"] = False + with torch.no_grad(): + outputs = self.model(**model_inputs) + max_indices = outputs.logits.argmax(-1) + self.tokenizer.batch_decode(max_indices) + + def test_unk_support(self): + t = self.tokenizer + ids = t.prepare_seq2seq_batch(["||"], return_tensors="pt").to(torch_device).input_ids[0].tolist() + expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id] + self.assertEqual(expected, ids) + + def test_pad_not_split(self): + input_ids_w_pad = ( + self.tokenizer.prepare_seq2seq_batch(["I am a small frog "], return_tensors="pt") + .input_ids[0] + .tolist() + ) + expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0] # pad + self.assertListEqual(expected_w_pad, input_ids_w_pad) + + @slow + def 
test_batch_generation_en_de(self): + self._assert_generated_batch_equal_expected() + + def test_auto_config(self): + config = AutoConfig.from_pretrained(self.model_name) + self.assertIsInstance(config, MarianConfig) + + +@require_sentencepiece +@require_tokenizers +class TestMarian_EN_FR(MarianIntegrationTest): + src = "en" + tgt = "fr" + src_text = [ + "I am a small frog.", + "Now I can forget the 100 words of german that I know.", + ] + expected_text = [ + "Je suis une petite grenouille.", + "Maintenant, je peux oublier les 100 mots d'allemand que je connais.", + ] + + @slow + def test_batch_generation_en_fr(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_FR_EN(MarianIntegrationTest): + src = "fr" + tgt = "en" + src_text = [ + "Donnez moi le micro.", + "Tom et Mary étaient assis à une table.", # Accents + ] + expected_text = [ + "Give me the microphone.", + "Tom and Mary were sitting at a table.", + ] + + @slow + def test_batch_generation_fr_en(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_RU_FR(MarianIntegrationTest): + src = "ru" + tgt = "fr" + src_text = ["Он показал мне рукопись своей новой пьесы."] + expected_text = ["Il m'a montré le manuscrit de sa nouvelle pièce."] + + @slow + def test_batch_generation_ru_fr(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_MT_EN(MarianIntegrationTest): + """Cover low resource/high perplexity setting. This breaks without adjust_logits_generation overwritten""" + + src = "mt" + tgt = "en" + src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."] + expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."] + + @slow + def test_batch_generation_mt_en(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_en_zh(MarianIntegrationTest): + src = "en" + tgt = "zh" + src_text = ["My name is Wolfgang and I live in Berlin"] + expected_text = ["我叫沃尔夫冈 我住在柏林"] + + @slow + def test_batch_generation_eng_zho(self): + self._assert_generated_batch_equal_expected() + + +@require_sentencepiece +@require_tokenizers +class TestMarian_en_ROMANCE(MarianIntegrationTest): + """Multilingual on target side.""" + + src = "en" + tgt = "ROMANCE" + src_text = [ + ">>fr<< Don't spend so much time watching TV.", + ">>pt<< Your message has been sent.", + ">>es<< He's two years older than me.", + ] + expected_text = [ + "Ne passez pas autant de temps à regarder la télé.", + "A sua mensagem foi enviada.", + "Es dos años más viejo que yo.", + ] + + @slow + def test_batch_generation_en_ROMANCE_multi(self): + self._assert_generated_batch_equal_expected() + + def test_tokenizer_handles_empty(self): + normalized = self.tokenizer.normalize("") + self.assertIsInstance(normalized, str) + with self.assertRaises(ValueError): + self.tokenizer.prepare_seq2seq_batch([""], return_tensors="pt") + + @slow + def test_pipeline(self): + device = 0 if torch_device == "cuda" else -1 + pipeline = TranslationPipeline(self.model, self.tokenizer, framework="pt", device=device) + output = pipeline(self.src_text) + self.assertEqual(self.expected_text, [x["translation_text"] for x in output]) + + +@require_torch +class TestConversionUtils(unittest.TestCase): + def test_renaming_multilingual(self): + old_names = [ + 
"opus-mt-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", + "opus-mt-cmn+cn-fi", # no group + "opus-mt-en-de", # standard name + "opus-mt-en-de", # standard name + ] + expected = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"] + self.assertListEqual(expected, [convert_opus_name_to_hf_name(x) for x in old_names]) + + def test_undoing_renaming(self): + hf_names = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"] + converted_opus_names = [convert_hf_name_to_opus_name(x) for x in hf_names] + expected_opus_names = [ + "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", + "cmn+cn-fi", + "en-de", # standard name + "en-de", + ] + self.assertListEqual(expected_opus_names, converted_opus_names) From fcdbcd9e3c3e4bc14c4c7cecd876968177b0bd43 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 18:54:59 +0000 Subject: [PATCH 12/51] finish blenderbot --- .../models/blenderbot/__init__.py | 2 + .../blenderbot/configuration_blenderbot.py | 200 ++- .../models/blenderbot/modeling_blenderbot.py | 1200 ++++++++++++++++- .../models/old_blenderbot/__init__.py | 32 + .../configuration_blenderbot.py | 181 +++ ..._original_pytorch_checkpoint_to_pytorch.py | 114 ++ .../old_blenderbot/modeling_blenderbot.py | 69 + .../old_blenderbot/modeling_tf_blenderbot.py | 46 + .../old_blenderbot/tokenization_blenderbot.py | 269 ++++ tests/test_modeling_blenderbot.py | 320 ++++- tests/test_modeling_old_blenderbot.py | 216 +++ 11 files changed, 2445 insertions(+), 204 deletions(-) mode change 100644 => 100755 src/transformers/models/blenderbot/modeling_blenderbot.py create mode 100644 src/transformers/models/old_blenderbot/__init__.py create mode 100644 src/transformers/models/old_blenderbot/configuration_blenderbot.py create mode 100644 src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/old_blenderbot/modeling_blenderbot.py create mode 100644 src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py create mode 100644 src/transformers/models/old_blenderbot/tokenization_blenderbot.py create mode 100644 tests/test_modeling_old_blenderbot.py diff --git a/src/transformers/models/blenderbot/__init__.py b/src/transformers/models/blenderbot/__init__.py index fccb38f80ac145..40c5c34f350d85 100644 --- a/src/transformers/models/blenderbot/__init__.py +++ b/src/transformers/models/blenderbot/__init__.py @@ -26,7 +26,9 @@ BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, BlenderbotForConditionalGeneration, BlenderbotModel, + BlenderbotPreTrainedModel, ) + if is_tf_available(): from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index b273ebb6ae01ac..52304b6ea4f7e5 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -1,8 +1,7 @@ -#!/usr/bin/env python3 # coding=utf-8 -# Copyright (c) Facebook, Inc. and Huggingface, 2020 +# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. # -# This source code is licensed under the MIT license found in the; +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # @@ -13,46 +12,49 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# LICENSE file in the root directory of this source tree. -""" -BlenderbotConfig has the same signature as BartConfig. We only rewrite the signature in order to document -blenderbot-90M defaults. -""" -from ..bart.configuration_bart import BartConfig +""" Blenderbot model configuration """ +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/blenderbot-3B": "https://cdn.huggingface.co/facebook/blenderbot-3B/config.json", - "facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/config.json", + "facebook/blenderbot-90M": "https://huggingface.co/facebook/blenderbot-90M/resolve/main/config.json", + # See all Blenderbot models at https://huggingface.co/models?filter=blenderbot } -class BlenderbotConfig(BartConfig): +class BlenderbotConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a - :class:`~transformers.BlenderbotForConditionalGeneration`. It inherits from :class:`~transformers.BartConfig` and - has the same signature with different defaults. + This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotModel`. It is used + to instantiate an Blenderbot model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Blenderbot + `facebook/blenderbot-90M `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + Args: - vocab_size (:obj:`int`, `optional`, defaults to 54944): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotForConditionalGeneration`. - d_model (:obj:`int`, `optional`, defaults to 512): + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by + the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotModel` or + :class:`~transformers.TFBlenderbotModel`. + d_model (:obj:`int`, `optional`, defaults to 1024): Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 8): - Number of encoder layers, 6 are used for the `blenderbot-90M` model. - decoder_layers (:obj:`int`, `optional`, defaults to 8): - Number of decoder layers, 6 are used for the `blenderbot-90M` model. + encoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of decoder layers. encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer decoder. 
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, @@ -65,117 +67,99 @@ class BlenderbotConfig(BartConfig): The dropout ratio for activations inside the fully connected layer. classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): The dropout ratio for classifier. - max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm after embeddings. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): - Why not add another layernorm? - do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`True`): - Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder. - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`. - is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model. 
- force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models) + + Example:: + + >>> from transformers import BlenderbotModel, BlenderbotConfig + + >>> # Initializing a Blenderbot facebook/blenderbot-90M style configuration + >>> configuration = BlenderbotConfig() + + >>> # Initializing a model from the facebook/blenderbot-90M style configuration + >>> model = BlenderbotModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config """ model_type = "blenderbot" def __init__( self, - activation_dropout=0.0, - extra_pos_embeddings=0, - activation_function="gelu", - vocab_size=54944, - d_model=512, - encoder_ffn_dim=2048, - encoder_layers=8, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, encoder_attention_heads=16, - decoder_ffn_dim=2048, - decoder_layers=8, + decoder_layers=12, + decoder_ffn_dim=4096, decoder_attention_heads=16, encoder_layerdrop=0.0, decoder_layerdrop=0.0, - attention_dropout=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, dropout=0.1, - max_position_embeddings=512, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=1, classifier_dropout=0.0, - is_encoder_decoder=True, + scale_embedding=False, + gradient_checkpointing=False, pad_token_id=1, bos_token_id=0, eos_token_id=2, - normalize_before=False, - add_final_layer_norm=False, - do_blenderbot_90_layernorm=True, - scale_embedding=False, - normalize_embedding=True, - static_position_embeddings=False, - add_bias_logits=False, - force_bos_token_to_be_generated=False, - **common_kwargs + **kwargs ): - r""" - Examples:: - - >>> from transformers import BlenderbotConfig - >>> config = BlenderbotConfig.from_pretrained('facebook/blenderbot-90M') - - """ - if "hidden_size" in common_kwargs: - raise ValueError("hidden size is called d_model") super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, - vocab_size=vocab_size, - d_model=d_model, - encoder_ffn_dim=encoder_ffn_dim, - encoder_layers=encoder_layers, - encoder_layerdrop=encoder_layerdrop, - encoder_attention_heads=encoder_attention_heads, - decoder_layerdrop=decoder_layerdrop, - decoder_ffn_dim=decoder_ffn_dim, - decoder_layers=decoder_layers, - normalize_before=normalize_before, - normalize_embedding=normalize_embedding, - static_position_embeddings=static_position_embeddings, - add_bias_logits=add_bias_logits, - force_bos_token_to_be_generated=force_bos_token_to_be_generated, - do_blenderbot_90_layernorm=do_blenderbot_90_layernorm, - add_final_layer_norm=add_final_layer_norm, - scale_embedding=scale_embedding, - attention_dropout=attention_dropout, - dropout=dropout, - classifier_dropout=classifier_dropout, - activation_dropout=activation_dropout, - max_position_embeddings=max_position_embeddings, - extra_pos_embeddings=extra_pos_embeddings, - activation_function=activation_function, - decoder_attention_heads=decoder_attention_heads, - **common_kwargs, + decoder_start_token_id=decoder_start_token_id, + **kwargs, ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + 
self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.gradient_checkpointing = gradient_checkpointing + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py old mode 100644 new mode 100755 index 2a370fbabf8624..dba7c06e51336f --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1,8 +1,7 @@ -#!/usr/bin/env python3 # coding=utf-8 -# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. # -# This source code is licensed under the MIT license found in the; +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -13,18 +12,469 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# LICENSE file in the root directory of this source tree. -""""BlenderbotForConditionalGeneration which inherits from BART""" +""" PyTorch Blenderbot model. """ + + +import math +import random +from typing import Optional, Tuple import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss -from ...file_utils import add_start_docstrings -from ..bart.modeling_bart import BartForConditionalGeneration, BartModel +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging from .configuration_blenderbot import BlenderbotConfig -BLENDER_START_DOCSTRING = r""" +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BlenderbotConfig" +_TOKENIZER_FOR_DOC = "BlenderbotTokenizer" + + +BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/blenderbot-90M", + # See all Blenderbot models at https://huggingface.co/models?filter=blenderbot +] + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. 
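    For example, with ``decoder_start_token_id = 1``: ``[[5, 6, 2]] -> [[1, 5, 6]]``; any ``-100`` values
    in the shifted ids are then replaced by ``pad_token_id``.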
+ """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +def BlenderbotLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): + if torch.cuda.is_available(): + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) + + +class BlenderbotLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): + assert padding_idx is not None, "`padding_idx` should not be None, but of type int" + num_embeddings + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +class BlenderbotAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
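        # Standard scaled dot-product attention: softmax(Q @ K^T / sqrt(head_dim)) @ V. The
        # 1 / sqrt(head_dim) factor is pre-applied to the query projection in forward() below.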
+ self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class BlenderbotEncoderLayer(nn.Module): + def __init__(self, config: BlenderbotConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = BlenderbotAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = BlenderbotLayerNorm(self.embed_dim) + + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
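        Note that this layer applies layer normalization `before` the self-attention and feed-forward
        blocks (pre-norm) and, if infinities or NaNs appear, clamps the output to the finite range of
        its dtype, which mainly matters when running in float16.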
+ """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BlenderbotDecoderLayer(nn.Module): + def __init__(self, config: BlenderbotConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BlenderbotAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.encoder_attn = BlenderbotAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = BlenderbotLayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
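        When ``use_cache`` is enabled, the returned ``present_key_value`` tuple holds the self-attention
        key/value states in its first two positions and the cross-attention key/value states in its last
        two positions, mirroring the layout expected for ``past_key_value``.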
+ """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BlenderbotClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class BlenderbotPreTrainedModel(PreTrainedModel): + config_class = BlenderbotConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = 
torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +BLENDERBOT_START_DOCSTRING = r""" This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) @@ -33,37 +483,737 @@ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. + Parameters: + config (:class:`~transformers.BlenderbotConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BLENDERBOT_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, BlenderbotConfig + + >>> model = BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') + >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-90M') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) """ -BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/blenderbot-3B", "facebook/blenderbot-90M"] +BLENDERBOT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the :obj:`input_ids` to the right, following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_blenderbot._prepare_decoder_inputs` + and modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. 
+ encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class BlenderbotEncoder(BlenderbotPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`BlenderbotEncoderLayer`. 
+ + Args: + config: BlenderbotConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = BlenderbotLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) + self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layer_norm = BlenderbotLayerNorm(config.d_model) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
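+
+        Example (a minimal, illustrative sketch; the tiny configuration values and input ids below are made up
+        so that the snippet runs quickly and are not model defaults)::
+
+            >>> import torch
+            >>> from transformers import BlenderbotConfig
+            >>> from transformers.models.blenderbot.modeling_blenderbot import BlenderbotEncoder
+
+            >>> config = BlenderbotConfig(vocab_size=50, d_model=16, encoder_layers=2, encoder_attention_heads=4, encoder_ffn_dim=32)
+            >>> encoder = BlenderbotEncoder(config)
+            >>> input_ids = torch.tensor([[0, 6, 10, 4, 2]])
+            >>> outputs = encoder(input_ids=input_ids, return_dict=True)
+            >>> list(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, d_model)
+            [1, 5, 16]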
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # add final layer norm + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BlenderbotDecoder(BlenderbotPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`BlenderbotDecoderLayer` + + Args: + config: BlenderbotConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = BlenderbotLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + ) + self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layer_norm = BlenderbotLayerNorm(config.d_model) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. 
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + # in constrast to Bart, Blenderbot applies layernorm on inputs_embeds + hidden_states = inputs_embeds + positions + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_decoder_cache = () if use_cache else None + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + 
all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False): + if use_cache: + raise ValueError( + "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`." + ) + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + all_cross_attentions += (layer_outputs[2],) + + # add final layer norm + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) @add_start_docstrings( - "The bare BlenderBot Model transformer outputting raw hidden-states without any specific head on top.", - BLENDER_START_DOCSTRING, + "The bare Blenderbot Model outputting raw hidden-states without any specific head on top.", + BLENDERBOT_START_DOCSTRING, ) -class BlenderbotModel(BartModel): - r""" - This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate - documentation alongside usage examples. 
- """ +class BlenderbotModel(BlenderbotPreTrainedModel): + def __init__(self, config: BlenderbotConfig): + super().__init__(config) - config_class = BlenderbotConfig + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = BlenderbotEncoder(config, self.shared) + self.decoder = BlenderbotDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/blenderbot-90M", + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) @add_start_docstrings( - "The BlenderBot Model with a language modeling head. 
Can be used for summarization.", BLENDER_START_DOCSTRING + "The Blenderbot Model with a language modeling head. Can be used for summarization.", BLENDERBOT_START_DOCSTRING ) -class BlenderbotForConditionalGeneration(BartForConditionalGeneration): - """ - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. - """ +class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] - config_class = BlenderbotConfig + def __init__(self, config: BlenderbotConfig): + super().__init__(config) + self.model = BlenderbotModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BLENDERBOT_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. + + Returns: + + Conditional generation example:: + + >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration + >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-90M') + >>> TXT = "My friends are but they eat too many carbs." 
+ + >>> model = BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } def adjust_logits_during_generation(self, logits, cur_len, max_length): - logits[:, self.config.bos_token_id] = -torch.finfo(torch.float16).max # near infinity fp16 if cur_len == max_length - 1 and self.config.eos_token_id is not None: self._force_token_id_to_be_generated(logits, self.config.eos_token_id) return logits + + @staticmethod + def _force_token_id_to_be_generated(scores, token_id) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" + scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/src/transformers/models/old_blenderbot/__init__.py b/src/transformers/models/old_blenderbot/__init__.py new file mode 100644 index 00000000000000..fccb38f80ac145 --- /dev/null +++ b/src/transformers/models/old_blenderbot/__init__.py @@ -0,0 +1,32 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...file_utils import is_tf_available, is_torch_available +from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig +from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer + + +if is_torch_available(): + from .modeling_blenderbot import ( + BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, + BlenderbotForConditionalGeneration, + BlenderbotModel, + ) + +if is_tf_available(): + from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration diff --git a/src/transformers/models/old_blenderbot/configuration_blenderbot.py b/src/transformers/models/old_blenderbot/configuration_blenderbot.py new file mode 100644 index 00000000000000..b273ebb6ae01ac --- /dev/null +++ b/src/transformers/models/old_blenderbot/configuration_blenderbot.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright (c) Facebook, Inc. and Huggingface, 2020 +# +# This source code is licensed under the MIT license found in the; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# LICENSE file in the root directory of this source tree. +""" +BlenderbotConfig has the same signature as BartConfig. We only rewrite the signature in order to document +blenderbot-90M defaults. +""" +from ..bart.configuration_bart import BartConfig + + +BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/blenderbot-3B": "https://cdn.huggingface.co/facebook/blenderbot-3B/config.json", + "facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/config.json", +} + + +class BlenderbotConfig(BartConfig): + r""" + This is the configuration class to store the configuration of a + :class:`~transformers.BlenderbotForConditionalGeneration`. It inherits from :class:`~transformers.BartConfig` and + has the same signature with different defaults. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 54944): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotForConditionalGeneration`. + d_model (:obj:`int`, `optional`, defaults to 512): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 8): + Number of encoder layers, 6 are used for the `blenderbot-90M` model. + decoder_layers (:obj:`int`, `optional`, defaults to 8): + Number of decoder layers, 6 are used for the `blenderbot-90M` model. + encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): + This should be completed, specific to marian. + normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`): + Call layernorm before attention ops. + normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): + Call layernorm after embeddings. + static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): + Don't learn positional embeddings, use sinusoidal. + add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): + Why not add another layernorm? + do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`True`): + Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). + eos_token_id (:obj:`int`, `optional`, defaults to 2) + End of stream token id. + pad_token_id (:obj:`int`, `optional`, defaults to 1) + Padding token id. + bos_token_id (:obj:`int`, `optional`, defaults to 0) + Beginning of stream token id. + encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): + How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`. + is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether this is an encoder/decoder model. + force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), + """ + model_type = "blenderbot" + + def __init__( + self, + activation_dropout=0.0, + extra_pos_embeddings=0, + activation_function="gelu", + vocab_size=54944, + d_model=512, + encoder_ffn_dim=2048, + encoder_layers=8, + encoder_attention_heads=16, + decoder_ffn_dim=2048, + decoder_layers=8, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + attention_dropout=0.0, + dropout=0.1, + max_position_embeddings=512, + classifier_dropout=0.0, + is_encoder_decoder=True, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + normalize_before=False, + add_final_layer_norm=False, + do_blenderbot_90_layernorm=True, + scale_embedding=False, + normalize_embedding=True, + static_position_embeddings=False, + add_bias_logits=False, + force_bos_token_to_be_generated=False, + **common_kwargs + ): + r""" + Examples:: + + >>> from transformers import BlenderbotConfig + >>> config = BlenderbotConfig.from_pretrained('facebook/blenderbot-90M') + + """ + if "hidden_size" in common_kwargs: + raise ValueError("hidden size is called d_model") + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + vocab_size=vocab_size, + d_model=d_model, + encoder_ffn_dim=encoder_ffn_dim, + encoder_layers=encoder_layers, + encoder_layerdrop=encoder_layerdrop, + encoder_attention_heads=encoder_attention_heads, + decoder_layerdrop=decoder_layerdrop, + decoder_ffn_dim=decoder_ffn_dim, + decoder_layers=decoder_layers, + normalize_before=normalize_before, + normalize_embedding=normalize_embedding, + static_position_embeddings=static_position_embeddings, + 
add_bias_logits=add_bias_logits, + force_bos_token_to_be_generated=force_bos_token_to_be_generated, + do_blenderbot_90_layernorm=do_blenderbot_90_layernorm, + add_final_layer_norm=add_final_layer_norm, + scale_embedding=scale_embedding, + attention_dropout=attention_dropout, + dropout=dropout, + classifier_dropout=classifier_dropout, + activation_dropout=activation_dropout, + max_position_embeddings=max_position_embeddings, + extra_pos_embeddings=extra_pos_embeddings, + activation_function=activation_function, + decoder_attention_heads=decoder_attention_heads, + **common_kwargs, + ) diff --git a/src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 00000000000000..d31cf67c1e3f6c --- /dev/null +++ b/src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Blenderbot checkpoint.""" + +import argparse + +import torch + +from transformers import BartConfig, BartForConditionalGeneration +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +PATTERNS = [ + ["attention", "attn"], + ["encoder_attention", "encoder_attn"], + ["q_lin", "q_proj"], + ["k_lin", "k_proj"], + ["v_lin", "v_proj"], + ["out_lin", "out_proj"], + ["norm_embeddings", "layernorm_embedding"], + ["position_embeddings", "embed_positions"], + ["embeddings", "embed_tokens"], + ["ffn.lin", "fc"], +] + + +def rename_state_dict_key(k): + if k == "embeddings.weight": + return "shared.weight" + + for parlai_name, hf_name in PATTERNS: + k = k.replace(parlai_name, hf_name) + + if k.startswith("encoder"): + k = k.replace(".attn", ".self_attn") + k = k.replace("norm1", "self_attn_layer_norm") + k = k.replace("norm2", "final_layer_norm") + elif k.startswith("decoder"): + k = k.replace("norm1", "self_attn_layer_norm") + k = k.replace("norm2", "encoder_attn_layer_norm") + k = k.replace("norm3", "final_layer_norm") + return k + + +def rename_layernorm_keys(sd): + keys = [ + "model.encoder.layernorm_embedding.weight", + "model.encoder.layernorm_embedding.bias", + "model.decoder.layernorm_embedding.weight", + "model.decoder.layernorm_embedding.bias", + ] + for k in keys: + v = sd.pop(k) + new_k = k.replace("layernorm_embedding", "layer_norm") + assert new_k not in sd + sd[new_k] = v + + +IGNORE_KEYS = ["START"] + + +@torch.no_grad() +def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path): + """ + Copy/paste/tweak model's weights to our BERT structure. 
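+
+    Example invocation (illustrative; the paths below are placeholders that simply mirror the argparse defaults
+    defined at the bottom of this script)::
+
+        python convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py \
+            --src_path blenderbot-model.bin \
+            --save_dir hf_blenderbot \
+            --hf_config_json blenderbot-3b-config.json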
+ """ + model = torch.load(checkpoint_path, map_location="cpu") + sd = model["model"] + cfg = BartConfig.from_json_file(config_json_path) + m = BartForConditionalGeneration(cfg) + valid_keys = m.model.state_dict().keys() + failures = [] + mapping = {} + for k, v in sd.items(): + if k in IGNORE_KEYS: + continue + + new_k = rename_state_dict_key(k) + if new_k not in valid_keys: + failures.append([k, new_k]) + else: + mapping[new_k] = v + if cfg.normalize_before: # Blenderbot-3B checkpoints. Rename layernorm_embedding -> layer_norm + rename_layernorm_keys(sd) + m.model.load_state_dict(mapping, strict=True) + m.half() + m.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin") + parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.") + parser.add_argument( + "--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use" + ) + args = parser.parse_args() + convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json) diff --git a/src/transformers/models/old_blenderbot/modeling_blenderbot.py b/src/transformers/models/old_blenderbot/modeling_blenderbot.py new file mode 100644 index 00000000000000..2a370fbabf8624 --- /dev/null +++ b/src/transformers/models/old_blenderbot/modeling_blenderbot.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# LICENSE file in the root directory of this source tree. +""""BlenderbotForConditionalGeneration which inherits from BART""" + +import torch + +from ...file_utils import add_start_docstrings +from ..bart.modeling_bart import BartForConditionalGeneration, BartModel +from .configuration_blenderbot import BlenderbotConfig + + +BLENDER_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + +""" + +BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/blenderbot-3B", "facebook/blenderbot-90M"] + + +@add_start_docstrings( + "The bare BlenderBot Model transformer outputting raw hidden-states without any specific head on top.", + BLENDER_START_DOCSTRING, +) +class BlenderbotModel(BartModel): + r""" + This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate + documentation alongside usage examples. + """ + + config_class = BlenderbotConfig + + +@add_start_docstrings( + "The BlenderBot Model with a language modeling head. 
Can be used for summarization.", BLENDER_START_DOCSTRING +) +class BlenderbotForConditionalGeneration(BartForConditionalGeneration): + """ + This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the + appropriate documentation alongside usage examples. + """ + + config_class = BlenderbotConfig + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + logits[:, self.config.bos_token_id] = -torch.finfo(torch.float16).max # near infinity fp16 + if cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_id_to_be_generated(logits, self.config.eos_token_id) + return logits diff --git a/src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py new file mode 100644 index 00000000000000..ba51e87a1c544f --- /dev/null +++ b/src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF BlenderBot model, ported from the fairseq repo.""" + +import tensorflow as tf + +from ...file_utils import add_start_docstrings +from ...utils import logging +from ..bart.modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration +from .configuration_blenderbot import BlenderbotConfig + + +_CONFIG_FOR_DOC = "BlenderbotConfig" + +START_DOCSTRING = BART_START_DOCSTRING.replace( + "inherits from :class:`~transformers.TFPreTrainedModel`", + "inherits from :class:`~transformers.TFBartForConditionalGeneration`", +).replace("BartConfig", _CONFIG_FOR_DOC) + + +logger = logging.get_logger(__name__) + + +@add_start_docstrings("Blenderbot model for open domain dialogue", START_DOCSTRING) +class TFBlenderbotForConditionalGeneration(TFBartForConditionalGeneration): + config_class = BlenderbotConfig + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + """Never predict pad_token_id. Predict when max_length is reached.""" + vocab_range = tf.constant(range(self.config.vocab_size)) + logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits) + if cur_len == max_length - 1: + logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) + return logits diff --git a/src/transformers/models/old_blenderbot/tokenization_blenderbot.py b/src/transformers/models/old_blenderbot/tokenization_blenderbot.py new file mode 100644 index 00000000000000..bf96a63d04a4da --- /dev/null +++ b/src/transformers/models/old_blenderbot/tokenization_blenderbot.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the; +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# LICENSE file in the root directory of this source tree. +""""BlenderbotTokenizer and BlenderbotSmallTokenizer""" +import json +import os +from typing import Dict, List, Optional, Tuple + +import regex as re + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging +from ..roberta.tokenization_roberta import RobertaTokenizer + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + # "tokenizer_config_file": "tokenizer_config.json", +} +CKPT_3B = "facebook/blenderbot-3B" + + +class BlenderbotTokenizer(RobertaTokenizer): + r""" + Construct a Blenderbot tokenizer. + + :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs + end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token + to the beginning of sequences. + + Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning + parameters. + """ + vocab_files_names = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_config_file": "tokenizer_config.json", + } + pretrained_vocab_files_map = { + "vocab_file": {CKPT_3B: "https://cdn.huggingface.co/facebook/blenderbot-3B/vocab.json"}, + "merges_file": {CKPT_3B: "https://cdn.huggingface.co/facebook/blenderbot-3B/merges.txt"}, + "tokenizer_config_file": {CKPT_3B: "https://cdn.huggingface.co/facebook/blenderbot-3B/tokenizer_config.json"}, + } + max_model_input_sizes = {"facebook/blenderbot-3B": 128} + + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Blenderbot sequence has the following format: + + - single sequence: `` X `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`): + Will be ignored + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + return token_ids_0 + [self.eos_token_id] + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + + pairs = set(pairs) + return pairs + + +class BlenderbotSmallTokenizer(PreTrainedTokenizer): + """ + Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding) + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + merges_file (:obj:`str`): + Path to the merges file. + bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`): + The beginning of sentence token. 
+ eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`): + The end of sentence token. + unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`): + The token used for padding, for example when batching sequences of different lengths. + **kwargs + Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` + """ + + vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + pretrained_vocab_files_map = { + "vocab_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/vocab.json"}, + "merges_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/merges.txt"}, + } + max_model_input_sizes = {"facebook/blenderbot-90M": 512} + + def __init__( + self, + vocab_file, + merges_file, + bos_token="__start__", + eos_token="__end__", + unk_token="__unk__", + pad_token="__null__", + **kwargs + ): + super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + def get_vocab(self) -> Dict: + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token: str) -> str: + if token in self.cache: + return self.cache[token] + token = re.sub("([.,!?()])", r" \1", token) + token = re.sub("(')", r" \1 ", token) + token = re.sub(r"\s{2,}", " ", token) + if "\n" in token: + token = token.replace("\n", " __newln__") + + tokens = token.split(" ") + words = [] + for token in tokens: + if not len(token): + continue + + token = token.lower() + word = tuple(token) + word = tuple(list(word[:-1]) + [word[-1] + ""]) + pairs = get_pairs(word) + + if not pairs: + words.append(token) + continue + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except ValueError: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = "@@ ".join(word) + word = word[:-4] + + self.cache[token] = word + words.append(word) + return " ".join(words) + + def _tokenize(self, text: str) -> List[str]: + """ Split a string into tokens using BPE.""" + split_tokens = [] + + words = re.findall(r"\S+\n?", text) + + for token in words: + split_tokens.extend([t for t in self.bpe(token).split(" ")]) + return split_tokens + + def _convert_token_to_id(self, token: str) -> int: + """ Converts a token to an id using the vocab. 
""" + token = token.lower() + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ Converts a sequence of tokens in a single string. """ + out_string = " ".join(tokens).replace("@@ ", "").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file diff --git a/tests/test_modeling_blenderbot.py b/tests/test_modeling_blenderbot.py index 668569a59553b9..d7341a1df7826c 100644 --- a/tests/test_modeling_blenderbot.py +++ b/tests/test_modeling_blenderbot.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python3 # coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for BlenderBot""" +""" Testing suite for the PyTorch Blenderbot model. 
""" + + +import copy +import tempfile import unittest +import timeout_decorator # noqa + from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -36,98 +42,262 @@ BlenderbotSmallTokenizer, BlenderbotTokenizer, ) - -TOK_DECODE_KW = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True) -FASTER_GEN_KWARGS = dict(num_beams=1, early_stopping=True, min_length=15, max_length=25) + from transformers.models.blenderbot.modeling_blenderbot import BlenderbotDecoder, BlenderbotEncoder + + +def prepare_blenderbot_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + } @require_torch class BlenderbotModelTester: - # Required attributes - vocab_size = 99 - batch_size = 13 - seq_length = 7 - num_hidden_layers = 2 - hidden_size = 16 - num_attention_heads = 4 - is_training = True - - def __init__(self, parent): - torch.manual_seed(0) + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): self.parent = parent - self.config = BlenderbotConfig( - d_model=self.hidden_size, - dropout=0.0, - activation_function="gelu", + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = BlenderbotConfig( vocab_size=self.vocab_size, + d_model=self.hidden_size, encoder_layers=self.num_hidden_layers, decoder_layers=self.num_hidden_layers, encoder_attention_heads=self.num_attention_heads, decoder_attention_heads=self.num_attention_heads, - attention_dropout=0.0, - encoder_ffn_dim=4, - decoder_ffn_dim=4, - do_blenderbot_90_layernorm=False, - normalize_before=True, - max_position_embeddings=50, - static_position_embeddings=False, - 
scale_embedding=True, - bos_token_id=0, - eos_token_id=2, - pad_token_id=1, - num_beams=1, - min_length=3, - max_length=10, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, ) + inputs_dict = prepare_blenderbot_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict def prepare_config_and_inputs_for_common(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - return self.config, inputs_dict + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = BlenderbotModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = BlenderbotModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = BlenderbotEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = BlenderbotDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + 
encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) @require_torch -class BlenderbotTesterMixin(ModelTesterMixin, unittest.TestCase): - if is_torch_available(): - all_generative_model_classes = (BlenderbotForConditionalGeneration,) - all_model_classes = (BlenderbotForConditionalGeneration, BlenderbotModel) - else: - all_generative_model_classes = () - all_model_classes = () +class BlenderbotModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BlenderbotModel, BlenderbotForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (BlenderbotForConditionalGeneration,) if is_torch_available() else () is_encoder_decoder = True - test_head_masking = False test_pruning = False + test_head_masking = False test_missing_keys = False - test_torchscript = False def setUp(self): self.model_tester = BlenderbotModelTester(self) self.config_tester = ConfigTester(self, config_class=BlenderbotConfig) - def test_initialization_module(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = BlenderbotForConditionalGeneration(config).model - model.to(torch_device) - model.eval() - enc_embeds = model.encoder.embed_tokens.weight - assert (enc_embeds == model.shared.weight).all().item() - self.assertAlmostEqual(torch.std(enc_embeds).item(), config.init_std, 2) - - def test_embed_pos_shape(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = BlenderbotForConditionalGeneration(config) - expected_shape = (config.max_position_embeddings + config.extra_pos_embeddings, config.d_model) - assert model.model.encoder.embed_positions.weight.shape == expected_shape - model.model.decoder.embed_positions.weight.shape == expected_shape + def test_config(self): + self.config_tester.run_common_tests() + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) - @unittest.skip("This test is flaky") - def test_feed_forward_chunking(self): - pass + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - @unittest.skip("TODO: Decoder embeddings cannot be resized at the moment") - def test_resize_embeddings_untied(self): - pass + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # BlenderbotForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in (BlenderbotModel, BlenderbotForConditionalGeneration): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: 
+                encoder_input_ids = inputs["input_ids"]
+                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
+                del inputs["input_ids"]
+                inputs.pop("decoder_input_ids", None)
+
+            wte = model.get_input_embeddings()
+            if not self.is_encoder_decoder:
+                inputs["inputs_embeds"] = wte(input_ids)
+            else:
+                inputs["inputs_embeds"] = wte(encoder_input_ids)
+                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
+
+            with torch.no_grad():
+                model(**inputs)[0]
+
+    def test_generate_fp16(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs()
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        model = BlenderbotForConditionalGeneration(config).eval().to(torch_device)
+        if torch_device == "cuda":
+            model.half()
+        model.generate(input_ids, attention_mask=attention_mask)
+        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
+
+
+def assert_tensors_close(a, b, atol=1e-12, prefix=""):
+    """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
+    if a is None and b is None:
+        return True
+    try:
+        if torch.allclose(a, b, atol=atol):
+            return True
+        raise
+    except Exception:
+        pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item()
+        if a.numel() > 100:
+            msg = f"tensor values are {pct_different:.1%} percent different."
+        else:
+            msg = f"{a} != {b}"
+        if prefix:
+            msg = prefix + ": " + msg
+        raise AssertionError(msg)
+
+
+def _long_tensor(tok_lst):
+    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)


 @unittest.skipUnless(torch_device != "cpu", "3B test too slow on CPU.")
@@ -143,6 +313,9 @@ def tokenizer(self):

     @slow
     def test_generation_from_short_input_same_as_parlai_3B(self):
+        FASTER_GEN_KWARGS = dict(num_beams=1, early_stopping=True, min_length=15, max_length=25)
+        TOK_DECODE_KW = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
         torch.cuda.empty_cache()
         model = BlenderbotForConditionalGeneration.from_pretrained(self.ckpt).half().to(torch_device)

@@ -195,7 +368,7 @@ def test_90_generation_from_long_input(self):
         model_inputs.pop("token_type_ids")
         assert isinstance(self.tokenizer, BlenderbotSmallTokenizer)
         generated_ids = self.model.generate(**model_inputs)[0]
-        reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW)
+        reply = self.tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

         assert reply in (
             "i don't know. i just feel like i'm going to throw up. it's not fun.",
@@ -209,7 +382,9 @@ def test_90_generation_from_short_input(self):
         model_inputs.pop("token_type_ids")
         generated_utterances = self.model.generate(**model_inputs)

-        clean_txt = self.tokenizer.decode(generated_utterances[0], **TOK_DECODE_KW)
+        clean_txt = self.tokenizer.decode(
+            generated_utterances[0], skip_special_tokens=True, clean_up_tokenization_spaces=True
+        )
         assert clean_txt in (
             "have you ever been to a sam club? it's a great club in the south.",
             "have you ever heard of sam harris?
he's an american singer, songwriter, and actor.", diff --git a/tests/test_modeling_old_blenderbot.py b/tests/test_modeling_old_blenderbot.py new file mode 100644 index 00000000000000..668569a59553b9 --- /dev/null +++ b/tests/test_modeling_old_blenderbot.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for BlenderBot""" +import unittest + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + AutoModelForSeq2SeqLM, + AutoTokenizer, + BlenderbotConfig, + BlenderbotForConditionalGeneration, + BlenderbotModel, + BlenderbotSmallTokenizer, + BlenderbotTokenizer, + ) + +TOK_DECODE_KW = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True) +FASTER_GEN_KWARGS = dict(num_beams=1, early_stopping=True, min_length=15, max_length=25) + + +@require_torch +class BlenderbotModelTester: + # Required attributes + vocab_size = 99 + batch_size = 13 + seq_length = 7 + num_hidden_layers = 2 + hidden_size = 16 + num_attention_heads = 4 + is_training = True + + def __init__(self, parent): + torch.manual_seed(0) + self.parent = parent + self.config = BlenderbotConfig( + d_model=self.hidden_size, + dropout=0.0, + activation_function="gelu", + vocab_size=self.vocab_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + attention_dropout=0.0, + encoder_ffn_dim=4, + decoder_ffn_dim=4, + do_blenderbot_90_layernorm=False, + normalize_before=True, + max_position_embeddings=50, + static_position_embeddings=False, + scale_embedding=True, + bos_token_id=0, + eos_token_id=2, + pad_token_id=1, + num_beams=1, + min_length=3, + max_length=10, + ) + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} + return self.config, inputs_dict + + +@require_torch +class BlenderbotTesterMixin(ModelTesterMixin, unittest.TestCase): + if is_torch_available(): + all_generative_model_classes = (BlenderbotForConditionalGeneration,) + all_model_classes = (BlenderbotForConditionalGeneration, BlenderbotModel) + else: + all_generative_model_classes = () + all_model_classes = () + is_encoder_decoder = True + test_head_masking = False + test_pruning = False + test_missing_keys = False + test_torchscript = False + + def setUp(self): + self.model_tester = BlenderbotModelTester(self) + self.config_tester = 
ConfigTester(self, config_class=BlenderbotConfig) + + def test_initialization_module(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = BlenderbotForConditionalGeneration(config).model + model.to(torch_device) + model.eval() + enc_embeds = model.encoder.embed_tokens.weight + assert (enc_embeds == model.shared.weight).all().item() + self.assertAlmostEqual(torch.std(enc_embeds).item(), config.init_std, 2) + + def test_embed_pos_shape(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = BlenderbotForConditionalGeneration(config) + expected_shape = (config.max_position_embeddings + config.extra_pos_embeddings, config.d_model) + assert model.model.encoder.embed_positions.weight.shape == expected_shape + model.model.decoder.embed_positions.weight.shape == expected_shape + + @unittest.skip("This test is flaky") + def test_feed_forward_chunking(self): + pass + + @unittest.skip("TODO: Decoder embeddings cannot be resized at the moment") + def test_resize_embeddings_untied(self): + pass + + +@unittest.skipUnless(torch_device != "cpu", "3B test too slow on CPU.") +@require_torch +@require_sentencepiece +@require_tokenizers +class Blenderbot3BIntegrationTests(unittest.TestCase): + ckpt = "facebook/blenderbot-3B" + + @cached_property + def tokenizer(self): + return BlenderbotTokenizer.from_pretrained(self.ckpt) + + @slow + def test_generation_from_short_input_same_as_parlai_3B(self): + torch.cuda.empty_cache() + model = BlenderbotForConditionalGeneration.from_pretrained(self.ckpt).half().to(torch_device) + + src_text = ["Sam"] + model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) + + generated_utterances = model.generate(**model_inputs, **FASTER_GEN_KWARGS) + tgt_text = 'Sam is a great name. It means "sun" in Gaelic.' + + generated_txt = self.tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW) + assert generated_txt[0].strip() == tgt_text + + src_text = "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like i'm going to throw up.\nand why is that?" + + model_inputs = self.tokenizer([src_text], return_tensors="pt").to(torch_device) + + generated_ids = model.generate(**model_inputs, **FASTER_GEN_KWARGS)[0] + reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW) + + assert "I think it's because we are so worried about what people think of us." == reply.strip() + del model + + +@require_torch +class Blenderbot90MIntegrationTests(unittest.TestCase): + ckpt = "facebook/blenderbot-90M" + + @cached_property + def model(self): + model = AutoModelForSeq2SeqLM.from_pretrained(self.ckpt).to(torch_device) + if torch_device == "cuda": + model = model.half() + return model + + @cached_property + def tokenizer(self): + return AutoTokenizer.from_pretrained(self.ckpt) + + @slow + def test_90_generation_from_long_input(self): + + src_text = [ + "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like\ + i'm going to throw up.\nand why is that?" + ] + + model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) + + # model does not have "token_type_ids" + model_inputs.pop("token_type_ids") + assert isinstance(self.tokenizer, BlenderbotSmallTokenizer) + generated_ids = self.model.generate(**model_inputs)[0] + reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW) + + assert reply in ( + "i don't know. i just feel like i'm going to throw up. 
it's not fun.", + "i'm not sure. i just feel like i've been feeling like i have to be in a certain place", + ) + + def test_90_generation_from_short_input(self): + model_inputs = self.tokenizer(["sam"], return_tensors="pt").to(torch_device) + + # model does not have "token_type_ids" + model_inputs.pop("token_type_ids") + generated_utterances = self.model.generate(**model_inputs) + + clean_txt = self.tokenizer.decode(generated_utterances[0], **TOK_DECODE_KW) + assert clean_txt in ( + "have you ever been to a sam club? it's a great club in the south.", + "have you ever heard of sam harris? he's an american singer, songwriter, and actor.", + ) From 1021dd69b62e7fb6b59fee2c0a422ee64dbdd883 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 18:56:58 +0000 Subject: [PATCH 13/51] replace in file --- src/transformers/commands/add_new_model.py | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/transformers/commands/add_new_model.py b/src/transformers/commands/add_new_model.py index 733b389106dbc6..6b27e0b24c3fc7 100644 --- a/src/transformers/commands/add_new_model.py +++ b/src/transformers/commands/add_new_model.py @@ -154,20 +154,20 @@ def remove_copy_lines(path): os.remove(f"{directory}/modeling_tf_{lowercase_model_name}.py") os.remove(f"{directory}/test_modeling_tf_{lowercase_model_name}.py") - # shutil.move( - # f"{directory}/{lowercase_model_name}.rst", - # f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst", - # ) - # - # shutil.move( - # f"{directory}/tokenization_{lowercase_model_name}.py", - # f"{model_dir}/tokenization_{lowercase_model_name}.py", - # ) - # - # shutil.move( - # f"{directory}/tokenization_fast_{lowercase_model_name}.py", - # f"{model_dir}/tokenization_{lowercase_model_name}_fast.py", - # ) + shutil.move( + f"{directory}/{lowercase_model_name}.rst", + f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst", + ) + + shutil.move( + f"{directory}/tokenization_{lowercase_model_name}.py", + f"{model_dir}/tokenization_{lowercase_model_name}.py", + ) + + shutil.move( + f"{directory}/tokenization_fast_{lowercase_model_name}.py", + f"{model_dir}/tokenization_{lowercase_model_name}_fast.py", + ) from os import fdopen, remove from shutil import copymode, move @@ -225,5 +225,5 @@ def replace_in_files(path_to_datafile): remove(path_to_datafile) - # replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py") + replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py") os.rmdir(directory) From c8bbfa1098a1461674df07018f1d384ed3e9efb3 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 19:17:49 +0000 Subject: [PATCH 14/51] correctly split blenderbot --- docs/source/model_doc/blenderbot_small.rst | 74 + src/transformers/__init__.py | 14 +- .../models/auto/configuration_auto.py | 7 + src/transformers/models/auto/modeling_auto.py | 9 +- .../models/blenderbot/modeling_blenderbot.py | 24 - .../models/blenderbot_small/__init__.py | 29 + .../configuration_blenderbot_small.py | 165 +++ .../modeling_blenderbot_small.py | 1193 +++++++++++++++++ .../tokenization_blenderbot_small.py | 225 ++++ .../tokenization_blenderbot_small_fast.py | 103 ++ .../models/marian/modeling_marian.py | 24 - tests/test_modeling_blenderbot.py | 100 +- tests/test_modeling_blenderbot_small.py | 317 +++++ utils/check_repo.py | 4 + 14 files changed, 2135 insertions(+), 153 deletions(-) create mode 100644 docs/source/model_doc/blenderbot_small.rst create mode 100644 
src/transformers/models/blenderbot_small/__init__.py create mode 100644 src/transformers/models/blenderbot_small/configuration_blenderbot_small.py create mode 100755 src/transformers/models/blenderbot_small/modeling_blenderbot_small.py create mode 100644 src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py create mode 100644 src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py create mode 100644 tests/test_modeling_blenderbot_small.py diff --git a/docs/source/model_doc/blenderbot_small.rst b/docs/source/model_doc/blenderbot_small.rst new file mode 100644 index 00000000000000..2802af544fbc17 --- /dev/null +++ b/docs/source/model_doc/blenderbot_small.rst @@ -0,0 +1,74 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +BLENDERBOT_SMALL +----------------------------------------------------------------------------------------------------------------------- + +Note that :class:`~transformers.BlenderbotSmallModel` and +:class:`~transformers.BlenderbotSmallForConditionalGeneration` are only used in combination with the checkpoint +`facebook/blenderbot-90M `__. Larger Blenderbot checkpoints should +instead be used with :class:`~transformers.BlenderbotModel` and +:class:`~transformers.BlenderbotForConditionalGeneration` + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The Blender chatbot model was proposed in `Recipes for building an open-domain chatbot +`__ Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, +Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston on 30 Apr 2020. + +The abstract of the paper is the following: + +*Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that +scaling neural models in the number of parameters and the size of the data they are trained on gives improved results, +we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of +skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to +their partners, and displaying knowledge, empathy and personality appropriately, while maintaining a consistent +persona. We show that large scale models can learn these skills when given appropriate training data and choice of +generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models +and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn +dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing +failure cases of our models.* + +The authors' code can be found `here `__ . 
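+
+A minimal round of dialogue with the 90M checkpoint looks roughly as follows (a sketch only: generation settings
+are left at their defaults, so the exact reply can vary)::
+
+    >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration
+
+    >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot-90M')
+    >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot-90M')
+
+    >>> UTTERANCE = "My friends are cool but they eat too many carbs."
+    >>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
+
+    >>> # generate a reply and decode it back to text
+    >>> reply_ids = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(reply_ids, skip_special_tokens=True))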
+ + +Implementation Notes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +BlenderbotSmallConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BlenderbotSmallConfig + :members: + + +BlenderbotSmallTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BlenderbotSmallTokenizer + :members: build_inputs_with_special_tokens, get_special_tokens_mask, + create_token_type_ids_from_sequences, save_vocabulary + + +BlenderbotSmallModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BlenderbotSmallModel + :members: forward + + +BlenderbotSmallForConditionalGeneration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BlenderbotSmallForConditionalGeneration + :members: forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8121682297f3f8..eedda621c6861c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -120,11 +120,11 @@ from .models.bert_generation import BertGenerationConfig from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer from .models.bertweet import BertweetTokenizer -from .models.blenderbot import ( - BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, - BlenderbotConfig, +from .models.blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig, BlenderbotTokenizer +from .models.blenderbot_small import ( + BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, + BlenderbotSmallConfig, BlenderbotSmallTokenizer, - BlenderbotTokenizer, ) from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer @@ -299,6 +299,7 @@ # Modeling if is_torch_available(): + # Benchmarks from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments @@ -412,6 +413,11 @@ BlenderbotForConditionalGeneration, BlenderbotModel, ) + from .models.blenderbot_small import ( + BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST, + BlenderbotSmallForConditionalGeneration, + BlenderbotSmallModel, + ) from .models.camembert import ( CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, CamembertForCausalLM, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 5a83846634d02e..442a113a2e3149 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -23,6 +23,10 @@ from ..bert.configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig from ..bert_generation.configuration_bert_generation import BertGenerationConfig from ..blenderbot.configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig +from ..blenderbot_small.configuration_blenderbot_small import ( + BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, + BlenderbotSmallConfig, +) from ..camembert.configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from 
..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig @@ -66,6 +70,7 @@ (key, value) for pretrained_map in [ # Add archive maps here + BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -105,6 +110,7 @@ CONFIG_MAPPING = OrderedDict( [ # Add configs here + ("blenderbot_small", BlenderbotSmallConfig), ("retribert", RetriBertConfig), ("mt5", MT5Config), ("t5", T5Config), @@ -150,6 +156,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here + ("blenderbot_small", "BlenderbotSmall"), ("retribert", "RetriBERT"), ("t5", "T5"), ("mobilebert", "MobileBERT"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 79042008815a05..24cf466732f7df 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -21,8 +21,6 @@ from ...configuration_utils import PretrainedConfig from ...file_utils import add_start_docstrings from ...utils import logging - -# Add modeling imports here from ..albert.modeling_albert import ( AlbertForMaskedLM, AlbertForMultipleChoice, @@ -51,6 +49,9 @@ ) from ..bert_generation.modeling_bert_generation import BertGenerationDecoder, BertGenerationEncoder from ..blenderbot.modeling_blenderbot import BlenderbotForConditionalGeneration, BlenderbotModel + +# Add modeling imports here +from ..blenderbot_small.modeling_blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel from ..camembert.modeling_camembert import ( CamembertForCausalLM, CamembertForMaskedLM, @@ -219,6 +220,7 @@ BertConfig, BertGenerationConfig, BlenderbotConfig, + BlenderbotSmallConfig, CamembertConfig, CTRLConfig, DebertaConfig, @@ -262,6 +264,7 @@ MODEL_MAPPING = OrderedDict( [ # Base model mapping + (BlenderbotSmallConfig, BlenderbotSmallModel), (RetriBertConfig, RetriBertModel), (MT5Config, MT5Model), (T5Config, T5Model), @@ -338,6 +341,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ # Model with LM heads mapping + (BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration), (LayoutLMConfig, LayoutLMForMaskedLM), (T5Config, T5ForConditionalGeneration), (DistilBertConfig, DistilBertForMaskedLM), @@ -418,6 +422,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict( [ # Model for Seq2Seq Causal LM mapping + (BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration), (MT5Config, MT5ForConditionalGeneration), (T5Config, T5ForConditionalGeneration), (PegasusConfig, PegasusForConditionalGeneration), diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index dba7c06e51336f..a7bfc3a8714803 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -424,30 +424,6 @@ def forward( return outputs -class BlenderbotClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim: int, - inner_dim: int, - num_classes: int, - pooler_dropout: float, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - def forward(self, hidden_states: torch.Tensor): - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = 
torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.out_proj(hidden_states) - return hidden_states - - class BlenderbotPreTrainedModel(PreTrainedModel): config_class = BlenderbotConfig base_model_prefix = "model" diff --git a/src/transformers/models/blenderbot_small/__init__.py b/src/transformers/models/blenderbot_small/__init__.py new file mode 100644 index 00000000000000..0e3b531d085477 --- /dev/null +++ b/src/transformers/models/blenderbot_small/__init__.py @@ -0,0 +1,29 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...file_utils import is_torch_available +from .configuration_blenderbot_small import BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotSmallConfig +from .tokenization_blenderbot_small import BlenderbotSmallTokenizer + + +if is_torch_available(): + from .modeling_blenderbot_small import ( + BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST, + BlenderbotSmallForConditionalGeneration, + BlenderbotSmallModel, + BlenderbotSmallPreTrainedModel, + ) diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py new file mode 100644 index 00000000000000..8c10b6cf33a529 --- /dev/null +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BlenderbotSmall model configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/blenderbot-90M": "https://huggingface.co/facebook/blenderbot-90M/resolve/main/config.json", + # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small +} + + +class BlenderbotSmallConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotSmallModel`. It is + used to instantiate an BlenderbotSmall model according to the specified arguments, defining the model architecture. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall + `facebook/blenderbot-90M `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be + represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotSmallModel` or + :class:`~transformers.TFBlenderbotSmallModel`. + d_model (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of decoder layers. + encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. 
+ use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models) + + Example:: + + >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig + + >>> # Initializing a BlenderbotSmall facebook/blenderbot-90M style configuration + >>> configuration = BlenderbotSmallConfig() + + >>> # Initializing a model from the facebook/blenderbot-90M style configuration + >>> model = BlenderbotSmallModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "blenderbot_small" + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + classifier_dropout=0.0, + scale_embedding=False, + gradient_checkpointing=False, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.gradient_checkpointing = gradient_checkpointing + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py new file mode 100755 index 00000000000000..8984f9c5649876 --- /dev/null +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -0,0 +1,1193 @@ +# coding=utf-8 +# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch BlenderbotSmall model. """ + + +import math +import random +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_blenderbot_small import BlenderbotSmallConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BlenderbotSmallConfig" +_TOKENIZER_FOR_DOC = "BlenderbotSmallTokenizer" + + +BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/blenderbot-90M", + # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small +] + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +def BlenderbotSmallLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): + if torch.cuda.is_available(): + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) + + +class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. 
+ """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): + assert padding_idx is not None, "`padding_idx` should not be None, but of type int" + num_embeddings + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +class BlenderbotSmallAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class BlenderbotSmallEncoderLayer(nn.Module): + def __init__(self, config: BlenderbotSmallConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = BlenderbotSmallAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, 
tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. + """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BlenderbotSmallDecoderLayer(nn.Module): + def __init__(self, config: BlenderbotSmallConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BlenderbotSmallAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.encoder_attn = BlenderbotSmallAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`): Whether the base model outputs attentions. + This requires the attentions tensor to be reshaped in this function. 
+ """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BlenderbotSmallPreTrainedModel(PreTrainedModel): + config_class = BlenderbotSmallConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +BLENDERBOT_SMALL_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. 
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.BlenderbotSmallConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BLENDERBOT_SMALL_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration, BlenderbotSmallConfig + + >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') + >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot-90M') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) +""" + +BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the :obj:`input_ids` to the right, following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read + :func:`modeling_blenderbot_small._prepare_decoder_inputs` and modify to your needs. See diagram 1 in `the + paper `__ for more information on the default strategy. + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. 
+ past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`BlenderbotSmallEncoderLayer`. 
+ + Args: + config: BlenderbotSmallConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + self.padding_idx, + ) + self.layers = nn.ModuleList([BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = BlenderbotSmallLayerNorm(embed_dim) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
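+
+        Example (an illustrative sketch only: it uses a small, randomly initialized config, so the returned hidden
+        states carry no pretrained semantics and only the output shape is meaningful)::
+
+            >>> import torch
+            >>> from transformers import BlenderbotSmallConfig
+            >>> from transformers.models.blenderbot_small.modeling_blenderbot_small import BlenderbotSmallEncoder
+
+            >>> config = BlenderbotSmallConfig(vocab_size=99, d_model=16, encoder_layers=2, encoder_attention_heads=4)
+            >>> encoder = BlenderbotSmallEncoder(config)
+            >>> input_ids = torch.tensor([[6, 10, 4, 8, 2]])
+            >>> attention_mask = input_ids.ne(config.pad_token_id)
+            >>> outputs = encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
+            >>> outputs.last_hidden_state.shape  # (batch_size, sequence_length, d_model)
+            torch.Size([1, 5, 16])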
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a + :class:`BlenderbotSmallDecoderLayer` + + Args: + config: BlenderbotSmallConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + ) + self.layers = nn.ModuleList([BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = BlenderbotSmallLayerNorm(config.d_model) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. 
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + # BlenderbotSmall applies layer norm on hidden_states + inputs_embeds = self.layernorm_embedding(inputs_embeds) + hidden_states = inputs_embeds + positions + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_decoder_cache = () if use_cache else None + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for 
description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False): + if use_cache: + raise ValueError( + "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`." + ) + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + combined_attention_mask, + encoder_hidden_states, + encoder_attention_mask, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel): + def __init__(self, config: BlenderbotSmallConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = BlenderbotSmallEncoder(config, self.shared) + self.decoder = BlenderbotSmallDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="facebook/blenderbot-90M", + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + 
output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The BlenderbotSmall Model with a language modeling head. 
Can be used for summarization.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] + + def __init__(self, config: BlenderbotSmallConfig): + super().__init__(config) + self.model = BlenderbotSmallModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. + + Returns: + + Conditional generation example:: + + >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration + >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot-90M') + >>> TXT = "My friends are but they eat too many carbs." 
+ + >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_id_to_be_generated(logits, self.config.eos_token_id) + return logits + + @staticmethod + def _force_token_id_to_be_generated(scores, token_id) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" + scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py new file mode 100644 index 00000000000000..72aae43d8672ba --- /dev/null +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the; +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# LICENSE file in the root directory of this source tree. +""""BlenderbotTokenizer and BlenderbotSmallTokenizer""" +import json +import os +from typing import Dict, List, Optional, Tuple + +import regex as re + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + # "tokenizer_config_file": "tokenizer_config.json", +} + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + + pairs = set(pairs) + return pairs + + +class BlenderbotSmallTokenizer(PreTrainedTokenizer): + """ + Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding) + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + merges_file (:obj:`str`): + Path to the merges file. + bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`): + The beginning of sentence token. + eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`): + The end of sentence token. + unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`): + The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`): + The token used for padding, for example when batching sequences of different lengths. + **kwargs + Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` + """ + + vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + pretrained_vocab_files_map = { + "vocab_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/vocab.json"}, + "merges_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/merges.txt"}, + } + max_model_input_sizes = {"facebook/blenderbot-90M": 512} + + def __init__( + self, + vocab_file, + merges_file, + bos_token="__start__", + eos_token="__end__", + unk_token="__unk__", + pad_token="__null__", + **kwargs + ): + super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + def get_vocab(self) -> Dict: + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token: str) -> str: + if token in self.cache: + return self.cache[token] + token = re.sub("([.,!?()])", r" \1", token) + token = re.sub("(')", r" \1 ", token) + token = re.sub(r"\s{2,}", " ", token) + if "\n" in token: + token = token.replace("\n", " __newln__") + + tokens = token.split(" ") + words = [] + for token in tokens: + if not len(token): + continue + + token = token.lower() + word = tuple(token) + word = tuple(list(word[:-1]) + [word[-1] + ""]) + pairs = get_pairs(word) + + if not pairs: + words.append(token) + continue + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except ValueError: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = "@@ ".join(word) + word = word[:-4] + + self.cache[token] = word + words.append(word) + return " ".join(words) + + def _tokenize(self, text: str) -> List[str]: + """ Split a string into tokens using BPE.""" + split_tokens = [] + + words = re.findall(r"\S+\n?", text) + + for token in words: + split_tokens.extend([t for t in self.bpe(token).split(" ")]) + return split_tokens + + def _convert_token_to_id(self, token: str) -> int: + """ Converts a token to an id using the vocab. 
""" + token = token.lower() + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ Converts a sequence of tokens in a single string. """ + out_string = " ".join(tokens).replace("@@ ", "").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py new file mode 100644 index 00000000000000..86d3437ce2f186 --- /dev/null +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for BlenderbotSmall.""" +from typing import List, Optional + +from tokenizers import ByteLevelBPETokenizer + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_blenderbot_small import BlenderbotSmallTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {} + +PRETRAINED_VOCAB_FILES_MAP = {} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/blenderbot-90M": 1024, +} + + +class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's `tokenizers` library). + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = BlenderbotSmallTokenizer + + def __init__( + self, + vocab_file, + merges_file, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + trim_offsets=True, + **kwargs + ): + super().__init__( + ByteLevelBPETokenizer( + vocab_file=vocab_file, + merges_file=merges_file, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + ), + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + **kwargs, + ) + self.add_prefix_space = add_prefix_space + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return output + + return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BlenderbotSmall + does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index d79db98a7c4b3f..70182c1d01bc1c 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -440,30 +440,6 @@ def forward( return outputs -class MarianClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim: int, - inner_dim: int, - num_classes: int, - pooler_dropout: float, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - def forward(self, hidden_states: torch.Tensor): - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.out_proj(hidden_states) - return hidden_states - - class MarianPreTrainedModel(PreTrainedModel): config_class = MarianConfig base_model_prefix = "model" diff --git a/tests/test_modeling_blenderbot.py b/tests/test_modeling_blenderbot.py index d7341a1df7826c..c9302c29ccb421 100644 --- a/tests/test_modeling_blenderbot.py +++ b/tests/test_modeling_blenderbot.py @@ -15,7 +15,6 @@ """ Testing suite for the PyTorch Blenderbot model. 
""" -import copy import tempfile import unittest @@ -33,15 +32,7 @@ if is_torch_available(): import torch - from transformers import ( - AutoModelForSeq2SeqLM, - AutoTokenizer, - BlenderbotConfig, - BlenderbotForConditionalGeneration, - BlenderbotModel, - BlenderbotSmallTokenizer, - BlenderbotTokenizer, - ) + from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration, BlenderbotModel, BlenderbotTokenizer from transformers.models.blenderbot.modeling_blenderbot import BlenderbotDecoder, BlenderbotEncoder @@ -236,36 +227,6 @@ def test_encoder_decoder_model_standalone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - # BlenderbotForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (BlenderbotModel, BlenderbotForConditionalGeneration): - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with torch.no_grad(): - model(**inputs)[0] - def test_generate_fp16(self): config, input_dict = self.model_tester.prepare_config_and_inputs() input_ids = input_dict["input_ids"] @@ -296,10 +257,6 @@ def assert_tensors_close(a, b, atol=1e-12, prefix=""): raise AssertionError(msg) -def _long_tensor(tok_lst): - return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) - - @unittest.skipUnless(torch_device != "cpu", "3B test too slow on CPU.") @require_torch @require_sentencepiece @@ -323,9 +280,6 @@ def test_generation_from_short_input_same_as_parlai_3B(self): model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) generated_utterances = model.generate(**model_inputs, **FASTER_GEN_KWARGS) - import ipdb - - ipdb.set_trace() tgt_text = 'Sam is a great name. It means "sun" in Gaelic.' generated_txt = self.tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW) @@ -340,55 +294,3 @@ def test_generation_from_short_input_same_as_parlai_3B(self): assert "I think it's because we are so worried about what people think of us." == reply.strip() del model - - -@require_torch -class Blenderbot90MIntegrationTests(unittest.TestCase): - ckpt = "facebook/blenderbot-90M" - - @cached_property - def model(self): - model = AutoModelForSeq2SeqLM.from_pretrained(self.ckpt).to(torch_device) - if torch_device == "cuda": - model = model.half() - return model - - @cached_property - def tokenizer(self): - return AutoTokenizer.from_pretrained(self.ckpt) - - @slow - def test_90_generation_from_long_input(self): - - src_text = [ - "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like\ - i'm going to throw up.\nand why is that?" 
- ] - - model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) - - # model does not have "token_type_ids" - model_inputs.pop("token_type_ids") - assert isinstance(self.tokenizer, BlenderbotSmallTokenizer) - generated_ids = self.model.generate(**model_inputs)[0] - reply = self.tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) - - assert reply in ( - "i don't know. i just feel like i'm going to throw up. it's not fun.", - "i'm not sure. i just feel like i've been feeling like i have to be in a certain place", - ) - - def test_90_generation_from_short_input(self): - model_inputs = self.tokenizer(["sam"], return_tensors="pt").to(torch_device) - - # model does not have "token_type_ids" - model_inputs.pop("token_type_ids") - generated_utterances = self.model.generate(**model_inputs) - - clean_txt = self.tokenizer.decode( - generated_utterances[0], skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - assert clean_txt in ( - "have you ever been to a sam club? it's a great club in the south.", - "have you ever heard of sam harris? he's an american singer, songwriter, and actor.", - ) diff --git a/tests/test_modeling_blenderbot_small.py b/tests/test_modeling_blenderbot_small.py new file mode 100644 index 00000000000000..3f180f6be319cc --- /dev/null +++ b/tests/test_modeling_blenderbot_small.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch BlenderbotSmall model. 
""" + + +import tempfile +import unittest + +import timeout_decorator # noqa + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + BlenderbotSmallConfig, + BlenderbotSmallForConditionalGeneration, + BlenderbotSmallModel, + BlenderbotSmallTokenizer, + ) + from transformers.models.blenderbot_small.modeling_blenderbot_small import ( + BlenderbotSmallDecoder, + BlenderbotSmallEncoder, + ) + + +def prepare_blenderbot_small_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": attention_mask, + } + + +@require_torch +class BlenderbotSmallModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = BlenderbotSmallConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_blenderbot_small_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = 
self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = BlenderbotSmallModel(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[ + "last_hidden_state" + ] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = BlenderbotSmallModel(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = BlenderbotSmallEncoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = BlenderbotSmallDecoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class BlenderbotSmallModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = (BlenderbotSmallModel, BlenderbotSmallForConditionalGeneration) if is_torch_available() else () + all_generative_model_classes = (BlenderbotSmallForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False + + def setUp(self): + self.model_tester = BlenderbotSmallModelTester(self) + self.config_tester = ConfigTester(self, config_class=BlenderbotSmallConfig) + + def test_config(self): + self.config_tester.run_common_tests() + + def 
test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = BlenderbotSmallForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +@require_torch +class Blenderbot90MIntegrationTests(unittest.TestCase): + ckpt = "facebook/blenderbot-90M" + + @cached_property + def model(self): + model = BlenderbotSmallForConditionalGeneration.from_pretrained(self.ckpt).to(torch_device) + if torch_device == "cuda": + model = model.half() + return model + + @cached_property + def tokenizer(self): + return BlenderbotSmallTokenizer.from_pretrained(self.ckpt) + + @slow + def test_90_generation_from_long_input(self): + + src_text = [ + "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like\ + i'm going to throw up.\nand why is that?" + ] + + model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) + + # model does not have "token_type_ids" + model_inputs.pop("token_type_ids") + assert isinstance(self.tokenizer, BlenderbotSmallTokenizer) + generated_ids = self.model.generate(**model_inputs)[0] + reply = self.tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + assert reply in ( + "i don't know. i just feel like i'm going to throw up. it's not fun.", + "i'm not sure. 
i just feel like i've been feeling like i have to be in a certain place", + ) + + def test_90_generation_from_short_input(self): + model_inputs = self.tokenizer(["sam"], return_tensors="pt").to(torch_device) + + # model does not have "token_type_ids" + model_inputs.pop("token_type_ids") + generated_utterances = self.model.generate(**model_inputs) + + clean_txt = self.tokenizer.decode( + generated_utterances[0], skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + assert clean_txt in ( + "have you ever been to a sam club? it's a great club in the south.", + "have you ever heard of sam harris? he's an american singer, songwriter, and actor.", + ) diff --git a/utils/check_repo.py b/utils/check_repo.py index 5fd890a3477048..596113d9ed84c1 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -30,6 +30,8 @@ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = [ # models to ignore for not tested + "BlenderbotSmallEncoder", # Building part of bigger (tested) model. + "BlenderbotSmallDecoder", # Building part of bigger (tested) model. "BartDecoder", # Building part of bigger (tested) model. "BartEncoder", # Building part of bigger (tested) model. "BertLMHeadModel", # Needs to be setup as decoder. @@ -64,6 +66,8 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = [ # models to ignore for model xxx mapping + "BlenderbotSmallEncoder", + "BlenderbotSmallDecoder", "BartDecoder", "BartEncoder", "DPRContextEncoder", From bdcaacf85440ed6145d01f3fc78c84f5a1336aba Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 19:20:43 +0000 Subject: [PATCH 15/51] delete "old" folder --- src/transformers/models/old_bart/__init__.py | 39 - .../models/old_bart/configuration_bart.py | 230 --- ..._original_pytorch_checkpoint_to_pytorch.py | 143 -- .../models/old_bart/modeling_bart.py | 1520 ----------------- .../models/old_bart/modeling_tf_bart.py | 1326 -------------- .../models/old_bart/tokenization_bart.py | 99 -- .../models/old_bart/tokenization_bart_fast.py | 92 - .../models/old_blenderbot/__init__.py | 32 - .../configuration_blenderbot.py | 181 -- ..._original_pytorch_checkpoint_to_pytorch.py | 114 -- .../old_blenderbot/modeling_blenderbot.py | 69 - .../old_blenderbot/modeling_tf_blenderbot.py | 46 - .../old_blenderbot/tokenization_blenderbot.py | 269 --- .../models/old_marian/__init__.py | 30 - .../models/old_marian/configuration_marian.py | 100 -- .../convert_marian_tatoeba_to_pytorch.py | 1268 -------------- .../old_marian/convert_marian_to_pytorch.py | 632 ------- .../models/old_marian/modeling_marian.py | 63 - .../models/old_marian/modeling_tf_marian.py | 52 - .../models/old_marian/tokenization_marian.py | 291 ---- src/transformers/models/old_mbart/__init__.py | 33 - .../models/old_mbart/configuration_mbart.py | 105 -- ...rt_mbart_original_checkpoint_to_pytorch.py | 50 - .../models/old_mbart/modeling_mbart.py | 70 - .../models/old_mbart/modeling_tf_mbart.py | 36 - .../models/old_mbart/tokenization_mbart.py | 232 --- .../old_mbart/tokenization_mbart_fast.py | 248 --- .../models/old_pegasus/__init__.py | 33 - .../old_pegasus/configuration_pegasus.py | 145 -- .../convert_pegasus_tf_to_pytorch.py | 132 -- .../models/old_pegasus/modeling_pegasus.py | 83 - .../models/old_pegasus/modeling_tf_pegasus.py | 41 - .../old_pegasus/tokenization_pegasus.py | 294 ---- .../old_pegasus/tokenization_pegasus_fast.py | 232 --- tests/test_modeling_old_blenderbot.py | 216 --- tests/test_modeling_old_marian.py | 334 ---- 
tests/test_modeling_old_mbart.py | 192 --- tests/test_modeling_old_pegasus.py | 126 -- 38 files changed, 9198 deletions(-) delete mode 100644 src/transformers/models/old_bart/__init__.py delete mode 100644 src/transformers/models/old_bart/configuration_bart.py delete mode 100644 src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/old_bart/modeling_bart.py delete mode 100644 src/transformers/models/old_bart/modeling_tf_bart.py delete mode 100644 src/transformers/models/old_bart/tokenization_bart.py delete mode 100644 src/transformers/models/old_bart/tokenization_bart_fast.py delete mode 100644 src/transformers/models/old_blenderbot/__init__.py delete mode 100644 src/transformers/models/old_blenderbot/configuration_blenderbot.py delete mode 100644 src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/old_blenderbot/modeling_blenderbot.py delete mode 100644 src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py delete mode 100644 src/transformers/models/old_blenderbot/tokenization_blenderbot.py delete mode 100644 src/transformers/models/old_marian/__init__.py delete mode 100644 src/transformers/models/old_marian/configuration_marian.py delete mode 100644 src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py delete mode 100644 src/transformers/models/old_marian/convert_marian_to_pytorch.py delete mode 100644 src/transformers/models/old_marian/modeling_marian.py delete mode 100644 src/transformers/models/old_marian/modeling_tf_marian.py delete mode 100644 src/transformers/models/old_marian/tokenization_marian.py delete mode 100644 src/transformers/models/old_mbart/__init__.py delete mode 100644 src/transformers/models/old_mbart/configuration_mbart.py delete mode 100644 src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/old_mbart/modeling_mbart.py delete mode 100644 src/transformers/models/old_mbart/modeling_tf_mbart.py delete mode 100644 src/transformers/models/old_mbart/tokenization_mbart.py delete mode 100644 src/transformers/models/old_mbart/tokenization_mbart_fast.py delete mode 100644 src/transformers/models/old_pegasus/__init__.py delete mode 100644 src/transformers/models/old_pegasus/configuration_pegasus.py delete mode 100644 src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py delete mode 100644 src/transformers/models/old_pegasus/modeling_pegasus.py delete mode 100644 src/transformers/models/old_pegasus/modeling_tf_pegasus.py delete mode 100644 src/transformers/models/old_pegasus/tokenization_pegasus.py delete mode 100644 src/transformers/models/old_pegasus/tokenization_pegasus_fast.py delete mode 100644 tests/test_modeling_old_blenderbot.py delete mode 100644 tests/test_modeling_old_marian.py delete mode 100644 tests/test_modeling_old_mbart.py delete mode 100644 tests/test_modeling_old_pegasus.py diff --git a/src/transformers/models/old_bart/__init__.py b/src/transformers/models/old_bart/__init__.py deleted file mode 100644 index 22acfebc2fbd77..00000000000000 --- a/src/transformers/models/old_bart/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2020 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available -from .configuration_bart import BartConfig -from .tokenization_bart import BartTokenizer - - -if is_tokenizers_available(): - from .tokenization_bart_fast import BartTokenizerFast - -if is_torch_available(): - from .modeling_bart import ( - BART_PRETRAINED_MODEL_ARCHIVE_LIST, - BartForConditionalGeneration, - BartForQuestionAnswering, - BartForSequenceClassification, - BartModel, - BartPretrainedModel, - PretrainedBartModel, - ) - -if is_tf_available(): - from .modeling_tf_bart import TFBartForConditionalGeneration, TFBartModel, TFBartPretrainedModel diff --git a/src/transformers/models/old_bart/configuration_bart.py b/src/transformers/models/old_bart/configuration_bart.py deleted file mode 100644 index 90a1ea780ad06d..00000000000000 --- a/src/transformers/models/old_bart/configuration_bart.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" BART configuration """ - -from ...configuration_utils import PretrainedConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) - -BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/config.json", - "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json", - "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/config.json", - "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/config.json", - "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/config.json", - "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json", - "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/config.json", -} - - -class BartConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to - instantiate a BART model according to the specified arguments, defining the model architecture. - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 
-
- Args:
- vocab_size (:obj:`int`, `optional`, defaults to 50265):
- Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
- :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel`.
- d_model (:obj:`int`, `optional`, defaults to 1024):
- Dimensionality of the layers and the pooler layer.
- encoder_layers (:obj:`int`, `optional`, defaults to 12):
- Number of encoder layers; 6 are used for the `bart-base` model.
- decoder_layers (:obj:`int`, `optional`, defaults to 12):
- Number of decoder layers; 6 are used for the `bart-base` model.
- encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
- Number of attention heads for each attention layer in the Transformer encoder.
- decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
- Number of attention heads for each attention layer in the Transformer decoder.
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
- Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
- encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
- Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
- activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
- The non-linear activation function (function or string) in the encoder and pooler. If string,
- :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
- dropout (:obj:`float`, `optional`, defaults to 0.1):
- The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
- The dropout ratio for the attention probabilities.
- activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
- The dropout ratio for activations inside the fully connected layer.
- classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
- The dropout ratio for the classifier.
- max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
- The maximum sequence length that this model might ever be used with. Typically set this to something large
- just in case (e.g., 512 or 1024 or 2048).
- init_std (:obj:`float`, `optional`, defaults to 0.02):
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to add a bias to the final logits (specific to Marian).
- normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Call layernorm before attention ops.
- normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Call layernorm after embeddings.
- static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Don't learn positional embeddings, use sinusoidal.
- add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether to add a final layer norm after the last encoder/decoder block (used by mBART).
- do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder.
- scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Scale embeddings by dividing by sqrt(d_model).
- eos_token_id (:obj:`int`, `optional`, defaults to 2):
- End of stream token id.
- pad_token_id (:obj:`int`, `optional`, defaults to 1):
- Padding token id.
- bos_token_id (:obj:`int`, `optional`, defaults to 0):
- Beginning of stream token id.
- encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the encoder. See the `LayerDrop paper <https://arxiv.org/abs/1909.11556>`__ for more details.
- decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
- The LayerDrop probability for the decoder. See the `LayerDrop paper <https://arxiv.org/abs/1909.11556>`__ for more details.
- extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
- How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`.
- num_labels: (:obj:`int`, `optional`, defaults to 3):
- The number of labels to use in :class:`~transformers.BartForSequenceClassification`.
- is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether this is an encoder/decoder model.
- force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
- Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
- :obj:`True` for `bart-large-cnn`.
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
- Whether or not the model should return the last key/values attentions (not used by all models).
- """
- model_type = "bart"
- keys_to_ignore_at_inference = ["past_key_values"]
-
- def __init__(
- self,
- activation_dropout=0.0,
- extra_pos_embeddings=2,
- activation_function="gelu",
- vocab_size=50265,
- d_model=1024,
- encoder_ffn_dim=4096,
- encoder_layers=12,
- encoder_attention_heads=16,
- decoder_ffn_dim=4096,
- decoder_layers=12,
- decoder_attention_heads=16,
- encoder_layerdrop=0.0,
- decoder_layerdrop=0.0,
- attention_dropout=0.0,
- dropout=0.1,
- max_position_embeddings=1024,
- init_std=0.02,
- classifier_dropout=0.0,
- num_labels=3,
- is_encoder_decoder=True,
- normalize_before=False,
- add_final_layer_norm=False,
- do_blenderbot_90_layernorm=False,
- scale_embedding=False,
- normalize_embedding=True,
- static_position_embeddings=False,
- add_bias_logits=False,
- force_bos_token_to_be_generated=False,
- use_cache=True,
- pad_token_id=1,
- bos_token_id=0,
- eos_token_id=2,
- **common_kwargs
- ):
- r"""
- :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
- - Examples:: - - >>> from transformers import BartConfig, BartModel - - >>> config = BartConfig.from_pretrained('facebook/bart-large') - >>> model = BartModel(config) - - """ - if "hidden_size" in common_kwargs: - raise ValueError("hidden size is called d_model") - super().__init__( - num_labels=num_labels, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - **common_kwargs, - ) - self.vocab_size = vocab_size - self.d_model = d_model # encoder_embed_dim and decoder_embed_dim - self.encoder_ffn_dim = encoder_ffn_dim - self.encoder_layers = self.num_hidden_layers = encoder_layers - self.encoder_attention_heads = encoder_attention_heads - self.encoder_layerdrop = encoder_layerdrop - self.decoder_layerdrop = decoder_layerdrop - self.decoder_ffn_dim = decoder_ffn_dim - self.decoder_layers = decoder_layers - self.decoder_attention_heads = decoder_attention_heads - self.max_position_embeddings = max_position_embeddings - self.init_std = init_std # Normal(0, this parameter) - self.activation_function = activation_function - - # Params introduced for Mbart - self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True - self.normalize_embedding = normalize_embedding # True for mbart, False otherwise - self.normalize_before = normalize_before # combo of fairseq's encoder_ and decoder_normalize_before - self.add_final_layer_norm = add_final_layer_norm - - # Params introduced for Marian - self.add_bias_logits = add_bias_logits - self.static_position_embeddings = static_position_embeddings - - # 3 Types of Dropout - self.attention_dropout = attention_dropout - self.activation_dropout = activation_dropout - self.dropout = dropout - - # Classifier stuff - self.classifier_dropout = classifier_dropout - - # pos embedding offset - self.extra_pos_embeddings = extra_pos_embeddings - # bart has a hack that offsets positional embeddings by 2, other models don't do this - - self.force_bos_token_to_be_generated = force_bos_token_to_be_generated - - self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm - - self.use_cache = use_cache - - @property - def num_attention_heads(self) -> int: - return self.encoder_attention_heads - - @property - def hidden_size(self) -> int: - return self.d_model - - def is_valid_mbart(self) -> bool: - """Is the configuration aligned with the MBART paper.""" - if self.normalize_before and self.add_final_layer_norm and self.scale_embedding: - return True - if self.normalize_before or self.add_final_layer_norm or self.scale_embedding: - logger.info("This configuration is a mixture of MBART and BART settings") - return False diff --git a/src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 8978b8b2e57f45..00000000000000 --- a/src/transformers/models/old_bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,143 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BART checkpoint.""" - - -import argparse -import os -from pathlib import Path - -import fairseq -import torch -from packaging import version - -from transformers import ( - BartConfig, - BartForConditionalGeneration, - BartForSequenceClassification, - BartModel, - BartTokenizer, -) -from transformers.models.bart.modeling_bart import _make_linear_from_emb -from transformers.utils import logging - - -FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] -extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = " Hello world! cécé herlolip" - -mnli_rename_keys = [ - ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), - ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), - ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), - ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), -] - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "encoder.version", - "decoder.version", - "model.encoder.version", - "model.decoder.version", - "_float_tensor", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def load_xsum_checkpoint(checkpoint_path): - """Checkpoint path should end in model.pt""" - sd = torch.load(checkpoint_path, map_location="cpu") - hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() - hub_interface.model.load_state_dict(sd["model"]) - return hub_interface - - -@torch.no_grad() -def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): - """ - Copy/paste/tweak model's weights to our BERT structure. 
- """ - if not os.path.exists(checkpoint_path): - bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() - else: - bart = load_xsum_checkpoint(checkpoint_path) - - bart.model.upgrade_state_dict(bart.model.state_dict()) - if hf_checkpoint_name is None: - hf_checkpoint_name = checkpoint_path.replace(".", "-") - config = BartConfig.from_pretrained(hf_checkpoint_name) - tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) - tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) - assert torch.eq(tokens, tokens2).all() - - if checkpoint_path == "bart.large.mnli": - state_dict = bart.state_dict() - remove_ignore_keys_(state_dict) - state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] - for src, dest in mnli_rename_keys: - rename_key(state_dict, src, dest) - model = BartForSequenceClassification(config).eval() - model.load_state_dict(state_dict) - fairseq_output = bart.predict("mnli", tokens, return_logits=True) - new_model_outputs = model(tokens)[0] # logits - else: # no classification heads to worry about - state_dict = bart.model.state_dict() - remove_ignore_keys_(state_dict) - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - fairseq_output = bart.extract_features(tokens) - if hf_checkpoint_name == "facebook/bart-large": - model = BartModel(config).eval() - model.load_state_dict(state_dict) - new_model_outputs = model(tokens).model[0] - else: - model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt - model.model.load_state_dict(state_dict) - if hasattr(model, "lm_head"): - model.lm_head = _make_linear_from_emb(model.model.shared) - new_model_outputs = model.model(tokens)[0] - - # Check results - assert fairseq_output.shape == new_model_outputs.shape - assert (fairseq_output == new_model_outputs).all().item() - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." - ) - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" - ) - args = parser.parse_args() - convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) diff --git a/src/transformers/models/old_bart/modeling_bart.py b/src/transformers/models/old_bart/modeling_bart.py deleted file mode 100644 index f631736cc4380b..00000000000000 --- a/src/transformers/models/old_bart/modeling_bart.py +++ /dev/null @@ -1,1520 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""PyTorch BART model, ported from the fairseq repo.""" -import math -import random -import warnings -from typing import Optional, Tuple - -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss - -from ...activations import ACT2FN -from ...file_utils import ( - add_code_sample_docstrings, - add_end_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) -from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, - Seq2SeqModelOutput, - Seq2SeqQuestionAnsweringModelOutput, - Seq2SeqSequenceClassifierOutput, -) -from ...modeling_utils import PreTrainedModel -from ...utils import logging -from .configuration_bart import BartConfig - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "BartConfig" -_TOKENIZER_FOR_DOC = "BartTokenizer" - - -BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/bart-base", - "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "facebook/mbart-large-en-ro", -] -# This list is incomplete. See all BART models at https://huggingface.co/models?filter=bart - - -def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int): - """ - Shift input ids one token to the right, and wrap the last non pad token (usually ). - """ - prev_output_tokens = input_ids.clone() - - assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." - # replace possible -100 values in labels by `pad_token_id` - prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id) - - index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) - decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze() - prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone() - prev_output_tokens[:, 0] = decoder_start_tokens - - return prev_output_tokens - - -def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), float("-inf")) - mask_cond = torch.arange(mask.size(-1)) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
- """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) - - -def BartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - if torch.cuda.is_available(): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - -class BartLearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting - based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to - the forward function. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset: int): - # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 - # and adjust num_embeddings appropriately. Other models dont have this hack - self.offset = offset - assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings += offset - super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) - - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] - positions = torch.arange( - past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) - return super().forward(positions + self.offset) - - -class BartSinusoidalPositionalEmbedding(nn.Embedding): - """This module produces sinusoidal positional embeddings of any length.""" - - def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): - super().__init__(num_positions, embedding_dim) - self.weight = self._init_weight(self.weight) - - @staticmethod - def _init_weight(out: nn.Parameter): - """ - Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in - the 2nd half of the vector. 
[dim // 2:] - """ - n_pos, dim = out.shape - position_enc = np.array( - [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] - ) - out.requires_grad = False # set early to avoid an error in pytorch-1.8+ - sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 - out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) - out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) - out.detach_() - return out - - @torch.no_grad() - def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): - """`input_ids_shape` is expected to be [bsz x seqlen].""" - bsz, seq_len = input_ids_shape[:2] - positions = torch.arange( - past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device - ) - return super().forward(positions) - - -class BartAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - is_decoder: bool = False, - bias: bool = True, - ): - super().__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." - self.scaling = self.head_dim ** -0.5 - self.is_decoder = is_decoder - - self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - key_value_states: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = hidden_states.size() - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
- # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) - key_states = key_states.view(*proj_shape) - value_states = value_states.view(*proj_shape) - - src_len = key_states.size(1) - attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) - - assert attn_weights.size() == ( - bsz * self.num_heads, - tgt_len, - src_len, - ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" - - if attention_mask is not None: - assert attention_mask.size() == ( - bsz, - 1, - tgt_len, - src_len, - ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask - attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - - attn_weights = F.softmax(attn_weights, dim=-1) - - if output_attentions: - # this operation is a bit akward, but it's required to - # make sure that attn_weights keeps its gradient. - # In order to do so, attn_weights have to reshaped - # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) - else: - attn_weights_reshaped = None - - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) - - attn_output = torch.bmm(attn_probs, value_states) - - assert attn_output.size() == ( - bsz * self.num_heads, - tgt_len, - self.head_dim, - ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" - - attn_output = ( - attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) - .transpose(1, 2) - .reshape(bsz, tgt_len, embed_dim) - ) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights_reshaped, past_key_value - - -class BartEncoderLayer(nn.Module): - def __init__(self, config: BartConfig): - super().__init__() - self.embed_dim = config.d_model - self.self_attn = BartAttention( - embed_dim=self.embed_dim, - num_heads=config.encoder_attention_heads, - dropout=config.attention_dropout, - ) - self.normalize_before = config.normalize_before - self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) - self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BartLayerNorm(self.embed_dim) - - def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): - """ - Args: - hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (:obj:`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, 
src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function. - """ - residual = hidden_states - if self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - hidden_states, attn_weights, _ = self.self_attn( - hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions - ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - residual = hidden_states - if self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any(): - clamp_value = torch.finfo(hidden_states.dtype).max - 1000 - hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) - return hidden_states, attn_weights - - -class BartDecoderLayer(nn.Module): - def __init__(self, config: BartConfig): - super().__init__() - self.embed_dim = config.d_model - - self.self_attn = BartAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - is_decoder=True, - ) - self.dropout = config.dropout - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = config.activation_dropout - self.normalize_before = config.normalize_before - - self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) - self.encoder_attn = BartAttention( - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout, - is_decoder=True, - ) - self.encoder_attn_layer_norm = BartLayerNorm(self.embed_dim) - self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) - self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BartLayerNorm(self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[torch.Tensor] = False, - ): - """ - Args: - hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (:obj:`torch.FloatTensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. 
This requires the attentions tensor to be reshaped in this function. - """ - residual = hidden_states - if self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - output_attentions=output_attentions, - ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Cross-Attention Block - cross_attn_present_key_value = None - cross_attn_weights = None - if encoder_hidden_states is not None: - residual = hidden_states - if self.normalize_before: - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - past_key_value=cross_attn_past_key_value, - output_attentions=output_attentions, - ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - if self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) - hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - - return ( - hidden_states, - self_attn_weights, - present_key_value, - cross_attn_weights, - ) - - -class BartClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim: int, - inner_dim: int, - num_classes: int, - pooler_dropout: float, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - def forward(self, hidden_states: torch.Tensor): - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.out_proj(hidden_states) - return hidden_states - - -class BartPretrainedModel(PreTrainedModel): - config_class = BartConfig - base_model_prefix = "model" - - def _init_weights(self, module): - std = self.config.init_std - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - 
if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, BartSinusoidalPositionalEmbedding): - pass - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - @property - def dummy_inputs(self): - pad_token = self.config.pad_token_id - input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) - dummy_inputs = { - "attention_mask": input_ids.ne(pad_token), - "input_ids": input_ids, - } - return dummy_inputs - - -class PretrainedBartModel(BartPretrainedModel): - def __init_subclass__(self): - warnings.warn( - "The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.", - FutureWarning, - ) - - -BART_START_DOCSTRING = r""" - This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch `torch.nn.Module `__ - subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - - Parameters: - config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model - weights. -""" - -BART_GENERATION_EXAMPLE = r""" - Summarization example:: - - >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig - - >>> # see ``examples/summarization/bart/run_eval.py`` for a longer example - >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') - >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') - - >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." - >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') - - >>> # Generate Summary - >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) - >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) -""" - -BART_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using :class:`~transformers.BartTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. 
By default, the model will create this tensor by - shifting the :obj:`input_ids` to the right, following the paper. - decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): - Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will - also be used by default. - - If you want to change padding behavior, you should read :func:`modeling_bart._prepare_decoder_inputs` and - modify to your needs. See diagram 1 in `the paper `__ for more - information on the default strategy. - encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): - Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: - :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, - `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the - cross-attention of the decoder. - past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded - representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` - have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert - :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - - If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` - takes the value of :obj:`inputs_embeds`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. -""" - - -class BartEncoder(BartPretrainedModel): - """ - Transformer encoder consisting of *config.encoder_layers* self attention layers. 
Each layer is a - :class:`BartEncoderLayer`. - - Args: - config: BartConfig - embed_tokens (torch.nn.Embedding): output embedding - """ - - def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): - super().__init__(config) - - self.dropout = config.dropout - self.layerdrop = config.encoder_layerdrop - - embed_dim = config.d_model - self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 - self.padding_idx = config.pad_token_id - self.max_source_positions = config.max_position_embeddings - - if embed_tokens is not None: - self.embed_tokens = embed_tokens - else: - self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) - - if config.static_position_embeddings: - self.embed_positions = BartSinusoidalPositionalEmbedding( - config.max_position_embeddings, embed_dim, self.padding_idx - ) - else: - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - embed_dim, - self.padding_idx, - config.extra_pos_embeddings, - ) - self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layernorm_embedding = BartLayerNorm(embed_dim) if config.normalize_embedding else nn.Identity() - # mbart has one extra layer_norm - self.layer_norm = BartLayerNorm(config.d_model) if config.add_final_layer_norm else None - - self.init_weights() - - def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using :class:`~transformers.BartTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` - for details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded - representation. This is useful if you want more control over how to convert :obj:`input_ids` indices - into associated vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under - returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors - for more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - - embed_pos = self.embed_positions(input_shape) - - hidden_states = inputs_embeds + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) - - # expand attention_mask - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): # skip the layer - attn = None - else: - hidden_states, attn = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions) - - if output_attentions: - all_attentions = all_attentions + (attn,) - - if self.layer_norm: - hidden_states = self.layer_norm(hidden_states) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -class BartDecoder(BartPretrainedModel): - """ - Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`BartDecoderLayer` - - Args: - config: BartConfig - embed_tokens (torch.nn.Embedding): output embedding - """ - - def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): - super().__init__(config) - self.dropout = config.dropout - self.layerdrop = config.decoder_layerdrop - self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm # layernorm variant - self.padding_idx = config.pad_token_id - self.max_target_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - - if embed_tokens is not None: - self.embed_tokens = embed_tokens - else: - self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) - - if config.static_position_embeddings: - self.embed_positions = BartSinusoidalPositionalEmbedding( - config.max_position_embeddings, config.d_model, config.pad_token_id - ) - else: - self.embed_positions = BartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - self.padding_idx, - config.extra_pos_embeddings, - ) - self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layernorm_embedding = BartLayerNorm(config.d_model) if config.normalize_embedding else nn.Identity() - self.layer_norm = BartLayerNorm(config.d_model) if config.add_final_layer_norm else None - - self.init_weights() - - def forward( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using :class:`~transformers.BartTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` - for details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - of the decoder. - encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): - Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values - selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up - decoding. 
- - If :obj:`past_key_values` are used, the user can optionally input only the last - :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of - shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, - sequence_length)`. - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded - representation. This is useful if you want more control over how to convert :obj:`input_ids` indices - into associated vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under - returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors - for more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale - - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length - ).to(self.device) - - # create decoder_padding_mask if not provided and needed - # 4.12.20 (PVP): Not a fan of this "magical" function that - # automatically creates attention_mask for padded tokens - # => this is inconsistent with other models - # => Pegasus uses the pad_token as decoder_start_token_id, so that this could - # pose some problems. 
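    # A worked toy example (not part of this file) of what the backwards-compatibility
    # branch below produces, assuming pad_token_id == 1 and a decoder sequence that
    # starts with the pad token, as Pegasus does:
    #
    #     input_ids                        = [[1, 31414, 232, 2, 1]]
    #     input_ids.ne(pad_token_id)       = [[0,     1,   1, 1, 0]]
    #     after attention_mask[:, 0] = attention_mask[:, 1]:
    #                                        [[1,     1,   1, 1, 0]]
    #
    # i.e. padding positions are masked out, except that the leading token is always
    # kept, and ones are prepended for any cached past_key_values positions.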
- if ( - attention_mask is None - and input_ids is not None - and input_shape[-1] > 1 - and self.config.pad_token_id in input_ids - ): - # should be kept for backwards compatibility - attention_mask = input_ids.ne(self.config.pad_token_id).to(torch.long) - # never mask leading token, even if it is pad - attention_mask[:, 0] = attention_mask[:, 1] - if past_key_values_length > 0: - attention_mask = torch.cat( - [ - torch.ones( - (input_shape[0], past_key_values_length), dtype=torch.long, device=input_ids.device - ), - attention_mask, - ], - dim=-1, - ) - - if attention_mask is not None and combined_attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = combined_attention_mask + _expand_mask( - attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] - ) - - # expand encoder attention mask - if encoder_hidden_states is not None and encoder_attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) - - # embed positions - positions = self.embed_positions(input_shape, past_key_values_length) - - if self.do_blenderbot_90_layernorm: - hidden_states = self.layernorm_embedding(inputs_embeds) - hidden_states += positions - else: - hidden_states = inputs_embeds + positions - hidden_states = self.layernorm_embedding(hidden_states) - - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - all_cross_attentions = () if output_attentions else None - next_decoder_cache = () if use_cache else None - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if output_hidden_states: - all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) - if self.training and (dropout_probability < self.layerdrop): - continue - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer( - hidden_states, - attention_mask=combined_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_value=past_key_value, - output_attentions=output_attentions, - ) - - if use_cache: - next_decoder_cache += (present_key_value,) - - if output_attentions: - all_self_attns += (layer_self_attn,) - all_cross_attentions += (layer_cross_attn,) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - # if config.add_final_layer_norm (mBART) - if self.layer_norm: - hidden_states = self.layer_norm(hidden_states) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - cross_attentions=all_cross_attentions, - ) - - -@add_start_docstrings( - "The bare BART Model outputting raw hidden-states without any specific head on top.", - BART_START_DOCSTRING, -) -class BartModel(BartPretrainedModel): - def __init__(self, config: 
BartConfig): - super().__init__(config) - - padding_idx, vocab_size = config.pad_token_id, config.vocab_size - self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) - - self.encoder = BartEncoder(config, self.shared) - self.decoder = BartDecoder(config, self.shared) - - self.init_weights() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, value): - self.shared = value - self.encoder.embed_tokens = self.shared - self.decoder.embed_tokens = self.shared - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/bart-large", - output_type=Seq2SeqModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - - # 4.12.20 (PVP): Not a fan of this "magical" function and - # also wonder how often it's actually used ... keep now - # for backward compatibility - # -> is this used for backward compatibility - if decoder_input_ids is None and decoder_inputs_embeds is None: - decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id) - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - encoder_hidden_states=encoder_outputs[0], - encoder_attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if not return_dict: - return decoder_outputs + encoder_outputs - - return Seq2SeqModelOutput( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - 
encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings( - "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING -) -class BartForConditionalGeneration(BartPretrainedModel): - base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder\.version", - r"decoder\.version", - r"lm_head\.weight", - ] - - def __init__(self, config: BartConfig): - super().__init__(config) - self.model = BartModel(config) - self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) - self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) - - self.init_weights() - - def get_encoder(self): - return self.model.get_encoder() - - def get_decoder(self): - return self.model.get_decoder() - - def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: - new_embeddings = super().resize_token_embeddings(new_num_tokens) - self._resize_final_logits_bias(new_num_tokens) - return new_embeddings - - def _resize_final_logits_bias(self, new_num_tokens: int) -> None: - old_num_tokens = self.final_logits_bias.shape[-1] - if new_num_tokens <= old_num_tokens: - new_bias = self.final_logits_bias[:, :new_num_tokens] - else: - extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) - new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) - self.register_buffer("final_logits_bias", new_bias) - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - @add_end_docstrings(BART_GENERATION_EXAMPLE) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., - config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. - - Returns: - - Conditional generation example:: - - >>> # Mask filling only works for bart-large - >>> from transformers import BartTokenizer, BartForConditionalGeneration - >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') - >>> TXT = "My friends are but they eat too many carbs." 
- - >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() - >>> # ['good', 'great', 'all', 'really', 'very'] - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if labels is not None: - use_cache = False - if decoder_input_ids is None: - decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - encoder_outputs=encoder_outputs, - decoder_attention_mask=decoder_attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # TODO(SS): do we need to ignore pad tokens in labels? - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return Seq2SeqLMOutput( - loss=masked_lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - def prepare_inputs_for_generation( - self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - # cut decoder_input_ids if past is used - if past is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - - return { - "input_ids": None, # encoder_outputs is defined. 
input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - def adjust_logits_during_generation(self, logits, cur_len, max_length): - if cur_len == 1 and self.config.force_bos_token_to_be_generated: - self._force_token_id_to_be_generated(logits, self.config.bos_token_id) - elif cur_len == max_length - 1 and self.config.eos_token_id is not None: - self._force_token_id_to_be_generated(logits, self.config.eos_token_id) - return logits - - @staticmethod - def _force_token_id_to_be_generated(scores, token_id) -> None: - """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))""" - scores[:, [x for x in range(scores.shape[1]) if x != token_id]] = -float("inf") - - @staticmethod - def _reorder_cache(past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) - return reordered_past - - -@add_start_docstrings( - """ - Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE - tasks. - """, - BART_START_DOCSTRING, -) -class BartForSequenceClassification(BartPretrainedModel): - def __init__(self, config: BartConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = BartModel(config) - self.classification_head = BartClassificationHead( - config.d_model, - config.d_model, - config.num_labels, - config.classifier_dropout, - ) - self.model._init_weights(self.classification_head.dense) - self.model._init_weights(self.classification_head.out_proj) - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/bart-large", - output_type=Seq2SeqSequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - if input_ids is None and inputs_embeds is not None: - raise NotImplementedError( - f"Passing input embeddings is currently not supported for {self.__class__.__name__}" - ) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = outputs[0] # last hidden state - - eos_mask = input_ids.eq(self.config.eos_token_id) - - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ - :, -1, : - ] - logits = self.classification_head(sentence_representation) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - -@add_start_docstrings( - """ - BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layer on top of the hidden-states output to compute `span start logits` and `span end logits`). - """, - BART_START_DOCSTRING, -) -class BartForQuestionAnswering(BartPretrainedModel): - def __init__(self, config): - super().__init__(config) - - config.num_labels = 2 - self.num_labels = config.num_labels - - self.model = BartModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.model._init_weights(self.qa_outputs) - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/bart-large", - output_type=Seq2SeqQuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - start_positions=None, - end_positions=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. 
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if start_positions is not None and end_positions is not None: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = ( - start_logits, - end_logits, - ) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - - return Seq2SeqQuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) diff --git a/src/transformers/models/old_bart/modeling_tf_bart.py b/src/transformers/models/old_bart/modeling_tf_bart.py deleted file mode 100644 index 03c24c209c5a1c..00000000000000 --- a/src/transformers/models/old_bart/modeling_tf_bart.py +++ /dev/null @@ -1,1326 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
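# A minimal, self-contained sketch (not part of either file in this patch) of the span
# extraction performed by BartForQuestionAnswering.forward above: a single Linear layer
# scores every position twice, the two scores are split into start/end logits, and the
# loss is the average of two cross-entropies. All sizes and labels below are made up.
import torch
import torch.nn as nn

batch_size, seq_len, hidden_size = 2, 7, 16
sequence_output = torch.randn(batch_size, seq_len, hidden_size)   # decoder output

qa_outputs = nn.Linear(hidden_size, 2)                # config.num_labels == 2
logits = qa_outputs(sequence_output)                  # (batch_size, seq_len, 2)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)               # (batch_size, seq_len)
end_logits = end_logits.squeeze(-1)                   # (batch_size, seq_len)

start_positions = torch.tensor([1, 3])                # illustrative gold span starts
end_positions = torch.tensor([4, 5])                  # illustrative gold span ends
loss_fct = nn.CrossEntropyLoss(ignore_index=seq_len)  # out-of-range positions are clamped to seq_len and ignored
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2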
-"""TF BART model, ported from the fairseq repo.""" - -import math -import random -import warnings -from typing import Dict, Optional, Tuple, Union - -import numpy as np -import tensorflow as tf - -from ...activations_tf import ACT2FN -from ...file_utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) -from ...modeling_tf_outputs import ( - TFBaseModelOutput, - TFBaseModelOutputWithPast, - TFSeq2SeqLMOutput, - TFSeq2SeqModelOutput, -) - -# Public API -from ...modeling_tf_utils import ( - DUMMY_INPUTS, - TFPreTrainedModel, - TFSharedEmbeddings, - TFWrappedEmbeddings, - input_processing, - keras_serializable, - shape_list, -) -from ...utils import logging -from .configuration_bart import BartConfig - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "BartConfig" -_TOKENIZER_FOR_DOC = "BartTokenizer" - -LARGE_NEGATIVE = -1e8 - - -def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, eos_token_id: int): - shifted_input_ids = tf.cast(input_ids, tf.int32) - shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1) - start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), eos_token_id) - shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1) - # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids - ) - - # "Verify that `labels` has only positive values and -100" - assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.cast(0, tf.int32)) - - # Make sure the assertion op is called by wrapping the result in an identity no-op - with tf.control_dependencies([assert_gte0]): - shifted_input_ids = tf.identity(shifted_input_ids) - - return shifted_input_ids - - -def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = tf.ones((tgt_len, tgt_len), dtype=tf.float32) * LARGE_NEGATIVE - mask_cond = tf.range(shape_list(mask)[-1]) - - mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) - mask = tf.cast(mask, tf.float32) - - if past_key_values_length > 0: - mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) - return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) - - -def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = shape_list(mask) - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) - - return (1.0 - expanded_mask) * LARGE_NEGATIVE - - -class TFBartLearnedPositionalEmbedding(TFSharedEmbeddings): - """ - This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting - based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to - the forward function. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset, **kwargs): - # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 - # and adjust num_embeddings appropriately. 
Other models dont have this hack - self.offset = offset - assert padding_idx is not None, "padding_idx cannot be None" - num_embeddings += offset - super().__init__(num_embeddings, embedding_dim, **kwargs) - - def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0): - """Input is expected to be of size [bsz x seqlen].""" - bsz, seq_len = input_shape[:2] - - positions = tf.range( - past_key_values_length, seq_len + past_key_values_length, delta=1, dtype=tf.int32, name="range" - ) - return super().call(positions + self.offset) # super object is not callable for some reason - - -class TFBartSinusoidalPositionalEmbedding(tf.keras.layers.Embedding): - """This module produces sinusoidal positional embeddings of any length.""" - - def __init__(self, num_positions: int, embedding_dim: int, **kwargs): - - if embedding_dim % 2 != 0: - raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported") - super().__init__( - num_positions, - embedding_dim, - **kwargs, - ) - - def build(self, input_shape: tf.TensorShape): - """ - Build shared token embedding layer Shared weights logic adapted from - https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 - """ - super().build(input_shape) # Instantiates self.weight so it can be loaded - weight: np.ndarray = self._init_weight(self.input_dim, self.output_dim) - self.set_weights([weight]) # overwrite self.weight to correct value - - @staticmethod - def _init_weight(n_pos: int, dim: int): - """ - Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in - the 2nd half of the vector. [dim // 2:] - """ - position_enc = np.array( - [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] - ) - # index 0 is all zero - position_enc[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2]) - position_enc[:, dim // 2 :] = np.cos(position_enc[:, 1::2]) - # convert to tensor - table = tf.convert_to_tensor(position_enc, dtype=tf.float32) - tf.stop_gradient(table) - return table - - def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0): - """Input is expected to be of size [bsz x seqlen].""" - bsz, seq_len = input_shape[:2] - - positions = tf.range( - past_key_values_length, seq_len + past_key_values_length, delta=1, dtype=tf.int32, name="range" - ) - return super().call(positions) - - -class TFBartAttention(tf.keras.layers.Layer): - """Multi-headed attention from "Attention Is All You Need""" - - def __init__( - self, - embed_dim: int, - num_heads: int, - dropout: float = 0.0, - is_decoder: bool = False, - bias: bool = True, - **kwargs, - ): - super().__init__(**kwargs) - self.embed_dim = embed_dim - - self.num_heads = num_heads - self.dropout = tf.keras.layers.Dropout(dropout) - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 - self.is_decoder = is_decoder - - self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") - self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") - self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") - self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") - - def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): - return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) - 
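    # A shape-only sketch (not part of the original file) of what `_shape` above returns,
    # assuming embed_dim = 16 and num_heads = 4, so head_dim = 4:
    #
    #     input            (bsz, seq_len, embed_dim)           e.g. (2, 7, 16)
    #     reshape       -> (bsz, seq_len, num_heads, head_dim)      (2, 7, 4, 4)
    #     transpose     -> (bsz, num_heads, seq_len, head_dim)      (2, 4, 7, 4)
    #
    # i.e. each projection is split into per-head blocks so that the attention scores for
    # all heads can be computed with a single batched matmul in `call` below.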
- def call( - self, - hidden_states: tf.Tensor, - key_value_states: Optional[tf.Tensor] = None, - past_key_value: Optional[Tuple[Tuple[tf.Tensor]]] = None, - attention_mask: Optional[tf.Tensor] = None, - training=False, - ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]: - """Input shape: Batch x Time x Channel""" - - # if key_value_states are provided this layer is used as a cross-attention layer - # for the decoder - is_cross_attention = key_value_states is not None - bsz, tgt_len, embed_dim = shape_list(hidden_states) - - # get query proj - query_states = self.q_proj(hidden_states) * self.scaling - # get key, value proj - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_states = past_key_value[0] - value_states = past_key_value[1] - elif is_cross_attention: - # cross_attentions - key_states = self._shape(self.k_proj(key_value_states), -1, bsz) - value_states = self._shape(self.v_proj(key_value_states), -1, bsz) - elif past_key_value is not None: - # reuse k, v, self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - key_states = tf.concat([past_key_value[0], key_states], axis=2) - value_states = tf.concat([past_key_value[1], value_states], axis=2) - else: - # self_attention - key_states = self._shape(self.k_proj(hidden_states), -1, bsz) - value_states = self._shape(self.v_proj(hidden_states), -1, bsz) - - if self.is_decoder: - # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of - # all previous decoder key/value_states. 
Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_states, value_states) - - proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) - key_states = tf.reshape(key_states, proj_shape) - value_states = tf.reshape(value_states, proj_shape) - - src_len = shape_list(key_states)[1] - attn_weights = tf.matmul(query_states, key_states, transpose_b=True) - - tf.debugging.assert_equal( - shape_list(attn_weights), - [bsz * self.num_heads, tgt_len, src_len], - message=f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {shape_list(attn_weights)}", - ) - - if attention_mask is not None: - tf.debugging.assert_equal( - shape_list(attention_mask), - [bsz, 1, tgt_len, src_len], - message=f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {shape_list(attention_mask)}", - ) - attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask - attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - - attn_weights = tf.nn.softmax(attn_weights, axis=-1) - - attn_probs = self.dropout(attn_weights, training=training) - - attn_output = tf.matmul(attn_probs, value_states) - - tf.debugging.assert_equal( - shape_list(attn_output), - [bsz * self.num_heads, tgt_len, self.head_dim], - message=f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {shape_list(attn_output)}", - ) - - attn_output = tf.transpose( - tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) - ) - attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) - - attn_output = self.out_proj(attn_output) - attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) - - return attn_output, attn_weights, past_key_value - - -class TFBartEncoderLayer(tf.keras.layers.Layer): - def __init__(self, config: BartConfig, **kwargs): - super().__init__(**kwargs) - self.embed_dim = config.d_model - self.self_attn = TFBartAttention( - self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" - ) - self.normalize_before = config.normalize_before - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - - def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, training=False): - """ - Args: - hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (:obj:`tf.Tensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
- """ - residual = hidden_states - if self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - hidden_states, self_attn_weights, _ = self.self_attn( - hidden_states=hidden_states, attention_mask=attention_mask - ) - tf.debugging.assert_equal( - shape_list(hidden_states), - shape_list(residual), - message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", - ) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - residual = hidden_states - if self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = self.activation_dropout(hidden_states, training=training) - hidden_states = self.fc2(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - - return hidden_states, self_attn_weights - - -class TFBartDecoderLayer(tf.keras.layers.Layer): - def __init__(self, config: BartConfig, **kwargs): - super().__init__(**kwargs) - self.embed_dim = config.d_model - self.self_attn = TFBartAttention( - embed_dim=self.embed_dim, - num_heads=config.decoder_attention_heads, - dropout=config.attention_dropout, - name="self_attn", - is_decoder=True, - ) - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.activation_fn = ACT2FN[config.activation_function] - self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) - self.normalize_before = config.normalize_before - - self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") - self.encoder_attn = TFBartAttention( - self.embed_dim, - config.decoder_attention_heads, - dropout=config.attention_dropout, - name="encoder_attn", - is_decoder=True, - ) - self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") - self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") - self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") - - def call( - self, - hidden_states, - attention_mask: Optional[tf.Tensor] = None, - encoder_hidden_states: Optional[tf.Tensor] = None, - encoder_attention_mask: Optional[tf.Tensor] = None, - past_key_value: Optional[Tuple[tf.Tensor]] = None, - training=False, - ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: - """ - Args: - hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)` - attention_mask (:obj:`tf.Tensor`): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
- past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states - """ - residual = hidden_states - if self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Self Attention - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - # add present self-attn cache to positions 1,2 of present_key_value tuple - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - past_key_value=self_attn_past_key_value, - attention_mask=attention_mask, - ) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.self_attn_layer_norm(hidden_states) - - # Cross-Attention Block - cross_attn_present_key_value = None - if encoder_hidden_states is not None: - residual = hidden_states - if self.normalize_before: - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - hidden_states, _, cross_attn_present_key_value = self.encoder_attn( - hidden_states=hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - past_key_value=cross_attn_past_key_value, - ) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = residual + hidden_states - if not self.normalize_before: - hidden_states = self.encoder_attn_layer_norm(hidden_states) - - # add cross-attn to positions 3,4 of present_key_value tuple - present_key_value = present_key_value + cross_attn_present_key_value - - # Fully Connected - residual = hidden_states - if self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = self.activation_dropout(hidden_states, training=training) - hidden_states = self.fc2(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = residual + hidden_states - - if not self.normalize_before: - hidden_states = self.final_layer_norm(hidden_states) - - return ( - hidden_states, - self_attn_weights, - present_key_value, - ) - - -class TFBartPretrainedModel(TFPreTrainedModel): - config_class = BartConfig - base_model_prefix = "model" - - @property - def dummy_inputs(self): - pad_token = 1 - input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32) - decoder_input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32) - dummy_inputs = { - "decoder_input_ids": decoder_input_ids, - "attention_mask": tf.math.not_equal(input_ids, pad_token), - "input_ids": input_ids, - } - return dummy_inputs - - -class TFPretrainedBartModel(TFBartPretrainedModel): - def __init_subclass__(self): - warnings.warn( - "The class `TFPretrainedBartModel` has been deprecated, please use `TFBartPretrainedModel` instead.", - FutureWarning, - ) - - -BART_START_DOCSTRING = r""" - This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the - generic methods the library implements for all its model (such as downloading or saving, resizing the input - embeddings, pruning heads etc.) - - This model is also a `tf.keras.Model `__ subclass. 
Use - it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage - and behavior. - - .. note:: - - TF 2.0 models accepts two formats as inputs: - - - having all inputs as keyword arguments (like PyTorch models), or - - having all inputs as a list, tuple or dict in the first positional arguments. - - This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all - the tensors in the first argument of the model call function: :obj:`model(inputs)`. - - If you choose this second option, there are three possibilities you can use to gather all the input Tensors in - the first positional argument : - - - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)` - - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: - :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` - - a dictionary with one or several input Tensors associated to the input names given in the docstring: - :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})` - - Args: - config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the - model weights. -""" - -BART_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~transformers.BertTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the input_ids right, following the paper. - decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): - will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. - encoder_outputs (:obj:`tf.FloatTensor`, `optional`): - hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. - of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of - past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`) - contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. 
- use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.TFModelOutput` instead of a plain tuple. - training (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to use the model in training mode (some modules like dropout modules have different - behaviors between training and evaluation). -""" - - -@keras_serializable -class TFBartEncoder(tf.keras.layers.Layer): - config_class = BartConfig - """ - Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a - :class:`TFBartEncoderLayer`. - - Args: - config: BartConfig - """ - - def __init__(self, config: BartConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, **kwargs): - super().__init__(**kwargs) - self.config = config - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.layerdrop = config.encoder_layerdrop - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - self.padding_idx = config.pad_token_id - self.max_source_positions = config.max_position_embeddings - - self.embed_tokens = embed_tokens - if config.static_position_embeddings: - self.embed_positions = TFBartSinusoidalPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - name="embed_positions", - ) - else: - self.embed_positions = TFBartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - self.padding_idx, - config.extra_pos_embeddings, - name="embed_positions", - ) - self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] - self.layernorm_embedding = ( - tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - if config.normalize_embedding - else tf.keras.layers.Layer() - ) - self.layer_norm = ( - tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - if config.add_final_layer_norm - else None - ) - - def call( - self, - input_ids=None, - inputs_embeds=None, - attention_mask=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, - **kwargs, - ): - """ - Args: - input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using :class:`~transformers.BartTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` - for details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? 
<../glossary.html#attention-mask>`__ - inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded - representation. This is useful if you want more control over how to convert :obj:`input_ids` indices - into associated vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under - returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors - for more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. - """ - inputs = input_processing( - func=self.call, - config=self.config, - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=training, - kwargs_call=kwargs, - ) - - if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif inputs["input_ids"] is not None: - input_shape = shape_list(inputs["input_ids"]) - elif inputs["inputs_embeds"] is not None: - input_shape = shape_list(inputs["inputs_embeds"])[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if inputs["inputs_embeds"] is None: - inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"]) - else: - inputs["inputs_embeds"] = inputs["inputs_embeds"] - - inputs["inputs_embeds"] = inputs["inputs_embeds"] * self.embed_scale - - embed_pos = self.embed_positions(input_shape) - hidden_states = inputs["inputs_embeds"] + embed_pos - hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = self.dropout(hidden_states, training=inputs["training"]) - - # check attention mask and invert - if inputs["attention_mask"] is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - inputs["attention_mask"] = _expand_mask(inputs["attention_mask"]) - - encoder_states = () if inputs["output_hidden_states"] else None - all_attentions = () if inputs["output_attentions"] else None - - # encoder layers - for encoder_layer in self.layers: - - if inputs["output_hidden_states"]: - encoder_states = encoder_states + (hidden_states,) - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = random.uniform(0, 1) - if inputs["training"] and (dropout_probability < self.layerdrop): # skip the layer - continue - - hidden_states, attn = encoder_layer(hidden_states, inputs["attention_mask"]) - - if inputs["output_attentions"]: - all_attentions += (attn,) - if self.layer_norm: - hidden_states = self.layer_norm(hidden_states) - if inputs["output_hidden_states"]: - encoder_states = encoder_states + (hidden_states,) - - if not inputs["return_dict"]: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return TFBaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -@keras_serializable -class TFBartDecoder(tf.keras.layers.Layer): - config_class = BartConfig - """ - Transformer decoder consisting of 
*config.decoder_layers* layers. Each layer is a :class:`TFBartDecoderLayer` - - Args: - config: BartConfig - embed_tokens: output embedding - """ - - def __init__(self, config: BartConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, **kwargs): - super().__init__(**kwargs) - self.config = config - self.padding_idx = config.pad_token_id - self.embed_tokens = embed_tokens - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 - self.layerdrop = config.decoder_layerdrop - if config.static_position_embeddings: - self.embed_positions = TFBartSinusoidalPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - name="embed_positions", - ) - else: - self.embed_positions = TFBartLearnedPositionalEmbedding( - config.max_position_embeddings, - config.d_model, - self.padding_idx, - config.extra_pos_embeddings, - name="embed_positions", - ) - self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] - self.layernorm_embedding = ( - tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") - if config.normalize_embedding - else tf.keras.layers.Layer() - ) - self.layer_norm = ( - tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") - if config.add_final_layer_norm - else None - ) - - self.dropout = tf.keras.layers.Dropout(config.dropout) - self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm - - def call( - self, - input_ids=None, - inputs_embeds=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, - **kwargs, - ): - r""" - Args: - input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you - provide it. - - Indices can be obtained using :class:`~transformers.BartTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` - for details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - of the decoder. - encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): - Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values - selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up - decoding. 
- - If :obj:`past_key_values` are used, the user can optionally input only the last - :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of - shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, - sequence_length)`. - inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded - representation. This is useful if you want more control over how to convert :obj:`input_ids` indices - into associated vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under - returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors - for more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. - """ - inputs = input_processing( - func=self.call, - config=self.config, - input_ids=input_ids, - attention_mask=attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - inputs_embeds=inputs_embeds, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=training, - kwargs_call=kwargs, - ) - - if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif inputs["input_ids"] is not None: - input_shape = shape_list(inputs["input_ids"]) - elif inputs["inputs_embeds"] is not None: - input_shape = shape_list(inputs["inputs_embeds"])[:-1] - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - past_key_values_length = ( - inputs["past_key_values"][0][0].shape[2] if inputs["past_key_values"] is not None else 0 - ) - - # embed positions - positions = self.embed_positions(input_shape, past_key_values_length) - - if inputs["inputs_embeds"] is None: - inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"]) - - hidden_states = inputs["inputs_embeds"] * self.embed_scale - - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) - else: - combined_attention_mask = _expand_mask( - tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] - ) - - if inputs["attention_mask"] is None and inputs["input_ids"] is not None and input_shape[-1] > 1: - inputs["attention_mask"] = tf.cast( - tf.math.not_equal(inputs["input_ids"], self.config.pad_token_id), inputs["input_ids"].dtype - ) - inputs["attention_mask"] = tf.concat( - [ - tf.ones((input_shape[0], past_key_values_length), dtype=inputs["attention_mask"].dtype), - inputs["attention_mask"], - ], - axis=-1, - ) - else: - inputs["attention_mask"] = tf.ones( - (input_shape[0], input_shape[1] + past_key_values_length), dtype=tf.int32 - ) - - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = combined_attention_mask + _expand_mask( - 
inputs["attention_mask"], tgt_len=input_shape[-1] - ) - - if inputs["encoder_hidden_states"] is not None and inputs["encoder_attention_mask"] is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - inputs["encoder_attention_mask"] = _expand_mask(inputs["encoder_attention_mask"], tgt_len=input_shape[-1]) - - if self.do_blenderbot_90_layernorm: - hidden_states = self.layernorm_embedding(hidden_states) + positions - else: - hidden_states = self.layernorm_embedding(hidden_states + positions) - hidden_states = self.dropout(hidden_states, training=inputs["training"]) - - # decoder layers - all_hidden_states = () - all_self_attns = () - present_key_values = () - for idx, decoder_layer in enumerate(self.layers): - # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - if inputs["output_hidden_states"]: - all_hidden_states += (hidden_states,) - dropout_probability = random.uniform(0, 1) - - if inputs["training"] and (dropout_probability < self.layerdrop): - continue - - past_key_value = inputs["past_key_values"][idx] if inputs["past_key_values"] is not None else None - - hidden_states, layer_self_attn, present_key_value = decoder_layer( - hidden_states, - attention_mask=combined_attention_mask, - encoder_hidden_states=inputs["encoder_hidden_states"], - encoder_attention_mask=inputs["encoder_attention_mask"], - past_key_value=past_key_value, - ) - - if inputs["use_cache"]: - present_key_values += (present_key_value,) - - if inputs["output_attentions"]: - all_self_attns += (layer_self_attn,) - - if self.layer_norm is not None: # same as if config.add_final_layer_norm - hidden_states = self.layer_norm(hidden_states) - - # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) - if inputs["output_hidden_states"]: - all_hidden_states += (hidden_states,) - else: - all_hidden_states = None - - all_self_attns = list(all_self_attns) if inputs["output_attentions"] else None - - present_key_values = (inputs["encoder_hidden_states"], present_key_values) if inputs["use_cache"] else None - - if not inputs["return_dict"]: - return hidden_states, present_key_values, all_hidden_states, all_self_attns - else: - return TFBaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=present_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -@add_start_docstrings( - "The bare BART Model outputting raw hidden-states without any specific head on top.", - BART_START_DOCSTRING, -) -@keras_serializable -class TFBartModel(TFBartPretrainedModel): - base_model_prefix = "model" - - def __init__(self, config: BartConfig, *inputs, **kwargs): - super().__init__(config, *inputs, **kwargs) - self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, config.pad_token_id, name="model.shared") - - with tf.compat.v1.variable_scope("model.shared") as shared_abs_scope_name: - pass - - # Wraps layer to avoid problems with weight restoring and ensuring we're in the correct TF scope. 
- embed_tokens = TFWrappedEmbeddings(self.shared, abs_scope_name=shared_abs_scope_name) - embed_tokens.vocab_size = self.shared.vocab_size - embed_tokens.hidden_size = self.shared.hidden_size - - self.encoder = TFBartEncoder(config, embed_tokens, name="encoder") - self.decoder = TFBartDecoder(config, embed_tokens, name="decoder") - - def get_decoder(self): - return self.decoder - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/bart-large", - output_type=TFSeq2SeqModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def call( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - training=False, - **kwargs - ): - inputs = input_processing( - func=self.call, - config=self.config, - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=training, - kwargs_call=kwargs, - ) - - if inputs["decoder_input_ids"] is None and inputs["decoder_inputs_embeds"] is None: - inputs["use_cache"] = False - - inputs["output_hidden_states"] = ( - inputs["output_hidden_states"] - if inputs["output_hidden_states"] is not None - else self.config.output_hidden_states - ) - - if inputs["decoder_input_ids"] is None and inputs["input_ids"] is not None: - inputs["decoder_input_ids"] = shift_tokens_right( - inputs["input_ids"], self.config.pad_token_id, self.config.eos_token_id - ) - - if inputs["encoder_outputs"] is None: - inputs["encoder_outputs"] = self.encoder( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"], - inputs_embeds=inputs["inputs_embeds"], - output_attentions=inputs["output_attentions"], - output_hidden_states=inputs["output_hidden_states"], - return_dict=inputs["return_dict"], - training=inputs["training"], - ) - # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True - elif inputs["return_dict"] and not isinstance(inputs["encoder_outputs"], TFBaseModelOutput): - inputs["encoder_outputs"] = TFBaseModelOutput( - last_hidden_state=inputs["encoder_outputs"][0], - hidden_states=inputs["encoder_outputs"][1] if len(inputs["encoder_outputs"]) > 1 else None, - attentions=inputs["encoder_outputs"][2] if len(inputs["encoder_outputs"]) > 2 else None, - ) - # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False - elif not inputs["return_dict"] and not isinstance(inputs["encoder_outputs"], tuple): - inputs["encoder_outputs"] = inputs["encoder_outputs"].to_tuple() - - decoder_outputs = self.decoder( - inputs["decoder_input_ids"], - attention_mask=inputs["decoder_attention_mask"], - encoder_hidden_states=inputs["encoder_outputs"][0], - encoder_attention_mask=inputs["attention_mask"], - past_key_values=inputs["past_key_values"], - inputs_embeds=inputs["decoder_inputs_embeds"], - 
use_cache=inputs["use_cache"], - output_attentions=inputs["output_attentions"], - output_hidden_states=inputs["output_hidden_states"], - return_dict=inputs["return_dict"], - training=inputs["training"], - ) - - if not inputs["return_dict"]: - return decoder_outputs + inputs["encoder_outputs"] - - return TFSeq2SeqModelOutput( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - encoder_last_hidden_state=inputs["encoder_outputs"].last_hidden_state, - encoder_hidden_states=inputs["encoder_outputs"].hidden_states, - encoder_attentions=inputs["encoder_outputs"].attentions, - ) - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, value): - self.shared = value - - def get_output_embeddings(self): - return self.shared - - -@add_start_docstrings( - "The BART Model with a language modeling head. Can be used for summarization.", - BART_START_DOCSTRING, -) -class TFBartForConditionalGeneration(TFBartPretrainedModel): - _keys_to_ignore_on_load_unexpected = [ - r"model.encoder.embed_tokens.weight", - r"model.decoder.embed_tokens.weight", - ] - - def __init__(self, config, *inputs, **kwargs): - super().__init__(config, *inputs, **kwargs) - self.model = TFBartModel(config, name="model") - self.use_cache = config.use_cache - # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency. - self.final_logits_bias = self.add_weight( - name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False - ) - - def get_decoder(self): - return self.model.decoder - - def resize_token_embeddings(self, new_num_tokens): - super().resize_token_embeddings(new_num_tokens=new_num_tokens) - - # BART is a special case where the bias has two dimensions - # and not named just `bias` - if new_num_tokens is not None: - num_tokens_to_copy = min(self.final_logits_bias.shape[0], new_num_tokens) - init_bias = tf.zeros((new_num_tokens,)) - init_bias[:num_tokens_to_copy] = self.final_logits_bias.value()[:num_tokens_to_copy] - self.final_logits_bias = self.add_weight( - shape=(1, new_num_tokens), - initializer="zeros", - trainable=False, - name="final_logits_bias", - ) - self.final_logits_bias.assign(init_bias) - - @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - def call( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs: Optional[TFBaseModelOutput] = None, - past_key_values=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, - **kwargs, - ): - """ - Returns: - - Examples:: - - # Mask filling only works for bart-large - from transformers import BartTokenizer, TFBartForConditionalGeneration - import tensorflow as tf - mname = 'facebook/bart-large' - tokenizer = BartTokenizer.from_pretrained(mname) - TXT = "My friends are but they eat too many carbs." 
- model = TFBartForConditionalGeneration.from_pretrained(mname) - batch = tokenizer([TXT], return_tensors='tf') - logits = model(inputs=batch.input_ids).logits - probs = tf.nn.softmax(logits[0]) - # probs[5] is associated with the mask token - """ - inputs = input_processing( - func=self.call, - config=self.config, - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - labels=labels, - training=training, - kwargs_call=kwargs, - ) - - if inputs["labels"] is not None: - inputs["use_cache"] = False - if inputs["decoder_input_ids"] is None: - inputs["decoder_input_ids"] = shift_tokens_right( - inputs["labels"], self.config.pad_token_id, self.config.eos_token_id - ) - - outputs = self.model( - inputs["input_ids"], - attention_mask=inputs["attention_mask"], - decoder_input_ids=inputs["decoder_input_ids"], - encoder_outputs=inputs["encoder_outputs"], - decoder_attention_mask=inputs["decoder_attention_mask"], - past_key_values=inputs["past_key_values"], - inputs_embeds=inputs["inputs_embeds"], - decoder_inputs_embeds=inputs["decoder_inputs_embeds"], - use_cache=inputs["use_cache"], - output_attentions=inputs["output_attentions"], - output_hidden_states=inputs["output_hidden_states"], - return_dict=inputs["return_dict"], - training=inputs["training"], - ) - lm_logits = self.model.shared(outputs[0], mode="linear") - lm_logits = lm_logits + self.final_logits_bias - masked_lm_loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], lm_logits) - - if not inputs["return_dict"]: - output = (lm_logits,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return TFSeq2SeqLMOutput( - loss=masked_lm_loss, - logits=lm_logits, - past_key_values=outputs.past_key_values, # index 1 of d outputs - decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs - decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs - encoder_last_hidden_state=outputs.last_hidden_state, # index 0 of encoder outputs - encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out - encoder_attentions=outputs.encoder_attentions, # 2 of e out - ) - - def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs) -> Dict: - assert past is not None and len(past) in {1, 2}, f"past has to be an iterable of length 1,2 got {past}" - if len(past) == 1: - assert isinstance(past[0], tf.Tensor), f"`past[0]` has to be of type `tf.Tensor`, but is {type(past[0])}" - encoder_outputs = TFBaseModelOutput(last_hidden_state=past[0]) - past_key_values = None - else: - assert ( - len(past) == 2 - ), "`past` has to be of length 2 with the encoder_outputs at the first position and past_key_values at the second position." 
- encoder_outputs, past_key_values = past - if isinstance(encoder_outputs, tuple): - assert isinstance( - encoder_outputs[0], tf.Tensor - ), f"`encoder_outputs[0]` has to be of type `tf.Tensor`, but is {type(encoder_outputs[0])}" - encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_outputs[0]) - elif isinstance(encoder_outputs, tf.Tensor): - encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_outputs) - assert ( - past_key_values - ), f"decoder cached states must be truthy. got {past_key_values} from the 2nd element of past" - decoder_input_ids = decoder_input_ids[:, -1:] - - assert isinstance( - encoder_outputs, TFBaseModelOutput - ), f"encoder_outputs should be a TFBaseModelOutput, Instead got {type(encoder_outputs)}." - return { - "input_ids": None, # encoder_outputs is defined. input_ids not needed - "encoder_outputs": encoder_outputs, - "past_key_values": past_key_values, - "decoder_input_ids": decoder_input_ids, - "attention_mask": attention_mask, - "use_cache": use_cache, # change this to avoid caching (presumably for debugging) - } - - @staticmethod - def _reorder_cache(past, beam_idx): - if len(past) == 1: - return past - - past_key_values = past[1] - - reordered_past = () - for layer_past_key_values in past_key_values: - reordered_past += ( - tuple(tf.gather(layer_past_key_value, beam_idx) for layer_past_key_value in layer_past_key_values), - ) - return (past[0], reordered_past) - - def adjust_logits_during_generation(self, logits, cur_len, max_length): - if cur_len == 1 and self.config.force_bos_token_to_be_generated: - vocab_range = tf.constant(range(self.config.vocab_size)) - return tf.where(vocab_range != self.config.bos_token_id, LARGE_NEGATIVE, logits) - elif cur_len == max_length - 1: - vocab_range = tf.constant(range(self.config.vocab_size)) - return tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) - else: - return logits - - def get_output_embeddings(self): - return self.model.shared - - def get_encoder(self): - return self.model.encoder - - def compute_loss(self, labels, logits): - """CrossEntropyLoss that ignores pad tokens""" - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( - from_logits=True, - reduction=tf.keras.losses.Reduction.NONE, - ) - melted_labels = tf.reshape(labels, (-1,)) - active_loss = tf.not_equal(melted_labels, self.config.pad_token_id) - reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) - labels = tf.boolean_mask(melted_labels, active_loss) - return loss_fn(labels, reduced_logits) diff --git a/src/transformers/models/old_bart/tokenization_bart.py b/src/transformers/models/old_bart/tokenization_bart.py deleted file mode 100644 index 6b46e30e9d527c..00000000000000 --- a/src/transformers/models/old_bart/tokenization_bart.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
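Before the deleted tokenizer files, one note on the ``compute_loss`` helper removed above: it flattens labels and logits and drops every position whose label is the pad token before taking the cross-entropy. A minimal standalone sketch of that pad-masking loss (assuming ``logits`` of shape ``(batch, seq_len, vocab)`` and integer ``labels`` of shape ``(batch, seq_len)``; the function name is illustrative, not a library symbol)::

    import tensorflow as tf

    def masked_lm_loss(labels, logits, pad_token_id):
        # per-token cross-entropy, reduction deferred to the caller
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        flat_labels = tf.reshape(labels, (-1,))                       # (batch * seq_len,)
        flat_logits = tf.reshape(logits, (-1, tf.shape(logits)[-1]))  # (batch * seq_len, vocab)
        active = tf.not_equal(flat_labels, pad_token_id)              # ignore padded label positions
        return loss_fn(
            tf.boolean_mask(flat_labels, active),
            tf.boolean_mask(flat_logits, active),
        )

    # e.g. loss = tf.reduce_mean(masked_lm_loss(labels, logits, pad_token_id=1))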
- -from typing import List, Optional - -from transformers import add_start_docstrings - -from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding -from ...utils import logging -from ..roberta.tokenization_roberta import RobertaTokenizer - - -logger = logging.get_logger(__name__) - - -# vocab and merges same as roberta -vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" -merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" -_all_bart_models = [ - "facebook/bart-base", - "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "yjernite/bart_eli5", - # This is not exhaustive: see https://huggingface.co/models?filter=bart -] - - -class BartTokenizer(RobertaTokenizer): - r""" - Construct a BART tokenizer. - - :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new - :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` - - Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the - initialization parameters and other methods. - """ - # merges and vocab same as Roberta - max_model_input_sizes = {m: 1024 for m in _all_bart_models} - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_bart_models}, - "merges_file": {m: merges_url for m in _all_bart_models}, - } - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - padding: str = "longest", - return_tensors: str = None, - truncation=True, - **kwargs, - ) -> BatchEncoding: - kwargs.pop("src_lang", None) - kwargs.pop("tgt_lang", None) - if max_length is None: - max_length = self.model_max_length - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=truncation, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - return model_inputs diff --git a/src/transformers/models/old_bart/tokenization_bart_fast.py b/src/transformers/models/old_bart/tokenization_bart_fast.py deleted file mode 100644 index 30b77275f22169..00000000000000 --- a/src/transformers/models/old_bart/tokenization_bart_fast.py +++ /dev/null @@ -1,92 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
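Both the slow ``BartTokenizer`` deleted above and the fast variant deleted below expose ``prepare_seq2seq_batch``, which tokenizes source texts and, when target texts are given, stores their token ids under ``labels``. A hedged usage sketch (the example strings are arbitrary, and ``return_tensors="pt"`` assumes PyTorch is installed)::

    from transformers import BartTokenizer

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=["My friends are cool."],
        tgt_texts=["My friends are cool but they eat too many carbs."],
        max_length=1024,
        return_tensors="pt",
    )
    # batch["input_ids"] / batch["attention_mask"] encode src_texts,
    # batch["labels"] holds the tokenized tgt_texts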
- -from typing import List, Optional - -from transformers import add_start_docstrings - -from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding -from ...utils import logging -from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast -from .tokenization_bart import BartTokenizer - - -logger = logging.get_logger(__name__) - - -# vocab and merges same as roberta -vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json" -merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt" -tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json" -_all_bart_models = [ - "facebook/bart-base", - "facebook/bart-large", - "facebook/bart-large-mnli", - "facebook/bart-large-cnn", - "facebook/bart-large-xsum", - "yjernite/bart_eli5", - # This is not exhaustive: see https://huggingface.co/models?filter=bart -] - - -class BartTokenizerFast(RobertaTokenizerFast): - # merges and vocab same as Roberta - max_model_input_sizes = {m: 1024 for m in _all_bart_models} - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_bart_models}, - "merges_file": {m: merges_url for m in _all_bart_models}, - "tokenizer_file": {m: tokenizer_url for m in _all_bart_models}, - } - slow_tokenizer_class = BartTokenizer - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - padding: str = "longest", - return_tensors: Optional[str] = None, - truncation=True, - **kwargs, - ) -> BatchEncoding: - if max_length is None: - max_length = self.model_max_length - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=truncation, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - return model_inputs diff --git a/src/transformers/models/old_blenderbot/__init__.py b/src/transformers/models/old_blenderbot/__init__.py deleted file mode 100644 index fccb38f80ac145..00000000000000 --- a/src/transformers/models/old_blenderbot/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
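The deleted ``old_blenderbot/__init__.py`` that follows only pulls in the PyTorch and TensorFlow model classes when the corresponding framework is installed, while the config and tokenizer are imported unconditionally. A sketch of that guard pattern for a hypothetical model package (every ``mymodel`` name is a placeholder, not a library symbol)::

    from ...file_utils import is_tf_available, is_torch_available
    from .configuration_mymodel import MyModelConfig  # configs are framework-free

    if is_torch_available():
        from .modeling_mymodel import MyModelForConditionalGeneration

    if is_tf_available():
        from .modeling_tf_mymodel import TFMyModelForConditionalGeneration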
- -from ...file_utils import is_tf_available, is_torch_available -from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig -from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer - - -if is_torch_available(): - from .modeling_blenderbot import ( - BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST, - BlenderbotForConditionalGeneration, - BlenderbotModel, - ) - -if is_tf_available(): - from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration diff --git a/src/transformers/models/old_blenderbot/configuration_blenderbot.py b/src/transformers/models/old_blenderbot/configuration_blenderbot.py deleted file mode 100644 index b273ebb6ae01ac..00000000000000 --- a/src/transformers/models/old_blenderbot/configuration_blenderbot.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 -# Copyright (c) Facebook, Inc. and Huggingface, 2020 -# -# This source code is licensed under the MIT license found in the; -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# LICENSE file in the root directory of this source tree. -""" -BlenderbotConfig has the same signature as BartConfig. We only rewrite the signature in order to document -blenderbot-90M defaults. -""" -from ..bart.configuration_bart import BartConfig - - -BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/blenderbot-3B": "https://cdn.huggingface.co/facebook/blenderbot-3B/config.json", - "facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/config.json", -} - - -class BlenderbotConfig(BartConfig): - r""" - This is the configuration class to store the configuration of a - :class:`~transformers.BlenderbotForConditionalGeneration`. It inherits from :class:`~transformers.BartConfig` and - has the same signature with different defaults. - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. - - Args: - vocab_size (:obj:`int`, `optional`, defaults to 54944): - Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotForConditionalGeneration`. - d_model (:obj:`int`, `optional`, defaults to 512): - Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 8): - Number of encoder layers, 6 are used for the `blenderbot-90M` model. - decoder_layers (:obj:`int`, `optional`, defaults to 8): - Number of decoder layers, 6 are used for the `blenderbot-90M` model. - encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): - Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): - Number of attention heads for each attention layer in the Transformer decoder. 
- decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): - Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): - Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. - dropout (:obj:`float`, `optional`, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for the attention probabilities. - activation_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for activations inside the fully connected layer. - classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for classifier. - max_position_embeddings (:obj:`int`, `optional`, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - init_std (:obj:`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm after embeddings. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): - Why not add another layernorm? - do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`True`): - Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder. - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. - encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. - decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`. - is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model. 
- force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), - """ - model_type = "blenderbot" - - def __init__( - self, - activation_dropout=0.0, - extra_pos_embeddings=0, - activation_function="gelu", - vocab_size=54944, - d_model=512, - encoder_ffn_dim=2048, - encoder_layers=8, - encoder_attention_heads=16, - decoder_ffn_dim=2048, - decoder_layers=8, - decoder_attention_heads=16, - encoder_layerdrop=0.0, - decoder_layerdrop=0.0, - attention_dropout=0.0, - dropout=0.1, - max_position_embeddings=512, - classifier_dropout=0.0, - is_encoder_decoder=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - normalize_before=False, - add_final_layer_norm=False, - do_blenderbot_90_layernorm=True, - scale_embedding=False, - normalize_embedding=True, - static_position_embeddings=False, - add_bias_logits=False, - force_bos_token_to_be_generated=False, - **common_kwargs - ): - r""" - Examples:: - - >>> from transformers import BlenderbotConfig - >>> config = BlenderbotConfig.from_pretrained('facebook/blenderbot-90M') - - """ - if "hidden_size" in common_kwargs: - raise ValueError("hidden size is called d_model") - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - is_encoder_decoder=is_encoder_decoder, - vocab_size=vocab_size, - d_model=d_model, - encoder_ffn_dim=encoder_ffn_dim, - encoder_layers=encoder_layers, - encoder_layerdrop=encoder_layerdrop, - encoder_attention_heads=encoder_attention_heads, - decoder_layerdrop=decoder_layerdrop, - decoder_ffn_dim=decoder_ffn_dim, - decoder_layers=decoder_layers, - normalize_before=normalize_before, - normalize_embedding=normalize_embedding, - static_position_embeddings=static_position_embeddings, - add_bias_logits=add_bias_logits, - force_bos_token_to_be_generated=force_bos_token_to_be_generated, - do_blenderbot_90_layernorm=do_blenderbot_90_layernorm, - add_final_layer_norm=add_final_layer_norm, - scale_embedding=scale_embedding, - attention_dropout=attention_dropout, - dropout=dropout, - classifier_dropout=classifier_dropout, - activation_dropout=activation_dropout, - max_position_embeddings=max_position_embeddings, - extra_pos_embeddings=extra_pos_embeddings, - activation_function=activation_function, - decoder_attention_heads=decoder_attention_heads, - **common_kwargs, - ) diff --git a/src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index d31cf67c1e3f6c..00000000000000 --- a/src/transformers/models/old_blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,114 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
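Before the conversion script, a short illustration of the configuration contract above: ``BlenderbotConfig`` keeps the ``BartConfig`` signature, only swapping in blenderbot-90M defaults, and it deliberately rejects ``hidden_size`` because the model dimension is called ``d_model``. A sketch against a transformers version that still ships this class::

    from transformers import BlenderbotConfig

    config = BlenderbotConfig.from_pretrained("facebook/blenderbot-90M")

    try:
        BlenderbotConfig(hidden_size=512)   # wrong keyword on purpose
    except ValueError as err:
        print(err)                          # "hidden size is called d_model"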
-"""Convert Blenderbot checkpoint.""" - -import argparse - -import torch - -from transformers import BartConfig, BartForConditionalGeneration -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -PATTERNS = [ - ["attention", "attn"], - ["encoder_attention", "encoder_attn"], - ["q_lin", "q_proj"], - ["k_lin", "k_proj"], - ["v_lin", "v_proj"], - ["out_lin", "out_proj"], - ["norm_embeddings", "layernorm_embedding"], - ["position_embeddings", "embed_positions"], - ["embeddings", "embed_tokens"], - ["ffn.lin", "fc"], -] - - -def rename_state_dict_key(k): - if k == "embeddings.weight": - return "shared.weight" - - for parlai_name, hf_name in PATTERNS: - k = k.replace(parlai_name, hf_name) - - if k.startswith("encoder"): - k = k.replace(".attn", ".self_attn") - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "final_layer_norm") - elif k.startswith("decoder"): - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "encoder_attn_layer_norm") - k = k.replace("norm3", "final_layer_norm") - return k - - -def rename_layernorm_keys(sd): - keys = [ - "model.encoder.layernorm_embedding.weight", - "model.encoder.layernorm_embedding.bias", - "model.decoder.layernorm_embedding.weight", - "model.decoder.layernorm_embedding.bias", - ] - for k in keys: - v = sd.pop(k) - new_k = k.replace("layernorm_embedding", "layer_norm") - assert new_k not in sd - sd[new_k] = v - - -IGNORE_KEYS = ["START"] - - -@torch.no_grad() -def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - model = torch.load(checkpoint_path, map_location="cpu") - sd = model["model"] - cfg = BartConfig.from_json_file(config_json_path) - m = BartForConditionalGeneration(cfg) - valid_keys = m.model.state_dict().keys() - failures = [] - mapping = {} - for k, v in sd.items(): - if k in IGNORE_KEYS: - continue - - new_k = rename_state_dict_key(k) - if new_k not in valid_keys: - failures.append([k, new_k]) - else: - mapping[new_k] = v - if cfg.normalize_before: # Blenderbot-3B checkpoints. Rename layernorm_embedding -> layer_norm - rename_layernorm_keys(sd) - m.model.load_state_dict(mapping, strict=True) - m.half() - m.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin") - parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.") - parser.add_argument( - "--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use" - ) - args = parser.parse_args() - convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json) diff --git a/src/transformers/models/old_blenderbot/modeling_blenderbot.py b/src/transformers/models/old_blenderbot/modeling_blenderbot.py deleted file mode 100644 index 2a370fbabf8624..00000000000000 --- a/src/transformers/models/old_blenderbot/modeling_blenderbot.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the; -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# LICENSE file in the root directory of this source tree. -""""BlenderbotForConditionalGeneration which inherits from BART""" - -import torch - -from ...file_utils import add_start_docstrings -from ..bart.modeling_bart import BartForConditionalGeneration, BartModel -from .configuration_blenderbot import BlenderbotConfig - - -BLENDER_START_DOCSTRING = r""" - - This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch `torch.nn.Module `__ - subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to - general usage and behavior. - -""" - -BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = ["facebook/blenderbot-3B", "facebook/blenderbot-90M"] - - -@add_start_docstrings( - "The bare BlenderBot Model transformer outputting raw hidden-states without any specific head on top.", - BLENDER_START_DOCSTRING, -) -class BlenderbotModel(BartModel): - r""" - This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = BlenderbotConfig - - -@add_start_docstrings( - "The BlenderBot Model with a language modeling head. Can be used for summarization.", BLENDER_START_DOCSTRING -) -class BlenderbotForConditionalGeneration(BartForConditionalGeneration): - """ - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. - """ - - config_class = BlenderbotConfig - - def adjust_logits_during_generation(self, logits, cur_len, max_length): - logits[:, self.config.bos_token_id] = -torch.finfo(torch.float16).max # near infinity fp16 - if cur_len == max_length - 1 and self.config.eos_token_id is not None: - self._force_token_id_to_be_generated(logits, self.config.eos_token_id) - return logits diff --git a/src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py deleted file mode 100644 index ba51e87a1c544f..00000000000000 --- a/src/transformers/models/old_blenderbot/modeling_tf_blenderbot.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
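The PyTorch ``BlenderbotForConditionalGeneration`` above and the TF class that follows both steer decoding through ``adjust_logits_during_generation``: a token that must never appear (BOS in the PyTorch version, pad in the TF version) is pushed to a huge negative score, and on the last step every token except EOS is suppressed so sequences terminate. A PyTorch sketch of that masking, with an illustrative ``NEG_INF`` standing in for the ``-torch.finfo(torch.float16).max`` / ``LARGE_NEGATIVE`` constants used in the deleted code::

    import torch

    NEG_INF = -1e9  # illustrative stand-in, not the library constant

    def adjust_logits(logits, cur_len, max_length, bos_token_id, eos_token_id):
        logits = logits.clone()
        logits[:, bos_token_id] = NEG_INF              # never emit BOS
        if cur_len == max_length - 1 and eos_token_id is not None:
            keep_eos = torch.full_like(logits, NEG_INF)
            keep_eos[:, eos_token_id] = 0.0
            logits = logits + keep_eos                 # only EOS stays viable on the final step
        return logits

    scores = adjust_logits(torch.zeros(2, 100), cur_len=127, max_length=128,
                           bos_token_id=0, eos_token_id=2)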
-"""TF BlenderBot model, ported from the fairseq repo.""" - -import tensorflow as tf - -from ...file_utils import add_start_docstrings -from ...utils import logging -from ..bart.modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration -from .configuration_blenderbot import BlenderbotConfig - - -_CONFIG_FOR_DOC = "BlenderbotConfig" - -START_DOCSTRING = BART_START_DOCSTRING.replace( - "inherits from :class:`~transformers.TFPreTrainedModel`", - "inherits from :class:`~transformers.TFBartForConditionalGeneration`", -).replace("BartConfig", _CONFIG_FOR_DOC) - - -logger = logging.get_logger(__name__) - - -@add_start_docstrings("Blenderbot model for open domain dialogue", START_DOCSTRING) -class TFBlenderbotForConditionalGeneration(TFBartForConditionalGeneration): - config_class = BlenderbotConfig - - def adjust_logits_during_generation(self, logits, cur_len, max_length): - """Never predict pad_token_id. Predict when max_length is reached.""" - vocab_range = tf.constant(range(self.config.vocab_size)) - logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits) - if cur_len == max_length - 1: - logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) - return logits diff --git a/src/transformers/models/old_blenderbot/tokenization_blenderbot.py b/src/transformers/models/old_blenderbot/tokenization_blenderbot.py deleted file mode 100644 index bf96a63d04a4da..00000000000000 --- a/src/transformers/models/old_blenderbot/tokenization_blenderbot.py +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the; -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# LICENSE file in the root directory of this source tree. -""""BlenderbotTokenizer and BlenderbotSmallTokenizer""" -import json -import os -from typing import Dict, List, Optional, Tuple - -import regex as re - -from ...tokenization_utils import PreTrainedTokenizer -from ...utils import logging -from ..roberta.tokenization_roberta import RobertaTokenizer - - -logger = logging.get_logger(__name__) - - -VOCAB_FILES_NAMES = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", - # "tokenizer_config_file": "tokenizer_config.json", -} -CKPT_3B = "facebook/blenderbot-3B" - - -class BlenderbotTokenizer(RobertaTokenizer): - r""" - Construct a Blenderbot tokenizer. - - :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs - end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token - to the beginning of sequences. - - Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning - parameters. 
- """ - vocab_files_names = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", - "tokenizer_config_file": "tokenizer_config.json", - } - pretrained_vocab_files_map = { - "vocab_file": {CKPT_3B: "https://cdn.huggingface.co/facebook/blenderbot-3B/vocab.json"}, - "merges_file": {CKPT_3B: "https://cdn.huggingface.co/facebook/blenderbot-3B/merges.txt"}, - "tokenizer_config_file": {CKPT_3B: "https://cdn.huggingface.co/facebook/blenderbot-3B/tokenizer_config.json"}, - } - max_model_input_sizes = {"facebook/blenderbot-3B": 128} - - def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A Blenderbot sequence has the following format: - - - single sequence: `` X `` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added - token_ids_1 (:obj:`List[int]`, `optional`): - Will be ignored - - Returns: - :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - return token_ids_0 + [self.eos_token_id] - - -def get_pairs(word): - """ - Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - - pairs = set(pairs) - return pairs - - -class BlenderbotSmallTokenizer(PreTrainedTokenizer): - """ - Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding) - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. - Users should refer to the superclass for more information regarding methods. - - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - merges_file (:obj:`str`): - Path to the merges file. - bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`): - The beginning of sentence token. - eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`): - The end of sentence token. - unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`): - The token used for padding, for example when batching sequences of different lengths. 
- **kwargs - Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` - """ - - vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} - pretrained_vocab_files_map = { - "vocab_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/vocab.json"}, - "merges_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/merges.txt"}, - } - max_model_input_sizes = {"facebook/blenderbot-90M": 512} - - def __init__( - self, - vocab_file, - merges_file, - bos_token="__start__", - eos_token="__end__", - unk_token="__unk__", - pad_token="__null__", - **kwargs - ): - super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) - - with open(vocab_file, encoding="utf-8") as vocab_handle: - self.encoder = json.load(vocab_handle) - self.decoder = {v: k for k, v in self.encoder.items()} - with open(merges_file, encoding="utf-8") as merges_handle: - merges = merges_handle.read().split("\n")[1:-1] - merges = [tuple(merge.split()) for merge in merges] - self.bpe_ranks = dict(zip(merges, range(len(merges)))) - self.cache = {} - - @property - def vocab_size(self) -> int: - return len(self.encoder) - - def get_vocab(self) -> Dict: - return dict(self.encoder, **self.added_tokens_encoder) - - def bpe(self, token: str) -> str: - if token in self.cache: - return self.cache[token] - token = re.sub("([.,!?()])", r" \1", token) - token = re.sub("(')", r" \1 ", token) - token = re.sub(r"\s{2,}", " ", token) - if "\n" in token: - token = token.replace("\n", " __newln__") - - tokens = token.split(" ") - words = [] - for token in tokens: - if not len(token): - continue - - token = token.lower() - word = tuple(token) - word = tuple(list(word[:-1]) + [word[-1] + ""]) - pairs = get_pairs(word) - - if not pairs: - words.append(token) - continue - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except ValueError: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = "@@ ".join(word) - word = word[:-4] - - self.cache[token] = word - words.append(word) - return " ".join(words) - - def _tokenize(self, text: str) -> List[str]: - """ Split a string into tokens using BPE.""" - split_tokens = [] - - words = re.findall(r"\S+\n?", text) - - for token in words: - split_tokens.extend([t for t in self.bpe(token).split(" ")]) - return split_tokens - - def _convert_token_to_id(self, token: str) -> int: - """ Converts a token to an id using the vocab. """ - token = token.lower() - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _convert_id_to_token(self, index: int) -> str: - """Converts an index (integer) in a token (str) using the vocab.""" - return self.decoder.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens: List[str]) -> str: - """ Converts a sequence of tokens in a single string. 
""" - out_string = " ".join(tokens).replace("@@ ", "").strip() - return out_string - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - merge_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] - ) - - with open(vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(self.encoder, ensure_ascii=False)) - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write("#version: 0.2\n") - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - "Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file) - ) - index = token_index - writer.write(" ".join(bpe_tokens) + "\n") - index += 1 - - return vocab_file, merge_file diff --git a/src/transformers/models/old_marian/__init__.py b/src/transformers/models/old_marian/__init__.py deleted file mode 100644 index bf7be4bd92579c..00000000000000 --- a/src/transformers/models/old_marian/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ...file_utils import is_sentencepiece_available, is_tf_available, is_torch_available -from .configuration_marian import MarianConfig - - -if is_sentencepiece_available(): - from .tokenization_marian import MarianTokenizer - -if is_torch_available(): - from .modeling_marian import MarianMTModel - -if is_tf_available(): - from .modeling_tf_marian import TFMarianMTModel diff --git a/src/transformers/models/old_marian/configuration_marian.py b/src/transformers/models/old_marian/configuration_marian.py deleted file mode 100644 index a17531bb2f4d8f..00000000000000 --- a/src/transformers/models/old_marian/configuration_marian.py +++ /dev/null @@ -1,100 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" Marian model configuration """ - -from ..bart.configuration_bart import BartConfig - - -PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/config.json", -} - - -class MarianConfig(BartConfig): - """ - This is the configuration class to store the configuration of a :class:`~transformers.MarianMTModel`. It is used to - instantiate a Marian model according to the specified arguments, defining the model architecture. - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. - - Args: - vocab_size (:obj:`int`, `optional`, defaults to 58101): - Vocabulary size of the Marian model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.MarianMTModel`. - d_model (:obj:`int`, `optional`, defaults to 512): - Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 6): - Number of encoder layers. - decoder_layers (:obj:`int`, `optional`, defaults to 6): - Number of decoder layers. - encoder_attention_heads (:obj:`int`, `optional`, defaults to 8): - Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (:obj:`int`, `optional`, defaults to 8): - Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. - encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. - activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. - dropout (:obj:`float`, `optional`, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for the attention probabilities. - activation_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for activations inside the fully connected layer. - classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for classifier. - max_position_embeddings (:obj:`int`, `optional`, defaults to 512): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - init_std (:obj:`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm after embeddings. 
- static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): - Why not add another layernorm? - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. - encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. - decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. - is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model - force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). - """ - - model_type = "marian" - keys_to_ignore_at_inference = ["past_key_values"] diff --git a/src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py b/src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py deleted file mode 100644 index 0ab653e9a23a0b..00000000000000 --- a/src/transformers/models/old_marian/convert_marian_tatoeba_to_pytorch.py +++ /dev/null @@ -1,1268 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pathlib import Path -from typing import List, Tuple - -from transformers.models.marian.convert_marian_to_pytorch import ( - FRONT_MATTER_TEMPLATE, - _parse_readme, - convert_all_sentencepiece_models, - get_system_metadata, - remove_prefix, - remove_suffix, -) - - -try: - import pandas as pd -except ImportError: - pass - -DEFAULT_REPO = "Tatoeba-Challenge" -DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models") -LANG_CODE_URL = "https://datahub.io/core/language-codes/r/language-codes-3b2.csv" -ISO_URL = "https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv" -ISO_PATH = "lang_code_data/iso-639-3.csv" -LANG_CODE_PATH = "lang_code_data/language-codes-3b2.csv" - - -class TatoebaConverter: - """ - Convert Tatoeba-Challenge models to huggingface format. - - Steps: - - 1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion). - 2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique - one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en - 3. 
write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group - members. - """ - - def __init__(self, save_dir="marian_converted"): - assert Path(DEFAULT_REPO).exists(), "need git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git" - reg = self.make_tatoeba_registry() - self.download_metadata() - self.registry = reg - reg_df = pd.DataFrame(reg, columns=["id", "prepro", "url_model", "url_test_set"]) - assert reg_df.id.value_counts().max() == 1 - reg_df = reg_df.set_index("id") - reg_df["src"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[0]).values - reg_df["tgt"] = reg_df.reset_index().id.apply(lambda x: x.split("-")[1]).values - - released_cols = [ - "url_base", - "pair", # (ISO639-3/ISO639-5 codes), - "short_pair", # (reduced codes), - "chrF2_score", - "bleu", - "brevity_penalty", - "ref_len", - "src_name", - "tgt_name", - ] - - released = pd.read_csv("Tatoeba-Challenge/models/released-models.txt", sep="\t", header=None).iloc[:-1] - released.columns = released_cols - released["fname"] = released["url_base"].apply( - lambda x: remove_suffix(remove_prefix(x, "https://object.pouta.csc.fi/Tatoeba-Challenge/opus"), ".zip") - ) - - released["2m"] = released.fname.str.startswith("2m") - released["date"] = pd.to_datetime( - released["fname"].apply(lambda x: remove_prefix(remove_prefix(x, "2m-"), "-")) - ) - - released["base_ext"] = released.url_base.apply(lambda x: Path(x).name) - reg_df["base_ext"] = reg_df.url_model.apply(lambda x: Path(x).name) - - metadata_new = reg_df.reset_index().merge(released.rename(columns={"pair": "id"}), on=["base_ext", "id"]) - - metadata_renamer = {"src": "src_alpha3", "tgt": "tgt_alpha3", "id": "long_pair", "date": "train_date"} - metadata_new = metadata_new.rename(columns=metadata_renamer) - - metadata_new["src_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[0]) - metadata_new["tgt_alpha2"] = metadata_new.short_pair.apply(lambda x: x.split("-")[1]) - DROP_COLS_BOTH = ["url_base", "base_ext", "fname"] - - metadata_new = metadata_new.drop(DROP_COLS_BOTH, 1) - metadata_new["prefer_old"] = metadata_new.long_pair.isin([]) - self.metadata = metadata_new - assert self.metadata.short_pair.value_counts().max() == 1, "Multiple metadata entries for a short pair" - self.metadata = self.metadata.set_index("short_pair") - - # wget.download(LANG_CODE_URL) - mapper = pd.read_csv(LANG_CODE_PATH) - mapper.columns = ["a3", "a2", "ref"] - self.iso_table = pd.read_csv(ISO_PATH, sep="\t").rename(columns=lambda x: x.lower()) - more_3_to_2 = self.iso_table.set_index("id").part1.dropna().to_dict() - more_3_to_2.update(mapper.set_index("a3").a2.to_dict()) - self.alpha3_to_alpha2 = more_3_to_2 - self.model_card_dir = Path(save_dir) - self.constituents = GROUP_MEMBERS - - def convert_models(self, tatoeba_ids, dry_run=False): - entries_to_convert = [x for x in self.registry if x[0] in tatoeba_ids] - converted_paths = convert_all_sentencepiece_models(entries_to_convert, dest_dir=self.model_card_dir) - - for path in converted_paths: - long_pair = remove_prefix(path.name, "opus-mt-").split("-") # eg. 
heb-eng - assert len(long_pair) == 2 - new_p_src = self.get_two_letter_code(long_pair[0]) - new_p_tgt = self.get_two_letter_code(long_pair[1]) - hf_model_id = f"opus-mt-{new_p_src}-{new_p_tgt}" - new_path = path.parent.joinpath(hf_model_id) # opus-mt-he-en - os.rename(str(path), str(new_path)) - self.write_model_card(hf_model_id, dry_run=dry_run) - - def get_two_letter_code(self, three_letter_code): - return self.alpha3_to_alpha2.get(three_letter_code, three_letter_code) - - def expand_group_to_two_letter_codes(self, grp_name): - return [self.get_two_letter_code(x) for x in self.constituents[grp_name]] - - def get_tags(self, code, ref_name): - if len(code) == 2: - assert "languages" not in ref_name, f"{code}: {ref_name}" - return [code], False - elif "languages" in ref_name or len(self.constituents.get(code, [])) > 1: - group = self.expand_group_to_two_letter_codes(code) - group.append(code) - return group, True - else: # zho-> zh - print(f"Three letter monolingual code: {code}") - return [code], False - - def resolve_lang_code(self, r) -> Tuple[List[str], str, str]: - """R is a row in ported""" - short_pair = r.short_pair - src, tgt = short_pair.split("-") - src_tags, src_multilingual = self.get_tags(src, r.src_name) - assert isinstance(src_tags, list) - tgt_tags, tgt_multilingual = self.get_tags(tgt, r.tgt_name) - assert isinstance(tgt_tags, list) - - return dedup(src_tags + tgt_tags), src_multilingual, tgt_multilingual - - def write_model_card( - self, - hf_model_id: str, - repo_root=DEFAULT_REPO, - dry_run=False, - ) -> str: - """ - Copy the most recent model's readme section from opus, and add metadata. upload command: aws s3 sync - model_card_dir s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun - """ - short_pair = remove_prefix(hf_model_id, "opus-mt-") - extra_metadata = self.metadata.loc[short_pair].drop("2m") - extra_metadata["short_pair"] = short_pair - lang_tags, src_multilingual, tgt_multilingual = self.resolve_lang_code(extra_metadata) - opus_name = f"{extra_metadata.src_alpha3}-{extra_metadata.tgt_alpha3}" - # opus_name: str = self.convert_hf_name_to_opus_name(hf_model_name) - - assert repo_root in ("OPUS-MT-train", "Tatoeba-Challenge") - opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md") - assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found" - - opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")] - - readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md" - - s, t = ",".join(opus_src), ",".join(opus_tgt) - - metadata = { - "hf_name": short_pair, - "source_languages": s, - "target_languages": t, - "opus_readme_url": readme_url, - "original_repo": repo_root, - "tags": ["translation"], - "languages": lang_tags, - } - lang_tags = l2front_matter(lang_tags) - metadata["src_constituents"] = self.constituents[s] - metadata["tgt_constituents"] = self.constituents[t] - metadata["src_multilingual"] = src_multilingual - metadata["tgt_multilingual"] = tgt_multilingual - - metadata.update(extra_metadata) - metadata.update(get_system_metadata(repo_root)) - - # combine with Tatoeba markdown - - extra_markdown = f"### {short_pair}\n\n* source group: {metadata['src_name']} \n* target group: {metadata['tgt_name']} \n* OPUS readme: [{opus_name}]({readme_url})\n" - - content = opus_readme_path.open().read() - content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model. 
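        # The slicing above/below works on the raw readme text (for a readme whose bullets start
        # with "*"): .split("\n# ")[-1] keeps only the last level-1 section, i.e. the most recent
        # model, and the .split("*")[2:] that follows drops that section's title block together
        # with its first bullet before the remaining bullets are re-joined with "*".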
- splat = content.split("*")[2:] - - content = "*".join(splat) - # BETTER FRONT MATTER LOGIC - - content = ( - FRONT_MATTER_TEMPLATE.format(lang_tags) - + extra_markdown - + "\n* " - + content.replace("download", "download original " "weights") - ) - - items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()]) - sec3 = "\n### System Info: \n" + items - content += sec3 - if dry_run: - return content, metadata - sub_dir = self.model_card_dir / hf_model_id - sub_dir.mkdir(exist_ok=True) - dest = sub_dir / "README.md" - dest.open("w").write(content) - pd.Series(metadata).to_json(sub_dir / "metadata.json") - return content, metadata - - def download_metadata(self): - Path(LANG_CODE_PATH).parent.mkdir(exist_ok=True) - import wget - - if not os.path.exists(ISO_PATH): - wget.download(ISO_URL, ISO_PATH) - if not os.path.exists(LANG_CODE_PATH): - wget.download(LANG_CODE_URL, LANG_CODE_PATH) - - @staticmethod - def make_tatoeba_registry(repo_path=DEFAULT_MODEL_DIR): - if not (Path(repo_path) / "zho-eng" / "README.md").exists(): - raise ValueError( - f"repo_path:{repo_path} does not exist: " - "You must run: git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git before calling." - ) - results = {} - for p in Path(repo_path).iterdir(): - if len(p.name) != 7: - continue - lns = list(open(p / "README.md").readlines()) - results[p.name] = _parse_readme(lns) - return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()] - - -GROUP_MEMBERS = { - # three letter code -> (group/language name, {constituents...} - # if this language is on the target side the constituents can be used as target language codes. - # if the language is on the source side they are supported natively without special codes. - "aav": ("Austro-Asiatic languages", {"hoc", "hoc_Latn", "kha", "khm", "khm_Latn", "mnw", "vie", "vie_Hani"}), - "afa": ( - "Afro-Asiatic languages", - { - "acm", - "afb", - "amh", - "apc", - "ara", - "arq", - "ary", - "arz", - "hau_Latn", - "heb", - "kab", - "mlt", - "rif_Latn", - "shy_Latn", - "som", - "thv", - "tir", - }, - ), - "afr": ("Afrikaans", {"afr"}), - "alv": ( - "Atlantic-Congo languages", - { - "ewe", - "fuc", - "fuv", - "ibo", - "kin", - "lin", - "lug", - "nya", - "run", - "sag", - "sna", - "swh", - "toi_Latn", - "tso", - "umb", - "wol", - "xho", - "yor", - "zul", - }, - ), - "ara": ("Arabic", {"afb", "apc", "apc_Latn", "ara", "ara_Latn", "arq", "arq_Latn", "arz"}), - "art": ( - "Artificial languages", - { - "afh_Latn", - "avk_Latn", - "dws_Latn", - "epo", - "ido", - "ido_Latn", - "ile_Latn", - "ina_Latn", - "jbo", - "jbo_Cyrl", - "jbo_Latn", - "ldn_Latn", - "lfn_Cyrl", - "lfn_Latn", - "nov_Latn", - "qya", - "qya_Latn", - "sjn_Latn", - "tlh_Latn", - "tzl", - "tzl_Latn", - "vol_Latn", - }, - ), - "aze": ("Azerbaijani", {"aze_Latn"}), - "bat": ("Baltic languages", {"lit", "lav", "prg_Latn", "ltg", "sgs"}), - "bel": ("Belarusian", {"bel", "bel_Latn"}), - "ben": ("Bengali", {"ben"}), - "bnt": ( - "Bantu languages", - {"kin", "lin", "lug", "nya", "run", "sna", "swh", "toi_Latn", "tso", "umb", "xho", "zul"}, - ), - "bul": ("Bulgarian", {"bul", "bul_Latn"}), - "cat": ("Catalan", {"cat"}), - "cau": ("Caucasian languages", {"abk", "kat", "che", "ady"}), - "ccs": ("South Caucasian languages", {"kat"}), - "ceb": ("Cebuano", {"ceb"}), - "cel": ("Celtic languages", {"gla", "gle", "bre", "cor", "glv", "cym"}), - "ces": ("Czech", {"ces"}), - "cpf": ("Creoles and pidgins, French‑based", {"gcf_Latn", "hat", "mfe"}), - "cpp": ( - "Creoles and pidgins, 
Portuguese-based", - {"zsm_Latn", "ind", "pap", "min", "tmw_Latn", "max_Latn", "zlm_Latn"}, - ), - "cus": ("Cushitic languages", {"som"}), - "dan": ("Danish", {"dan"}), - "deu": ("German", {"deu"}), - "dra": ("Dravidian languages", {"tam", "kan", "mal", "tel"}), - "ell": ("Modern Greek (1453-)", {"ell"}), - "eng": ("English", {"eng"}), - "epo": ("Esperanto", {"epo"}), - "est": ("Estonian", {"est"}), - "euq": ("Basque (family)", {"eus"}), - "eus": ("Basque", {"eus"}), - "fin": ("Finnish", {"fin"}), - "fiu": ( - "Finno-Ugrian languages", - { - "est", - "fin", - "fkv_Latn", - "hun", - "izh", - "kpv", - "krl", - "liv_Latn", - "mdf", - "mhr", - "myv", - "sma", - "sme", - "udm", - "vep", - "vro", - }, - ), - "fra": ("French", {"fra"}), - "gem": ( - "Germanic languages", - { - "afr", - "ang_Latn", - "dan", - "deu", - "eng", - "enm_Latn", - "fao", - "frr", - "fry", - "gos", - "got_Goth", - "gsw", - "isl", - "ksh", - "ltz", - "nds", - "nld", - "nno", - "nob", - "nob_Hebr", - "non_Latn", - "pdc", - "sco", - "stq", - "swe", - "swg", - "yid", - }, - ), - "gle": ("Irish", {"gle"}), - "glg": ("Galician", {"glg"}), - "gmq": ("North Germanic languages", {"dan", "nob", "nob_Hebr", "swe", "isl", "nno", "non_Latn", "fao"}), - "gmw": ( - "West Germanic languages", - { - "afr", - "ang_Latn", - "deu", - "eng", - "enm_Latn", - "frr", - "fry", - "gos", - "gsw", - "ksh", - "ltz", - "nds", - "nld", - "pdc", - "sco", - "stq", - "swg", - "yid", - }, - ), - "grk": ("Greek languages", {"grc_Grek", "ell"}), - "hbs": ("Serbo-Croatian", {"hrv", "srp_Cyrl", "bos_Latn", "srp_Latn"}), - "heb": ("Hebrew", {"heb"}), - "hin": ("Hindi", {"hin"}), - "hun": ("Hungarian", {"hun"}), - "hye": ("Armenian", {"hye", "hye_Latn"}), - "iir": ( - "Indo-Iranian languages", - { - "asm", - "awa", - "ben", - "bho", - "gom", - "guj", - "hif_Latn", - "hin", - "jdt_Cyrl", - "kur_Arab", - "kur_Latn", - "mai", - "mar", - "npi", - "ori", - "oss", - "pan_Guru", - "pes", - "pes_Latn", - "pes_Thaa", - "pnb", - "pus", - "rom", - "san_Deva", - "sin", - "snd_Arab", - "tgk_Cyrl", - "tly_Latn", - "urd", - "zza", - }, - ), - "ilo": ("Iloko", {"ilo"}), - "inc": ( - "Indic languages", - { - "asm", - "awa", - "ben", - "bho", - "gom", - "guj", - "hif_Latn", - "hin", - "mai", - "mar", - "npi", - "ori", - "pan_Guru", - "pnb", - "rom", - "san_Deva", - "sin", - "snd_Arab", - "urd", - }, - ), - "ine": ( - "Indo-European languages", - { - "afr", - "afr_Arab", - "aln", - "ang_Latn", - "arg", - "asm", - "ast", - "awa", - "bel", - "bel_Latn", - "ben", - "bho", - "bjn", - "bos_Latn", - "bre", - "bul", - "bul_Latn", - "cat", - "ces", - "cor", - "cos", - "csb_Latn", - "cym", - "dan", - "deu", - "dsb", - "egl", - "ell", - "eng", - "enm_Latn", - "ext", - "fao", - "fra", - "frm_Latn", - "frr", - "fry", - "gcf_Latn", - "gla", - "gle", - "glg", - "glv", - "gom", - "gos", - "got_Goth", - "grc_Grek", - "gsw", - "guj", - "hat", - "hif_Latn", - "hin", - "hrv", - "hsb", - "hye", - "hye_Latn", - "ind", - "isl", - "ita", - "jdt_Cyrl", - "ksh", - "kur_Arab", - "kur_Latn", - "lad", - "lad_Latn", - "lat_Grek", - "lat_Latn", - "lav", - "lij", - "lit", - "lld_Latn", - "lmo", - "ltg", - "ltz", - "mai", - "mar", - "max_Latn", - "mfe", - "min", - "mkd", - "mwl", - "nds", - "nld", - "nno", - "nob", - "nob_Hebr", - "non_Latn", - "npi", - "oci", - "ori", - "orv_Cyrl", - "oss", - "pan_Guru", - "pap", - "pcd", - "pdc", - "pes", - "pes_Latn", - "pes_Thaa", - "pms", - "pnb", - "pol", - "por", - "prg_Latn", - "pus", - "roh", - "rom", - "ron", - "rue", - "rus", - "rus_Latn", - "san_Deva", - "scn", - 
"sco", - "sgs", - "sin", - "slv", - "snd_Arab", - "spa", - "sqi", - "srd", - "srp_Cyrl", - "srp_Latn", - "stq", - "swe", - "swg", - "tgk_Cyrl", - "tly_Latn", - "tmw_Latn", - "ukr", - "urd", - "vec", - "wln", - "yid", - "zlm_Latn", - "zsm_Latn", - "zza", - }, - ), - "isl": ("Icelandic", {"isl"}), - "ita": ("Italian", {"ita"}), - "itc": ( - "Italic languages", - { - "arg", - "ast", - "bjn", - "cat", - "cos", - "egl", - "ext", - "fra", - "frm_Latn", - "gcf_Latn", - "glg", - "hat", - "ind", - "ita", - "lad", - "lad_Latn", - "lat_Grek", - "lat_Latn", - "lij", - "lld_Latn", - "lmo", - "max_Latn", - "mfe", - "min", - "mwl", - "oci", - "pap", - "pcd", - "pms", - "por", - "roh", - "ron", - "scn", - "spa", - "srd", - "tmw_Latn", - "vec", - "wln", - "zlm_Latn", - "zsm_Latn", - }, - ), - "jpn": ("Japanese", {"jpn", "jpn_Bopo", "jpn_Hang", "jpn_Hani", "jpn_Hira", "jpn_Kana", "jpn_Latn", "jpn_Yiii"}), - "jpx": ("Japanese (family)", {"jpn"}), - "kat": ("Georgian", {"kat"}), - "kor": ("Korean", {"kor_Hani", "kor_Hang", "kor_Latn", "kor"}), - "lav": ("Latvian", {"lav"}), - "lit": ("Lithuanian", {"lit"}), - "mkd": ("Macedonian", {"mkd"}), - "mkh": ("Mon-Khmer languages", {"vie_Hani", "mnw", "vie", "kha", "khm_Latn", "khm"}), - "msa": ("Malay (macrolanguage)", {"zsm_Latn", "ind", "max_Latn", "zlm_Latn", "min"}), - "mul": ( - "Multiple languages", - { - "abk", - "acm", - "ady", - "afb", - "afh_Latn", - "afr", - "akl_Latn", - "aln", - "amh", - "ang_Latn", - "apc", - "ara", - "arg", - "arq", - "ary", - "arz", - "asm", - "ast", - "avk_Latn", - "awa", - "aze_Latn", - "bak", - "bam_Latn", - "bel", - "bel_Latn", - "ben", - "bho", - "bod", - "bos_Latn", - "bre", - "brx", - "brx_Latn", - "bul", - "bul_Latn", - "cat", - "ceb", - "ces", - "cha", - "che", - "chr", - "chv", - "cjy_Hans", - "cjy_Hant", - "cmn", - "cmn_Hans", - "cmn_Hant", - "cor", - "cos", - "crh", - "crh_Latn", - "csb_Latn", - "cym", - "dan", - "deu", - "dsb", - "dtp", - "dws_Latn", - "egl", - "ell", - "enm_Latn", - "epo", - "est", - "eus", - "ewe", - "ext", - "fao", - "fij", - "fin", - "fkv_Latn", - "fra", - "frm_Latn", - "frr", - "fry", - "fuc", - "fuv", - "gan", - "gcf_Latn", - "gil", - "gla", - "gle", - "glg", - "glv", - "gom", - "gos", - "got_Goth", - "grc_Grek", - "grn", - "gsw", - "guj", - "hat", - "hau_Latn", - "haw", - "heb", - "hif_Latn", - "hil", - "hin", - "hnj_Latn", - "hoc", - "hoc_Latn", - "hrv", - "hsb", - "hun", - "hye", - "iba", - "ibo", - "ido", - "ido_Latn", - "ike_Latn", - "ile_Latn", - "ilo", - "ina_Latn", - "ind", - "isl", - "ita", - "izh", - "jav", - "jav_Java", - "jbo", - "jbo_Cyrl", - "jbo_Latn", - "jdt_Cyrl", - "jpn", - "kab", - "kal", - "kan", - "kat", - "kaz_Cyrl", - "kaz_Latn", - "kek_Latn", - "kha", - "khm", - "khm_Latn", - "kin", - "kir_Cyrl", - "kjh", - "kpv", - "krl", - "ksh", - "kum", - "kur_Arab", - "kur_Latn", - "lad", - "lad_Latn", - "lao", - "lat_Latn", - "lav", - "ldn_Latn", - "lfn_Cyrl", - "lfn_Latn", - "lij", - "lin", - "lit", - "liv_Latn", - "lkt", - "lld_Latn", - "lmo", - "ltg", - "ltz", - "lug", - "lzh", - "lzh_Hans", - "mad", - "mah", - "mai", - "mal", - "mar", - "max_Latn", - "mdf", - "mfe", - "mhr", - "mic", - "min", - "mkd", - "mlg", - "mlt", - "mnw", - "moh", - "mon", - "mri", - "mwl", - "mww", - "mya", - "myv", - "nan", - "nau", - "nav", - "nds", - "niu", - "nld", - "nno", - "nob", - "nob_Hebr", - "nog", - "non_Latn", - "nov_Latn", - "npi", - "nya", - "oci", - "ori", - "orv_Cyrl", - "oss", - "ota_Arab", - "ota_Latn", - "pag", - "pan_Guru", - "pap", - "pau", - "pdc", - "pes", - "pes_Latn", - "pes_Thaa", - 
"pms", - "pnb", - "pol", - "por", - "ppl_Latn", - "prg_Latn", - "pus", - "quc", - "qya", - "qya_Latn", - "rap", - "rif_Latn", - "roh", - "rom", - "ron", - "rue", - "run", - "rus", - "sag", - "sah", - "san_Deva", - "scn", - "sco", - "sgs", - "shs_Latn", - "shy_Latn", - "sin", - "sjn_Latn", - "slv", - "sma", - "sme", - "smo", - "sna", - "snd_Arab", - "som", - "spa", - "sqi", - "srp_Cyrl", - "srp_Latn", - "stq", - "sun", - "swe", - "swg", - "swh", - "tah", - "tam", - "tat", - "tat_Arab", - "tat_Latn", - "tel", - "tet", - "tgk_Cyrl", - "tha", - "tir", - "tlh_Latn", - "tly_Latn", - "tmw_Latn", - "toi_Latn", - "ton", - "tpw_Latn", - "tso", - "tuk", - "tuk_Latn", - "tur", - "tvl", - "tyv", - "tzl", - "tzl_Latn", - "udm", - "uig_Arab", - "uig_Cyrl", - "ukr", - "umb", - "urd", - "uzb_Cyrl", - "uzb_Latn", - "vec", - "vie", - "vie_Hani", - "vol_Latn", - "vro", - "war", - "wln", - "wol", - "wuu", - "xal", - "xho", - "yid", - "yor", - "yue", - "yue_Hans", - "yue_Hant", - "zho", - "zho_Hans", - "zho_Hant", - "zlm_Latn", - "zsm_Latn", - "zul", - "zza", - }, - ), - "nic": ( - "Niger-Kordofanian languages", - { - "bam_Latn", - "ewe", - "fuc", - "fuv", - "ibo", - "kin", - "lin", - "lug", - "nya", - "run", - "sag", - "sna", - "swh", - "toi_Latn", - "tso", - "umb", - "wol", - "xho", - "yor", - "zul", - }, - ), - "nld": ("Dutch", {"nld"}), - "nor": ("Norwegian", {"nob", "nno"}), - "phi": ("Philippine languages", {"ilo", "akl_Latn", "war", "hil", "pag", "ceb"}), - "pol": ("Polish", {"pol"}), - "por": ("Portuguese", {"por"}), - "pqe": ( - "Eastern Malayo-Polynesian languages", - {"fij", "gil", "haw", "mah", "mri", "nau", "niu", "rap", "smo", "tah", "ton", "tvl"}, - ), - "roa": ( - "Romance languages", - { - "arg", - "ast", - "cat", - "cos", - "egl", - "ext", - "fra", - "frm_Latn", - "gcf_Latn", - "glg", - "hat", - "ind", - "ita", - "lad", - "lad_Latn", - "lij", - "lld_Latn", - "lmo", - "max_Latn", - "mfe", - "min", - "mwl", - "oci", - "pap", - "pms", - "por", - "roh", - "ron", - "scn", - "spa", - "tmw_Latn", - "vec", - "wln", - "zlm_Latn", - "zsm_Latn", - }, - ), - "ron": ("Romanian", {"ron"}), - "run": ("Rundi", {"run"}), - "rus": ("Russian", {"rus"}), - "sal": ("Salishan languages", {"shs_Latn"}), - "sem": ("Semitic languages", {"acm", "afb", "amh", "apc", "ara", "arq", "ary", "arz", "heb", "mlt", "tir"}), - "sla": ( - "Slavic languages", - { - "bel", - "bel_Latn", - "bos_Latn", - "bul", - "bul_Latn", - "ces", - "csb_Latn", - "dsb", - "hrv", - "hsb", - "mkd", - "orv_Cyrl", - "pol", - "rue", - "rus", - "slv", - "srp_Cyrl", - "srp_Latn", - "ukr", - }, - ), - "slv": ("Slovenian", {"slv"}), - "spa": ("Spanish", {"spa"}), - "swe": ("Swedish", {"swe"}), - "taw": ("Tai", {"lao", "tha"}), - "tgl": ("Tagalog", {"tgl_Latn"}), - "tha": ("Thai", {"tha"}), - "trk": ( - "Turkic languages", - { - "aze_Latn", - "bak", - "chv", - "crh", - "crh_Latn", - "kaz_Cyrl", - "kaz_Latn", - "kir_Cyrl", - "kjh", - "kum", - "ota_Arab", - "ota_Latn", - "sah", - "tat", - "tat_Arab", - "tat_Latn", - "tuk", - "tuk_Latn", - "tur", - "tyv", - "uig_Arab", - "uig_Cyrl", - "uzb_Cyrl", - "uzb_Latn", - }, - ), - "tur": ("Turkish", {"tur"}), - "ukr": ("Ukrainian", {"ukr"}), - "urd": ("Urdu", {"urd"}), - "urj": ( - "Uralic languages", - { - "est", - "fin", - "fkv_Latn", - "hun", - "izh", - "kpv", - "krl", - "liv_Latn", - "mdf", - "mhr", - "myv", - "sma", - "sme", - "udm", - "vep", - "vro", - }, - ), - "vie": ("Vietnamese", {"vie", "vie_Hani"}), - "war": ("Waray (Philippines)", {"war"}), - "zho": ( - "Chinese", - { - "cjy_Hans", - "cjy_Hant", - "cmn", - 
"cmn_Bopo", - "cmn_Hang", - "cmn_Hani", - "cmn_Hans", - "cmn_Hant", - "cmn_Hira", - "cmn_Kana", - "cmn_Latn", - "cmn_Yiii", - "gan", - "hak_Hani", - "lzh", - "lzh_Bopo", - "lzh_Hang", - "lzh_Hani", - "lzh_Hans", - "lzh_Hira", - "lzh_Kana", - "lzh_Yiii", - "nan", - "nan_Hani", - "wuu", - "wuu_Bopo", - "wuu_Hani", - "wuu_Latn", - "yue", - "yue_Bopo", - "yue_Hang", - "yue_Hani", - "yue_Hans", - "yue_Hant", - "yue_Hira", - "yue_Kana", - "zho", - "zho_Hans", - "zho_Hant", - }, - ), - "zle": ("East Slavic languages", {"bel", "orv_Cyrl", "bel_Latn", "rus", "ukr", "rue"}), - "zls": ("South Slavic languages", {"bos_Latn", "bul", "bul_Latn", "hrv", "mkd", "slv", "srp_Cyrl", "srp_Latn"}), - "zlw": ("West Slavic languages", {"csb_Latn", "dsb", "hsb", "pol", "ces"}), -} - - -def l2front_matter(langs): - return "".join(f"- {l}\n" for l in langs) - - -def dedup(lst): - """Preservers order""" - new_lst = [] - for item in lst: - if not item: - continue - elif item in new_lst: - continue - else: - new_lst.append(item) - return new_lst - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-m", "--models", action="append", help=" Set flag", required=True, nargs="+", dest="models" - ) - parser.add_argument("-save_dir", "--save_dir", default="marian_converted", help="where to save converted models") - args = parser.parse_args() - resolver = TatoebaConverter(save_dir=args.save_dir) - resolver.convert_models(args.models[0]) diff --git a/src/transformers/models/old_marian/convert_marian_to_pytorch.py b/src/transformers/models/old_marian/convert_marian_to_pytorch.py deleted file mode 100644 index a7faef942e97e3..00000000000000 --- a/src/transformers/models/old_marian/convert_marian_to_pytorch.py +++ /dev/null @@ -1,632 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import os -import socket -import time -import warnings -from pathlib import Path -from typing import Dict, List, Union -from zipfile import ZipFile - -import numpy as np -import torch -from tqdm import tqdm - -from transformers import MarianConfig, MarianMTModel, MarianTokenizer -from transformers.hf_api import HfApi - - -def remove_suffix(text: str, suffix: str): - if text.endswith(suffix): - return text[: -len(suffix)] - return text # or whatever - - -def remove_prefix(text: str, prefix: str): - if text.startswith(prefix): - return text[len(prefix) :] - return text # or whatever - - -def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict): - sd = {} - for k in opus_dict: - if not k.startswith(layer_prefix): - continue - stripped = remove_prefix(k, layer_prefix) - v = opus_dict[k].T # besides embeddings, everything must be transposed. 
- sd[converter[stripped]] = torch.tensor(v).squeeze() - return sd - - -def load_layers_(layer_lst: torch.nn.ModuleList, opus_state: dict, converter, is_decoder=False): - for i, layer in enumerate(layer_lst): - layer_tag = f"decoder_l{i + 1}_" if is_decoder else f"encoder_l{i + 1}_" - sd = convert_encoder_layer(opus_state, layer_tag, converter) - layer.load_state_dict(sd, strict=True) - - -def find_pretrained_model(src_lang: str, tgt_lang: str) -> List[str]: - """Find models that can accept src_lang as input and return tgt_lang as output.""" - prefix = "Helsinki-NLP/opus-mt-" - api = HfApi() - model_list = api.model_list() - model_ids = [x.modelId for x in model_list if x.modelId.startswith("Helsinki-NLP")] - src_and_targ = [ - remove_prefix(m, prefix).lower().split("-") for m in model_ids if "+" not in m - ] # + cant be loaded. - matching = [f"{prefix}{a}-{b}" for (a, b) in src_and_targ if src_lang in a and tgt_lang in b] - return matching - - -def add_emb_entries(wemb, final_bias, n_special_tokens=1): - vsize, d_model = wemb.shape - embs_to_add = np.zeros((n_special_tokens, d_model)) - new_embs = np.concatenate([wemb, embs_to_add]) - bias_to_add = np.zeros((n_special_tokens, 1)) - new_bias = np.concatenate((final_bias, bias_to_add), axis=1) - return new_embs, new_bias - - -def _cast_yaml_str(v): - bool_dct = {"true": True, "false": False} - if not isinstance(v, str): - return v - elif v in bool_dct: - return bool_dct[v] - try: - return int(v) - except (TypeError, ValueError): - return v - - -def cast_marian_config(raw_cfg: Dict[str, str]) -> Dict: - return {k: _cast_yaml_str(v) for k, v in raw_cfg.items()} - - -CONFIG_KEY = "special:model.yml" - - -def load_config_from_state_dict(opus_dict): - import yaml - - cfg_str = "".join([chr(x) for x in opus_dict[CONFIG_KEY]]) - yaml_cfg = yaml.load(cfg_str[:-1], Loader=yaml.BaseLoader) - return cast_marian_config(yaml_cfg) - - -def find_model_file(dest_dir): # this one better - model_files = list(Path(dest_dir).glob("*.npz")) - assert len(model_files) == 1, model_files - model_file = model_files[0] - return model_file - - -# Group Names Logic: change long opus model names to something shorter, like opus-mt-en-ROMANCE -ROM_GROUP = ( - "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO+es_EC+es_ES+es_GT" - "+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR+pt_PT+gl+lad+an+mwl+it+it_IT+co" - "+nap+scn+vec+sc+ro+la" -) -GROUPS = [ - ("cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", "ZH"), - (ROM_GROUP, "ROMANCE"), - ("de+nl+fy+af+da+fo+is+no+nb+nn+sv", "NORTH_EU"), - ("da+fo+is+no+nb+nn+sv", "SCANDINAVIA"), - ("se+sma+smj+smn+sms", "SAMI"), - ("nb_NO+nb+nn_NO+nn+nog+no_nb+no", "NORWAY"), - ("ga+cy+br+gd+kw+gv", "CELTIC"), # https://en.wikipedia.org/wiki/Insular_Celtic_languages -] -GROUP_TO_OPUS_NAME = { - "opus-mt-ZH-de": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-de", - "opus-mt-ZH-fi": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", - "opus-mt-ZH-sv": "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-sv", - "opus-mt-SCANDINAVIA-SCANDINAVIA": "da+fo+is+no+nb+nn+sv-da+fo+is+no+nb+nn+sv", - "opus-mt-NORTH_EU-NORTH_EU": "de+nl+fy+af+da+fo+is+no+nb+nn+sv-de+nl+fy+af+da+fo+is+no+nb+nn+sv", - "opus-mt-de-ZH": "de-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", - "opus-mt-en_el_es_fi-en_el_es_fi": "en+el+es+fi-en+el+es+fi", - "opus-mt-en-ROMANCE": 
"en-fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" - "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" - "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la", - "opus-mt-en-CELTIC": "en-ga+cy+br+gd+kw+gv", - "opus-mt-es-NORWAY": "es-nb_NO+nb+nn_NO+nn+nog+no_nb+no", - "opus-mt-fi_nb_no_nn_ru_sv_en-SAMI": "fi+nb+no+nn+ru+sv+en-se+sma+smj+smn+sms", - "opus-mt-fi-ZH": "fi-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", - "opus-mt-fi-NORWAY": "fi-nb_NO+nb+nn_NO+nn+nog+no_nb+no", - "opus-mt-ROMANCE-en": "fr+fr_BE+fr_CA+fr_FR+wa+frp+oc+ca+rm+lld+fur+lij+lmo+es+es_AR+es_CL+es_CO+es_CR+es_DO" - "+es_EC+es_ES+es_GT+es_HN+es_MX+es_NI+es_PA+es_PE+es_PR+es_SV+es_UY+es_VE+pt+pt_br+pt_BR" - "+pt_PT+gl+lad+an+mwl+it+it_IT+co+nap+scn+vec+sc+ro+la-en", - "opus-mt-CELTIC-en": "ga+cy+br+gd+kw+gv-en", - "opus-mt-sv-ZH": "sv-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh", - "opus-mt-sv-NORWAY": "sv-nb_NO+nb+nn_NO+nn+nog+no_nb+no", -} -OPUS_GITHUB_URL = "https://github.com/Helsinki-NLP/OPUS-MT-train/blob/master/models/" -ORG_NAME = "Helsinki-NLP/" - - -def convert_opus_name_to_hf_name(x): - """For OPUS-MT-Train/ DEPRECATED""" - for substr, grp_name in GROUPS: - x = x.replace(substr, grp_name) - return x.replace("+", "_") - - -def convert_hf_name_to_opus_name(hf_model_name): - """ - Relies on the assumption that there are no language codes like pt_br in models that are not in GROUP_TO_OPUS_NAME. - """ - hf_model_name = remove_prefix(hf_model_name, ORG_NAME) - if hf_model_name in GROUP_TO_OPUS_NAME: - opus_w_prefix = GROUP_TO_OPUS_NAME[hf_model_name] - else: - opus_w_prefix = hf_model_name.replace("_", "+") - return remove_prefix(opus_w_prefix, "opus-mt-") - - -def get_system_metadata(repo_root): - import git - - return dict( - helsinki_git_sha=git.Repo(path=repo_root, search_parent_directories=True).head.object.hexsha, - transformers_git_sha=git.Repo(path=".", search_parent_directories=True).head.object.hexsha, - port_machine=socket.gethostname(), - port_time=time.strftime("%Y-%m-%d-%H:%M"), - ) - - -# docstyle-ignore -FRONT_MATTER_TEMPLATE = """--- -language: -{} -tags: -- translation - -license: apache-2.0 ---- -""" -DEFAULT_REPO = "Tatoeba-Challenge" -DEFAULT_MODEL_DIR = os.path.join(DEFAULT_REPO, "models") - - -def write_model_card( - hf_model_name: str, - repo_root=DEFAULT_REPO, - save_dir=Path("marian_converted"), - dry_run=False, - extra_metadata={}, -) -> str: - """ - Copy the most recent model's readme section from opus, and add metadata. 
upload command: aws s3 sync model_card_dir - s3://models.huggingface.co/bert/Helsinki-NLP/ --dryrun - """ - import pandas as pd - - hf_model_name = remove_prefix(hf_model_name, ORG_NAME) - opus_name: str = convert_hf_name_to_opus_name(hf_model_name) - assert repo_root in ("OPUS-MT-train", "Tatoeba-Challenge") - opus_readme_path = Path(repo_root).joinpath("models", opus_name, "README.md") - assert opus_readme_path.exists(), f"Readme file {opus_readme_path} not found" - - opus_src, opus_tgt = [x.split("+") for x in opus_name.split("-")] - - readme_url = f"https://github.com/Helsinki-NLP/{repo_root}/tree/master/models/{opus_name}/README.md" - - s, t = ",".join(opus_src), ",".join(opus_tgt) - metadata = { - "hf_name": hf_model_name, - "source_languages": s, - "target_languages": t, - "opus_readme_url": readme_url, - "original_repo": repo_root, - "tags": ["translation"], - } - metadata.update(extra_metadata) - metadata.update(get_system_metadata(repo_root)) - - # combine with opus markdown - - extra_markdown = ( - f"### {hf_model_name}\n\n* source group: {metadata['src_name']} \n* target group: " - f"{metadata['tgt_name']} \n* OPUS readme: [{opus_name}]({readme_url})\n" - ) - - content = opus_readme_path.open().read() - content = content.split("\n# ")[-1] # Get the lowest level 1 header in the README -- the most recent model. - splat = content.split("*")[2:] - print(splat[3]) - content = "*".join(splat) - content = ( - FRONT_MATTER_TEMPLATE.format(metadata["src_alpha2"]) - + extra_markdown - + "\n* " - + content.replace("download", "download original weights") - ) - - items = "\n\n".join([f"- {k}: {v}" for k, v in metadata.items()]) - sec3 = "\n### System Info: \n" + items - content += sec3 - if dry_run: - return content, metadata - sub_dir = save_dir / f"opus-mt-{hf_model_name}" - sub_dir.mkdir(exist_ok=True) - dest = sub_dir / "README.md" - dest.open("w").write(content) - pd.Series(metadata).to_json(sub_dir / "metadata.json") - - # if dry_run: - return content, metadata - - -def make_registry(repo_path="Opus-MT-train/models"): - if not (Path(repo_path) / "fr-en" / "README.md").exists(): - raise ValueError( - f"repo_path:{repo_path} does not exist: " - "You must run: git clone git@github.com:Helsinki-NLP/Opus-MT-train.git before calling." - ) - results = {} - for p in Path(repo_path).iterdir(): - n_dash = p.name.count("-") - if n_dash == 0: - continue - else: - lns = list(open(p / "README.md").readlines()) - results[p.name] = _parse_readme(lns) - return [(k, v["pre-processing"], v["download"], v["download"][:-4] + ".test.txt") for k, v in results.items()] - - -def convert_all_sentencepiece_models(model_list=None, repo_path=None, dest_dir=Path("marian_converted")): - """Requires 300GB""" - save_dir = Path("marian_ckpt") - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - save_paths = [] - if model_list is None: - model_list: list = make_registry(repo_path=repo_path) - for k, prepro, download, test_set_url in tqdm(model_list): - if "SentencePiece" not in prepro: # dont convert BPE models. 
- continue - if not os.path.exists(save_dir / k): - download_and_unzip(download, save_dir / k) - pair_name = convert_opus_name_to_hf_name(k) - convert(save_dir / k, dest_dir / f"opus-mt-{pair_name}") - - save_paths.append(dest_dir / f"opus-mt-{pair_name}") - return save_paths - - -def lmap(f, x) -> List: - return list(map(f, x)) - - -def fetch_test_set(test_set_url): - import wget - - fname = wget.download(test_set_url, "opus_test.txt") - lns = Path(fname).open().readlines() - src = lmap(str.strip, lns[::4]) - gold = lmap(str.strip, lns[1::4]) - mar_model = lmap(str.strip, lns[2::4]) - assert ( - len(gold) == len(mar_model) == len(src) - ), f"Gold, marian and source lengths {len(gold)}, {len(mar_model)}, {len(src)} mismatched" - os.remove(fname) - return src, mar_model, gold - - -def convert_whole_dir(path=Path("marian_ckpt/")): - for subdir in tqdm(list(path.ls())): - dest_dir = f"marian_converted/{subdir.name}" - if (dest_dir / "pytorch_model.bin").exists(): - continue - convert(source_dir, dest_dir) - - -def _parse_readme(lns): - """Get link and metadata from opus model card equivalent.""" - subres = {} - for ln in [x.strip() for x in lns]: - if not ln.startswith("*"): - continue - ln = ln[1:].strip() - - for k in ["download", "dataset", "models", "model", "pre-processing"]: - if ln.startswith(k): - break - else: - continue - if k in ["dataset", "model", "pre-processing"]: - splat = ln.split(":") - _, v = splat - subres[k] = v - elif k == "download": - v = ln.split("(")[-1][:-1] - subres[k] = v - return subres - - -def save_tokenizer_config(dest_dir: Path): - dname = dest_dir.name.split("-") - dct = dict(target_lang=dname[-1], source_lang="-".join(dname[:-1])) - save_json(dct, dest_dir / "tokenizer_config.json") - - -def add_to_vocab_(vocab: Dict[str, int], special_tokens: List[str]): - start = max(vocab.values()) + 1 - added = 0 - for tok in special_tokens: - if tok in vocab: - continue - vocab[tok] = start + added - added += 1 - return added - - -def find_vocab_file(model_dir): - return list(model_dir.glob("*vocab.yml"))[0] - - -def add_special_tokens_to_vocab(model_dir: Path) -> None: - vocab = load_yaml(find_vocab_file(model_dir)) - vocab = {k: int(v) for k, v in vocab.items()} - num_added = add_to_vocab_(vocab, [""]) - print(f"added {num_added} tokens to vocab") - save_json(vocab, model_dir / "vocab.json") - save_tokenizer_config(model_dir) - - -def check_equal(marian_cfg, k1, k2): - v1, v2 = marian_cfg[k1], marian_cfg[k2] - assert v1 == v2, f"hparams {k1},{k2} differ: {v1} != {v2}" - - -def check_marian_cfg_assumptions(marian_cfg): - assumed_settings = { - "tied-embeddings-all": True, - "layer-normalization": False, - "right-left": False, - "transformer-ffn-depth": 2, - "transformer-aan-depth": 2, - "transformer-no-projection": False, - "transformer-postprocess-emb": "d", - "transformer-postprocess": "dan", # Dropout, add, normalize - "transformer-preprocess": "", - "type": "transformer", - "ulr-dim-emb": 0, - "dec-cell-base-depth": 2, - "dec-cell-high-depth": 1, - "transformer-aan-nogate": False, - } - for k, v in assumed_settings.items(): - actual = marian_cfg[k] - assert actual == v, f"Unexpected config value for {k} expected {v} got {actual}" - check_equal(marian_cfg, "transformer-ffn-activation", "transformer-aan-activation") - check_equal(marian_cfg, "transformer-ffn-depth", "transformer-aan-depth") - check_equal(marian_cfg, "transformer-dim-ffn", "transformer-dim-aan") - - -BIAS_KEY = "decoder_ff_logit_out_b" -BART_CONVERTER = { # for each encoder and decoder layer - 
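    # Naming convention for the keys below (readable from the entries themselves):
    #   self_*     -> self-attention projections (q/k/v/out), used for encoder and decoder layers
    #   context_*  -> the decoder's cross-attention over the encoder output (encoder_attn.*)
    #   ffn_W1/b1, ffn_W2/b2 -> the two feed-forward projections (fc1/fc2)
    #   *_ln_scale / *_ln_bias -> LayerNorm weight / bias
    # convert_encoder_layer above strips the "encoder_lN_"/"decoder_lN_" prefix from each state
    # dict key and uses this table to rename what is left.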
"self_Wq": "self_attn.q_proj.weight", - "self_Wk": "self_attn.k_proj.weight", - "self_Wv": "self_attn.v_proj.weight", - "self_Wo": "self_attn.out_proj.weight", - "self_bq": "self_attn.q_proj.bias", - "self_bk": "self_attn.k_proj.bias", - "self_bv": "self_attn.v_proj.bias", - "self_bo": "self_attn.out_proj.bias", - "self_Wo_ln_scale": "self_attn_layer_norm.weight", - "self_Wo_ln_bias": "self_attn_layer_norm.bias", - "ffn_W1": "fc1.weight", - "ffn_b1": "fc1.bias", - "ffn_W2": "fc2.weight", - "ffn_b2": "fc2.bias", - "ffn_ffn_ln_scale": "final_layer_norm.weight", - "ffn_ffn_ln_bias": "final_layer_norm.bias", - # Decoder Cross Attention - "context_Wk": "encoder_attn.k_proj.weight", - "context_Wo": "encoder_attn.out_proj.weight", - "context_Wq": "encoder_attn.q_proj.weight", - "context_Wv": "encoder_attn.v_proj.weight", - "context_bk": "encoder_attn.k_proj.bias", - "context_bo": "encoder_attn.out_proj.bias", - "context_bq": "encoder_attn.q_proj.bias", - "context_bv": "encoder_attn.v_proj.bias", - "context_Wo_ln_scale": "encoder_attn_layer_norm.weight", - "context_Wo_ln_bias": "encoder_attn_layer_norm.bias", -} - - -class OpusState: - def __init__(self, source_dir): - npz_path = find_model_file(source_dir) - self.state_dict = np.load(npz_path) - cfg = load_config_from_state_dict(self.state_dict) - assert cfg["dim-vocabs"][0] == cfg["dim-vocabs"][1] - assert "Wpos" not in self.state_dict, "Wpos key in state dictionary" - self.state_dict = dict(self.state_dict) - self.wemb, self.final_bias = add_emb_entries(self.state_dict["Wemb"], self.state_dict[BIAS_KEY], 1) - self.pad_token_id = self.wemb.shape[0] - 1 - cfg["vocab_size"] = self.pad_token_id + 1 - # self.state_dict['Wemb'].sha - self.state_keys = list(self.state_dict.keys()) - assert "Wtype" not in self.state_dict, "Wtype key in state dictionary" - self._check_layer_entries() - self.source_dir = source_dir - self.cfg = cfg - hidden_size, intermediate_shape = self.state_dict["encoder_l1_ffn_W1"].shape - assert ( - hidden_size == cfg["dim-emb"] == 512 - ), f"Hidden size {hidden_size} and configured size {cfg['dim_emb']} mismatched or not 512" - - # Process decoder.yml - decoder_yml = cast_marian_config(load_yaml(source_dir / "decoder.yml")) - check_marian_cfg_assumptions(cfg) - self.hf_config = MarianConfig( - vocab_size=cfg["vocab_size"], - decoder_layers=cfg["dec-depth"], - encoder_layers=cfg["enc-depth"], - decoder_attention_heads=cfg["transformer-heads"], - encoder_attention_heads=cfg["transformer-heads"], - decoder_ffn_dim=cfg["transformer-dim-ffn"], - encoder_ffn_dim=cfg["transformer-dim-ffn"], - d_model=cfg["dim-emb"], - activation_function=cfg["transformer-aan-activation"], - pad_token_id=self.pad_token_id, - eos_token_id=0, - bos_token_id=0, - max_position_embeddings=cfg["dim-emb"], - scale_embedding=True, - normalize_embedding="n" in cfg["transformer-preprocess"], - static_position_embeddings=not cfg["transformer-train-position-embeddings"], - dropout=0.1, # see opus-mt-train repo/transformer-dropout param. 
- # default: add_final_layer_norm=False, - num_beams=decoder_yml["beam-size"], - decoder_start_token_id=self.pad_token_id, - bad_words_ids=[[self.pad_token_id]], - max_length=512, - ) - - def _check_layer_entries(self): - self.encoder_l1 = self.sub_keys("encoder_l1") - self.decoder_l1 = self.sub_keys("decoder_l1") - self.decoder_l2 = self.sub_keys("decoder_l2") - if len(self.encoder_l1) != 16: - warnings.warn(f"Expected 16 keys for each encoder layer, got {len(self.encoder_l1)}") - if len(self.decoder_l1) != 26: - warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") - if len(self.decoder_l2) != 26: - warnings.warn(f"Expected 26 keys for each decoder layer, got {len(self.decoder_l1)}") - - @property - def extra_keys(self): - extra = [] - for k in self.state_keys: - if ( - k.startswith("encoder_l") - or k.startswith("decoder_l") - or k in [CONFIG_KEY, "Wemb", "Wpos", "decoder_ff_logit_out_b"] - ): - continue - else: - extra.append(k) - return extra - - def sub_keys(self, layer_prefix): - return [remove_prefix(k, layer_prefix) for k in self.state_dict if k.startswith(layer_prefix)] - - def load_marian_model(self) -> MarianMTModel: - state_dict, cfg = self.state_dict, self.hf_config - - assert cfg.static_position_embeddings, "config.static_position_embeddings should be True" - model = MarianMTModel(cfg) - - assert "hidden_size" not in cfg.to_dict() - load_layers_( - model.model.encoder.layers, - state_dict, - BART_CONVERTER, - ) - load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True) - - # handle tensors not associated with layers - wemb_tensor = torch.nn.Parameter(torch.FloatTensor(self.wemb)) - bias_tensor = torch.nn.Parameter(torch.FloatTensor(self.final_bias)) - model.model.shared.weight = wemb_tensor - model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared - - model.final_logits_bias = bias_tensor - - if "Wpos" in state_dict: - print("Unexpected: got Wpos") - wpos_tensor = torch.tensor(state_dict["Wpos"]) - model.model.encoder.embed_positions.weight = wpos_tensor - model.model.decoder.embed_positions.weight = wpos_tensor - - if cfg.normalize_embedding: - assert "encoder_emb_ln_scale_pre" in state_dict - raise NotImplementedError("Need to convert layernorm_embedding") - - assert not self.extra_keys, f"Failed to convert {self.extra_keys}" - assert ( - model.model.shared.padding_idx == self.pad_token_id - ), f"Padding tokens {model.model.shared.padding_idx} and {self.pad_token_id} mismatched" - return model - - -def download_and_unzip(url, dest_dir): - try: - import wget - except ImportError: - raise ImportError("you must pip install wget") - - filename = wget.download(url) - unzip(filename, dest_dir) - os.remove(filename) - - -def convert(source_dir: Path, dest_dir): - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - - add_special_tokens_to_vocab(source_dir) - tokenizer = MarianTokenizer.from_pretrained(str(source_dir)) - tokenizer.save_pretrained(dest_dir) - - opus_state = OpusState(source_dir) - assert opus_state.cfg["vocab_size"] == len( - tokenizer.encoder - ), f"Original vocab size {opus_state.cfg['vocab_size']} and new vocab size {len(tokenizer.encoder)} mismatched" - # save_json(opus_state.cfg, dest_dir / "marian_original_config.json") - # ^^ Uncomment to save human readable marian config for debugging - - model = opus_state.load_marian_model() - model = model.half() - model.save_pretrained(dest_dir) - model.from_pretrained(dest_dir) # sanity check - - -def load_yaml(path): - 
import yaml - - with open(path) as f: - return yaml.load(f, Loader=yaml.BaseLoader) - - -def save_json(content: Union[Dict, List], path: str) -> None: - with open(path, "w") as f: - json.dump(content, f) - - -def unzip(zip_path: str, dest_dir: str) -> None: - with ZipFile(zip_path, "r") as zipObj: - zipObj.extractall(dest_dir) - - -if __name__ == "__main__": - """ - Tatoeba conversion instructions in scripts/tatoeba/README.md - """ - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--src", type=str, help="path to marian model sub dir", default="en-de") - parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model.") - args = parser.parse_args() - - source_dir = Path(args.src) - assert source_dir.exists(), f"Source directory {source_dir} not found" - dest_dir = f"converted-{source_dir.name}" if args.dest is None else args.dest - convert(source_dir, dest_dir) diff --git a/src/transformers/models/old_marian/modeling_marian.py b/src/transformers/models/old_marian/modeling_marian.py deleted file mode 100644 index 25d3dc1ea969c5..00000000000000 --- a/src/transformers/models/old_marian/modeling_marian.py +++ /dev/null @@ -1,63 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Marian Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch MarianMTModel model, ported from the Marian C++ repo.""" - - -from ..bart.modeling_bart import BartForConditionalGeneration -from .configuration_marian import MarianConfig - - -# See all Marian models at https://huggingface.co/models?search=Helsinki-NLP - - -class MarianMTModel(BartForConditionalGeneration): - r""" - Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Available - models are listed `here `__. - - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. - - Examples:: - - >>> from transformers import MarianTokenizer, MarianMTModel - >>> from typing import List - >>> src = 'fr' # source language - >>> trg = 'en' # target language - >>> sample_text = "où est l'arrêt de bus ?" - >>> mname = f'Helsinki-NLP/opus-mt-{src}-{trg}' - - >>> model = MarianMTModel.from_pretrained(mname) - >>> tok = MarianTokenizer.from_pretrained(mname) - >>> batch = tok.prepare_seq2seq_batch(src_texts=[sample_text], return_tensors="pt") # don't need tgt_text for inference - >>> gen = model.generate(**batch) # for forward pass: model(**batch) - >>> words: List[str] = tok.batch_decode(gen, skip_special_tokens=True) # returns "Where is the bus stop ?" 
- - """ - config_class = MarianConfig - _keys_to_ignore_on_load_missing = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - ] - _keys_to_ignore_on_save = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - ] - - def adjust_logits_during_generation(self, logits, cur_len, max_length): - logits[:, self.config.pad_token_id] = float("-inf") # never predict pad token. - if cur_len == max_length - 1 and self.config.eos_token_id is not None: - self._force_token_id_to_be_generated(logits, self.config.eos_token_id) - return logits diff --git a/src/transformers/models/old_marian/modeling_tf_marian.py b/src/transformers/models/old_marian/modeling_tf_marian.py deleted file mode 100644 index f17182306eee18..00000000000000 --- a/src/transformers/models/old_marian/modeling_tf_marian.py +++ /dev/null @@ -1,52 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""TF Marian model, ported from the fairseq repo.""" - -from ...file_utils import add_start_docstrings, is_tf_available -from ...utils import logging -from ..bart.modeling_tf_bart import BART_START_DOCSTRING, LARGE_NEGATIVE, TFBartForConditionalGeneration -from .configuration_marian import MarianConfig - - -if is_tf_available(): - import tensorflow as tf - - -_CONFIG_FOR_DOC = "MarianConfig" - -START_DOCSTRING = BART_START_DOCSTRING.replace( - "inherits from :class:`~transformers.TFPreTrainedModel`", - "inherits from :class:`~transformers.TFBartForConditionalGeneration`", -).replace("BartConfig", _CONFIG_FOR_DOC) - - -logger = logging.get_logger(__name__) - - -@add_start_docstrings("Marian model for machine translation", START_DOCSTRING) -class TFMarianMTModel(TFBartForConditionalGeneration): - _keys_to_ignore_on_load_missing = [ - r"model.encoder.embed_positions.weight", - r"model.decoder.embed_positions.weight", - ] - config_class = MarianConfig - - def adjust_logits_during_generation(self, logits, cur_len, max_length): - """Never predict pad_token_id. Predict when max_length is reached.""" - vocab_range = tf.constant(range(self.config.vocab_size)) - logits = tf.where(vocab_range == self.config.pad_token_id, LARGE_NEGATIVE, logits) - if cur_len == max_length - 1: - logits = tf.where(vocab_range != self.config.eos_token_id, LARGE_NEGATIVE, logits) - return logits diff --git a/src/transformers/models/old_marian/tokenization_marian.py b/src/transformers/models/old_marian/tokenization_marian.py deleted file mode 100644 index 3b4ede693f0baa..00000000000000 --- a/src/transformers/models/old_marian/tokenization_marian.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import re -import warnings -from pathlib import Path -from shutil import copyfile -from typing import Dict, List, Optional, Tuple, Union - -import sentencepiece - -from ...file_utils import add_start_docstrings -from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer -from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING - - -vocab_files_names = { - "source_spm": "source.spm", - "target_spm": "target.spm", - "vocab": "vocab.json", - "tokenizer_config_file": "tokenizer_config.json", -} - -PRETRAINED_VOCAB_FILES_MAP = { - "source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"}, - "target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"}, - "vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"}, - "tokenizer_config_file": { - "Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json" - }, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"Helsinki-NLP/opus-mt-en-de": 512} -PRETRAINED_INIT_CONFIGURATION = {} - -# Example URL https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.json - - -class MarianTokenizer(PreTrainedTokenizer): - r""" - Construct a Marian tokenizer. Based on `SentencePiece `__. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. - - Args: - source_spm (:obj:`str`): - `SentencePiece `__ file (generally has a .spm extension) that - contains the vocabulary for the source language. - target_spm (:obj:`str`): - `SentencePiece `__ file (generally has a .spm extension) that - contains the vocabulary for the target language. - source_lang (:obj:`str`, `optional`): - A string representing the source language. - target_lang (:obj:`str`, `optional`): - A string representing the target language. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - model_max_length (:obj:`int`, `optional`, defaults to 512): - The maximum sentence length the model accepts. - additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): - Additional special tokens used by the tokenizer. 
- - Examples:: - - >>> from transformers import MarianTokenizer - >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') - >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."] - >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional - >>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt") - >>> # keys [input_ids, attention_mask, labels]. - >>> # model(**batch) should work - """ - - vocab_files_names = vocab_files_names - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - language_code_re = re.compile(">>.+<<") # type: re.Pattern - - def __init__( - self, - vocab, - source_spm, - target_spm, - source_lang=None, - target_lang=None, - unk_token="", - eos_token="", - pad_token="", - model_max_length=512, - **kwargs - ): - super().__init__( - # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id - source_lang=source_lang, - target_lang=target_lang, - unk_token=unk_token, - eos_token=eos_token, - pad_token=pad_token, - model_max_length=model_max_length, - **kwargs, - ) - assert Path(source_spm).exists(), f"cannot find spm source {source_spm}" - self.encoder = load_json(vocab) - if self.unk_token not in self.encoder: - raise KeyError(" token must be in vocab") - assert self.pad_token in self.encoder - self.decoder = {v: k for k, v in self.encoder.items()} - - self.source_lang = source_lang - self.target_lang = target_lang - self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")] - self.spm_files = [source_spm, target_spm] - - # load SentencePiece model for pre-processing - self.spm_source = load_spm(source_spm) - self.spm_target = load_spm(target_spm) - self.current_spm = self.spm_source - - # Multilingual target side: default to using first supported language code. - - self._setup_normalizer() - - def _setup_normalizer(self): - try: - from sacremoses import MosesPunctNormalizer - - self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize - except (ImportError, FileNotFoundError): - warnings.warn("Recommended: pip install sacremoses.") - self.punc_normalizer = lambda x: x - - def normalize(self, x: str) -> str: - """Cover moses empty string edge case. 
They return empty list for '' input!""" - return self.punc_normalizer(x) if x else "" - - def _convert_token_to_id(self, token): - return self.encoder.get(token, self.encoder[self.unk_token]) - - def remove_language_code(self, text: str): - """Remove language codes like <> before sentencepiece""" - match = self.language_code_re.match(text) - code: list = [match.group(0)] if match else [] - return code, self.language_code_re.sub("", text) - - def _tokenize(self, text: str) -> List[str]: - code, text = self.remove_language_code(text) - pieces = self.current_spm.EncodeAsPieces(text) - return code + pieces - - def _convert_id_to_token(self, index: int) -> str: - """Converts an index (integer) in a token (str) using the encoder.""" - return self.decoder.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens: List[str]) -> str: - """Uses target language sentencepiece model""" - return self.spm_target.DecodePieces(tokens) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: - """Build model inputs from a sequence by appending eos_token_id.""" - if token_ids_1 is None: - return token_ids_0 + [self.eos_token_id] - # We don't expect to process pairs, but leave the pair logic for API consistency - return token_ids_0 + token_ids_1 + [self.eos_token_id] - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - return_tensors: Optional[str] = None, - truncation=True, - padding="longest", - **unused, - ) -> BatchEncoding: - if "" in src_texts: - raise ValueError(f"found empty string in src_texts: {src_texts}") - self.current_spm = self.spm_source - src_texts = [self.normalize(t) for t in src_texts] # this does not appear to do much - tokenizer_kwargs = dict( - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - truncation=truncation, - padding=padding, - ) - model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) - - if tgt_texts is None: - return model_inputs - if max_target_length is not None: - tokenizer_kwargs["max_length"] = max_target_length - - self.current_spm = self.spm_target - model_inputs["labels"] = self(tgt_texts, **tokenizer_kwargs)["input_ids"] - self.current_spm = self.spm_source - return model_inputs - - @property - def vocab_size(self) -> int: - return len(self.encoder) - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - save_dir = Path(save_directory) - assert save_dir.is_dir(), f"{save_directory} should be a directory" - save_json( - self.encoder, - save_dir / ((filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab"]), - ) - - for orig, f in zip(["source.spm", "target.spm"], self.spm_files): - dest_path = save_dir / ((filename_prefix + "-" if filename_prefix else "") + Path(f).name) - if not dest_path.exists(): - copyfile(f, save_dir / orig) - - return tuple( - save_dir / ((filename_prefix + "-" if filename_prefix else "") + f) for f in self.vocab_files_names - ) - - def get_vocab(self) -> Dict: - vocab = self.encoder.copy() - vocab.update(self.added_tokens_encoder) - return vocab - - def __getstate__(self) -> Dict: - state = self.__dict__.copy() - state.update({k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer"]}) - return state - - def __setstate__(self, d: Dict) -> None: - self.__dict__ = d - 
self.spm_source, self.spm_target = (load_spm(f) for f in self.spm_files) - self.current_spm = self.spm_source - self._setup_normalizer() - - def num_special_tokens_to_add(self, **unused): - """Just EOS""" - return 1 - - def _special_token_mask(self, seq): - all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp - all_special_ids.remove(self.unk_token_id) # is only sometimes special - return [1 if x in all_special_ids else 0 for x in seq] - - def get_special_tokens_mask( - self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" - if already_has_special_tokens: - return self._special_token_mask(token_ids_0) - elif token_ids_1 is None: - return self._special_token_mask(token_ids_0) + [1] - else: - return self._special_token_mask(token_ids_0 + token_ids_1) + [1] - - -def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: - spm = sentencepiece.SentencePieceProcessor() - spm.Load(path) - return spm - - -def save_json(data, path: str) -> None: - with open(path, "w") as f: - json.dump(data, f, indent=2) - - -def load_json(path: str) -> Union[Dict, List]: - with open(path, "r") as f: - return json.load(f) diff --git a/src/transformers/models/old_mbart/__init__.py b/src/transformers/models/old_mbart/__init__.py deleted file mode 100644 index 2fa8876085ed72..00000000000000 --- a/src/transformers/models/old_mbart/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available -from .configuration_mbart import MBartConfig - - -if is_sentencepiece_available(): - from .tokenization_mbart import MBartTokenizer - -if is_tokenizers_available(): - from .tokenization_mbart_fast import MBartTokenizerFast - -if is_torch_available(): - from .modeling_mbart import MBartForConditionalGeneration, MBartModel - -if is_tf_available(): - from .modeling_tf_mbart import TFMBartForConditionalGeneration diff --git a/src/transformers/models/old_mbart/configuration_mbart.py b/src/transformers/models/old_mbart/configuration_mbart.py deleted file mode 100644 index c8b4540e1efd53..00000000000000 --- a/src/transformers/models/old_mbart/configuration_mbart.py +++ /dev/null @@ -1,105 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
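The `__getstate__`/`__setstate__` pair on MarianTokenizer above drops the loaded SentencePiece processors (and the sacremoses normalizer) before serialization and rebuilds them from `self.spm_files` afterwards, which is what makes the tokenizer picklable. A rough round-trip sketch, using the checkpoint from the class docstring and assuming sentencepiece is installed::

    import pickle

    from transformers import MarianTokenizer

    tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    restored = pickle.loads(pickle.dumps(tok))  # SPM models are reloaded in __setstate__

    text = "I am a small frog."
    assert restored.tokenize(text) == tok.tokenize(text)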
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" MBART configuration """ - -from ...utils import logging -from ..bart.configuration_bart import BartConfig - - -logger = logging.get_logger(__name__) - -MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json", - "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/config.json", -} - - -class MBartConfig(BartConfig): - """ - This is the configuration class to store the configuration of a - :class:`~transformers.MBartForConditionalGeneration`. It is used to instantiate a BART model according to the - specified arguments, defining the model architecture. - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. - - Args: - vocab_size (:obj:`int`, `optional`, defaults to 250027): - Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.MBartForConditionalGeneration`. - d_model (:obj:`int`, `optional`, defaults to 1024): - Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 12): - Number of encoder layers. - decoder_layers (:obj:`int`, `optional`, defaults to 12): - Number of decoder layers. - encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): - Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): - Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. - encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. - activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. - dropout (:obj:`float`, `optional`, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for the attention probabilities. - activation_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for activations inside the fully connected layer. - classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for classifier. - max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). 
- init_std (:obj:`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm after embeddings. Only True for Bart. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`): - Why not add another layernorm? - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. - encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. - decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. - extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. Should be equal to :obj:`pad_token_id+1`. - is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model - force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). - """ - - model_type = "mbart" - keys_to_ignore_at_inference = ["past_key_values"] diff --git a/src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py b/src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py deleted file mode 100644 index 46c933d7a4edc2..00000000000000 --- a/src/transformers/models/old_mbart/convert_mbart_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
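Because `MBartConfig` only subclasses `BartConfig` and overrides `model_type`, every argument documented above is handled by the parent class. A minimal instantiation sketch using the documented defaults (imports as in the released library rather than this intermediate commit)::

    from transformers import MBartConfig

    config = MBartConfig(
        vocab_size=250027,
        d_model=1024,
        encoder_layers=12,
        decoder_layers=12,
        encoder_attention_heads=16,
        decoder_attention_heads=16,
        encoder_ffn_dim=4096,
        decoder_ffn_dim=4096,
        dropout=0.1,
    )
    print(config.model_type)  # "mbart"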
- -import argparse - -import torch - -from transformers import BartForConditionalGeneration, MBartConfig - -from ..bart.convert_bart_original_pytorch_checkpoint_to_pytorch import remove_ignore_keys_ - - -def convert_fairseq_mbart_checkpoint_from_disk(checkpoint_path, hf_config_path="facebook/mbart-large-en-ro"): - state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] - remove_ignore_keys_(state_dict) - vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0] - mbart_config = MBartConfig.from_pretrained(hf_config_path, vocab_size=vocab_size) - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - model = BartForConditionalGeneration(mbart_config) - model.model.load_state_dict(state_dict) - return model - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." - ) - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--hf_config", - default="facebook/mbart-large-cc25", - type=str, - help="Which huggingface architecture to use: bart-large-xsum", - ) - args = parser.parse_args() - model = convert_fairseq_mbart_checkpoint_from_disk(args.fairseq_path, hf_config_path=args.hf_config) - model.save_pretrained(args.pytorch_dump_folder_path) diff --git a/src/transformers/models/old_mbart/modeling_mbart.py b/src/transformers/models/old_mbart/modeling_mbart.py deleted file mode 100644 index f4aa39b075145c..00000000000000 --- a/src/transformers/models/old_mbart/modeling_mbart.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ..bart.modeling_bart import BartForConditionalGeneration, BartModel -from .configuration_mbart import MBartConfig - - -_CONFIG_FOR_DOC = "MBartConfig" -_TOKENIZER_FOR_DOC = "MBartTokenizer" - -MBART_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/mbart-large-cc25", - "facebook/mbart-large-en-ro", - # See all multilingual BART models at https://huggingface.co/models?filter=mbart -] - - -class MBartModel(BartModel): - r""" - This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = MBartConfig - _keys_to_ignore_on_load_missing = [ - "encoder.embed_positions.weight", - "decoder.embed_positions.weight", - ] - _keys_to_ignore_on_save = [ - "encoder.embed_positions.weight", - "decoder.embed_positions.weight", - ] - - -class MBartForConditionalGeneration(BartForConditionalGeneration): - r""" - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. 
- - Examples:: - >>> from transformers import MBartForConditionalGeneration, MBartTokenizer - >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") - >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro") - >>> article = "UN Chief Says There Is No Military Solution in Syria" - >>> batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], return_tensors="pt") - >>> translated_tokens = model.generate(**batch) - >>> translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] - >>> assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria" - """ - model_type = "mbart" - config_class = MBartConfig - _keys_to_ignore_on_load_missing = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - ] - _keys_to_ignore_on_save = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - ] diff --git a/src/transformers/models/old_mbart/modeling_tf_mbart.py b/src/transformers/models/old_mbart/modeling_tf_mbart.py deleted file mode 100644 index 23b30fd4b36683..00000000000000 --- a/src/transformers/models/old_mbart/modeling_tf_mbart.py +++ /dev/null @@ -1,36 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""TF mBART model, originally from fairseq.""" -from ...file_utils import add_start_docstrings -from ...utils import logging -from ..bart.modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration -from .configuration_mbart import MBartConfig - - -_CONFIG_FOR_DOC = "MBartConfig" - -START_DOCSTRING = BART_START_DOCSTRING.replace( - "inherits from :class:`~transformers.TFPreTrainedModel`", - "inherits from :class:`~transformers.TFBartForConditionalGeneration`", -).replace("BartConfig", _CONFIG_FOR_DOC) - - -logger = logging.get_logger(__name__) - - -@add_start_docstrings("mBART (multilingual BART) model for machine translation", START_DOCSTRING) -class TFMBartForConditionalGeneration(TFBartForConditionalGeneration): - config_class = MBartConfig - # All the code is in src/transformers/models/bart/modeling_tf_bart.py diff --git a/src/transformers/models/old_mbart/tokenization_mbart.py b/src/transformers/models/old_mbart/tokenization_mbart.py deleted file mode 100644 index e8425fe8c539e1..00000000000000 --- a/src/transformers/models/old_mbart/tokenization_mbart.py +++ /dev/null @@ -1,232 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
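The embed-positions entries in `_keys_to_ignore_on_load_missing` and `_keys_to_ignore_on_save` above are used as regular-expression patterns: keys matching them are skipped when saving and filtered out of the missing/unexpected-key warnings when a checkpoint is loaded. A small self-contained sketch of roughly that filtering (the second key name is illustrative)::

    import re

    keys_to_ignore = [
        "model.encoder.embed_positions.weight",
        "model.decoder.embed_positions.weight",
    ]
    missing_keys = [
        "model.encoder.embed_positions.weight",  # covered by the ignore list
        "model.encoder.layers.0.fc1.bias",       # genuinely missing, keep the warning
    ]
    reported = [
        k for k in missing_keys
        if not any(re.search(pattern, k) for pattern in keys_to_ignore)
    ]
    print(reported)  # ['model.encoder.layers.0.fc1.bias']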
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Optional - -from ...file_utils import add_start_docstrings -from ...tokenization_utils import BatchEncoding -from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING -from ...utils import logging -from ..xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer - - -logger = logging.get_logger(__name__) - -_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] -SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model" - -FAIRSEQ_LANGUAGE_CODES = [ - "ar_AR", - "cs_CZ", - "de_DE", - "en_XX", - "es_XX", - "et_EE", - "fi_FI", - "fr_XX", - "gu_IN", - "hi_IN", - "it_IT", - "ja_XX", - "kk_KZ", - "ko_KR", - "lt_LT", - "lv_LV", - "my_MM", - "ne_NP", - "nl_XX", - "ro_RO", - "ru_RU", - "si_LK", - "tr_TR", - "vi_VN", - "zh_CN", -] - - -class MBartTokenizer(XLMRobertaTokenizer): - """ - Construct an MBART tokenizer. - - :class:`~transformers.MBartTokenizer` is a subclass of :class:`~transformers.XLMRobertaTokenizer` and adds a new - :meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch` - - Refer to superclass :class:`~transformers.XLMRobertaTokenizer` for usage examples and documentation concerning the - initialization parameters and other methods. - - .. warning:: - - ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work - properly. - - The tokenization method is `` `` for source language documents, and `` - ``` for target language documents. - - Examples:: - - >>> from transformers import MBartTokenizer - >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro') - >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" - >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" - >>> batch: dict = tokenizer.prepare_seq2seq_batch( - ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt" - ... 
) - - """ - - vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} - max_model_input_sizes = {m: 1024 for m in _all_mbart_models} - pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} - - prefix_tokens: List[int] = [] - suffix_tokens: List[int] = [] - - def __init__(self, *args, tokenizer_file=None, **kwargs): - super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) - - self.sp_model_size = len(self.sp_model) - self.lang_code_to_id = { - code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES) - } - self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()} - self.cur_lang_code = self.lang_code_to_id["en_XX"] - self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset - - self.fairseq_tokens_to_ids.update(self.lang_code_to_id) - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self._additional_special_tokens = list(self.lang_code_to_id.keys()) - self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) - - @property - def vocab_size(self): - return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 # Plus 1 for the mask token - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence: - - - ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` - - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]`` - - BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a - separator. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. 
- - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 + self.suffix_tokens - # We don't expect to process pairs, but leave the pair logic for API consistency - return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - src_lang: str = "en_XX", - tgt_texts: Optional[List[str]] = None, - tgt_lang: str = "ro_RO", - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - truncation: bool = True, - padding: str = "longest", - return_tensors: Optional[str] = None, - add_prefix_space: bool = False, # ignored - **kwargs, - ) -> BatchEncoding: - if max_length is None: - max_length = self.model_max_length - self.set_src_lang_special_tokens(src_lang) - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - self.set_tgt_lang_special_tokens(tgt_lang) - - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=True, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - self.set_src_lang_special_tokens(src_lang) # sets to src_lang - return model_inputs - - def set_src_lang_special_tokens(self, src_lang) -> None: - """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].""" - self.cur_lang_code = self.lang_code_to_id[src_lang] - self.prefix_tokens = [] - self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - - def set_tgt_lang_special_tokens(self, lang: str) -> None: - """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" - self.cur_lang_code = self.lang_code_to_id[lang] - self.prefix_tokens = [] - self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] diff --git a/src/transformers/models/old_mbart/tokenization_mbart_fast.py b/src/transformers/models/old_mbart/tokenization_mbart_fast.py deleted file mode 100644 index 56e7c065f39146..00000000000000 --- a/src/transformers/models/old_mbart/tokenization_mbart_fast.py +++ /dev/null @@ -1,248 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
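With the source-language setting active (the default `en_XX` configured in `__init__` above), `build_inputs_with_special_tokens` appends `suffix_tokens = [eos, src_lang_code]` and uses no prefix, so encoded sequences end with the EOS token followed by the language code rather than starting with BOS. A quick sketch of that layout, with the checkpoint name taken from the class docstring::

    from transformers import MBartTokenizer

    tok = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
    ids = tok("UN Chief Says There Is No Military Solution in Syria")["input_ids"]

    # prefix_tokens is empty, suffix_tokens == [eos_token_id, cur_lang_code]
    print(tok.convert_ids_to_tokens(ids[-2:]))  # ['</s>', 'en_XX']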
- -from typing import List, Optional - -from tokenizers import processors - -from ...file_utils import add_start_docstrings, is_sentencepiece_available -from ...tokenization_utils import BatchEncoding -from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING -from ...utils import logging -from ..xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast - - -if is_sentencepiece_available(): - from .tokenization_mbart import MBartTokenizer -else: - MBartTokenizer = None - - -logger = logging.get_logger(__name__) - -_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] -SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model" -tokenizer_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json" - -FAIRSEQ_LANGUAGE_CODES = [ - "ar_AR", - "cs_CZ", - "de_DE", - "en_XX", - "es_XX", - "et_EE", - "fi_FI", - "fr_XX", - "gu_IN", - "hi_IN", - "it_IT", - "ja_XX", - "kk_KZ", - "ko_KR", - "lt_LT", - "lv_LV", - "my_MM", - "ne_NP", - "nl_XX", - "ro_RO", - "ru_RU", - "si_LK", - "tr_TR", - "vi_VN", - "zh_CN", -] - - -class MBartTokenizerFast(XLMRobertaTokenizerFast): - """ - Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE - `__. - - :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds - a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`. - - Refer to superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning - the initialization parameters and other methods. - - .. warning:: - ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work - properly. - - The tokenization method is `` `` for source language documents, and `` - ``` for target language documents. - - Examples:: - - >>> from transformers import MBartTokenizerFast - >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro') - >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" - >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria" - >>> batch: dict = tokenizer.prepare_seq2seq_batch( - ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt" - ... ) - """ - - vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} - max_model_input_sizes = {m: 1024 for m in _all_mbart_models} - pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} - slow_tokenizer_class = MBartTokenizer - - prefix_tokens: List[int] = [] - suffix_tokens: List[int] = [] - - def __init__(self, *args, tokenizer_file=None, **kwargs): - super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) - - self.cur_lang_code = self.convert_tokens_to_ids("en_XX") - self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) - - self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES}) - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. 
- token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. The special tokens depend on calling set_lang. - - An MBART sequence has the following format, where ``X`` represents the sequence: - - - ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` - - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]`` - - BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a - separator. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 + self.suffix_tokens - # We don't expect to process pairs, but leave the pair logic for API consistency - return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - src_lang: str = "en_XX", - tgt_texts: Optional[List[str]] = None, - tgt_lang: str = "ro_RO", - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - truncation: bool = True, - padding: str = "longest", - return_tensors: str = None, - **kwargs, - ) -> BatchEncoding: - if max_length is None: - max_length = self.model_max_length - self.set_src_lang_special_tokens(src_lang) - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - self.set_tgt_lang_special_tokens(tgt_lang) - - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=True, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - self.set_src_lang_special_tokens(src_lang) # sets to src_lang - return model_inputs - - def set_src_lang_special_tokens(self, src_lang) -> None: - """Reset the special tokens to the source lang setting. 
No prefix and suffix=[eos, src_lang_code].""" - self.cur_lang_code = self.convert_tokens_to_ids(src_lang) - self.prefix_tokens = [] - self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - - prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) - suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) - - self._tokenizer.post_processor = processors.TemplateProcessing( - single=prefix_tokens_str + ["$A"] + suffix_tokens_str, - pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, - special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), - ) - - def set_tgt_lang_special_tokens(self, lang: str) -> None: - """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" - self.cur_lang_code = self.convert_tokens_to_ids(lang) - self.prefix_tokens = [] - self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - - prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) - suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) - - self._tokenizer.post_processor = processors.TemplateProcessing( - single=prefix_tokens_str + ["$A"] + suffix_tokens_str, - pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, - special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), - ) diff --git a/src/transformers/models/old_pegasus/__init__.py b/src/transformers/models/old_pegasus/__init__.py deleted file mode 100644 index 20d1c3872dc1c3..00000000000000 --- a/src/transformers/models/old_pegasus/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# flake8: noqa -# There's no way to ignore "F401 '...' imported but unused" warnings in this -# module, but to preserve other warnings. So, don't check this module at all. - -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available -from .configuration_pegasus import PegasusConfig - - -if is_sentencepiece_available(): - from .tokenization_pegasus import PegasusTokenizer - -if is_tokenizers_available(): - from .tokenization_pegasus_fast import PegasusTokenizerFast - -if is_torch_available(): - from .modeling_pegasus import PegasusForConditionalGeneration, PegasusModel - -if is_tf_available(): - from .modeling_tf_pegasus import TFPegasusForConditionalGeneration diff --git a/src/transformers/models/old_pegasus/configuration_pegasus.py b/src/transformers/models/old_pegasus/configuration_pegasus.py deleted file mode 100644 index 585f06ddb46e6c..00000000000000 --- a/src/transformers/models/old_pegasus/configuration_pegasus.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Google and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
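Unlike the slow tokenizer, `MBartTokenizerFast` above applies its special tokens inside the Rust post-processor, so switching the source or target language means rebuilding a `TemplateProcessing` object. A stripped-down sketch of the object that `set_src_lang_special_tokens` installs (the token ids are placeholders for illustration; the real ones come from the tokenizer's vocabulary)::

    from tokenizers import processors

    suffix_tokens_str = ["</s>", "en_XX"]
    suffix_token_ids = [2, 250004]  # placeholder ids, not taken from a real vocab file

    post_processor = processors.TemplateProcessing(
        single=["$A"] + suffix_tokens_str,
        pair=["$A", "$B"] + suffix_tokens_str,
        special_tokens=list(zip(suffix_tokens_str, suffix_token_ids)),
    )
    # installed as: tokenizer._tokenizer.post_processor = post_processor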
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PEGASUS model configuration """ - -from ...utils import logging -from ..bart.configuration_bart import BartConfig - - -logger = logging.get_logger(__name__) - -# These config values do not vary between checkpoints -DEFAULTS = dict( - vocab_size=96103, - max_position_embeddings=512, - d_model=1024, - encoder_ffn_dim=4096, - decoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_attention_heads=16, - encoder_layers=16, - decoder_layers=16, - dropout=0.1, - attention_dropout=0.1, - activation_dropout=0.1, - pad_token_id=0, - eos_token_id=1, - is_encoder_decoder=True, - normalize_before=True, - scale_embedding=True, - normalize_embedding=False, - add_final_layer_norm=True, - static_position_embeddings=True, - num_beams=8, - activation_function="relu", -) -# Config values that vary between checkpoints: for testing and conversion -task_specific_params = { - # These are task specific params for pegasus-large and normal params for finetuned checkpoints - "summarization_xsum": {"length_penalty": 0.6, "max_length": 64, "max_position_embeddings": 512}, - "summarization_cnn_dailymail": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 1024}, - "summarization_newsroom": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 512}, - "summarization_wikihow": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 512}, - "summarization_multi_news": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, - "summarization_reddit_tifu": {"length_penalty": 0.6, "max_length": 128, "max_position_embeddings": 512}, - "summarization_big_patent": {"length_penalty": 0.7, "max_length": 256, "max_position_embeddings": 1024}, - "summarization_arxiv": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, - "summarization_pubmed": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, - "summarization_gigaword": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 128}, - "summarization_aeslc": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 512}, - "summarization_billsum": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 1024}, - # this last entry is useless -- just for consistency - "summarization_large": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024}, -} - - -class PegasusConfig(BartConfig): - """ - This is the configuration class to store the configuration of a - :class:`~transformers.PegasusForConditionalGeneration`. It is used to instantiate a Pegasus model according to the - specified arguments, defining the model architecture. - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model - outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. - - Args: - vocab_size (:obj:`int`, `optional`, defaults to 96103): - Vocabulary size of the Pegasus model. 
Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusForConditionalGeneration`. - d_model (:obj:`int`, `optional`, defaults to 1024): - Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 16): - Number of encoder layers. - decoder_layers (:obj:`int`, `optional`, defaults to 16): - Number of decoder layers. - encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): - Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): - Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. - encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. - activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. - dropout (:obj:`float`, `optional`, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for the attention probabilities. - activation_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for activations inside the fully connected layer. - classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): - The dropout ratio for classifier. - max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - init_std (:obj:`float`, `optional`, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`): - This should be completed, specific to marian. - normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`): - Call layernorm before attention ops. - normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): - Call layernorm after embeddings. - static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): - Don't learn positional embeddings, use sinusoidal. - add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`): - Why not add another layernorm? - scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`): - Scale embeddings by diving by sqrt(d_model). - eos_token_id (:obj:`int`, `optional`, defaults to 2) - End of stream token id. - pad_token_id (:obj:`int`, `optional`, defaults to 1) - Padding token id. - bos_token_id (:obj:`int`, `optional`, defaults to 0) - Beginning of stream token id. - encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. - decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): - The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. 
- extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2): - How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart. - is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether this is an encoder/decoder model - force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``). - """ - - model_type = "pegasus" - keys_to_ignore_at_inference = ["past_key_values"] - # The implementation of the config object is in BartConfig diff --git a/src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py b/src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py deleted file mode 100644 index 9254a0ba941100..00000000000000 --- a/src/transformers/models/old_pegasus/convert_pegasus_tf_to_pytorch.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Google and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -from pathlib import Path -from typing import Dict - -import tensorflow as tf -import torch -from tqdm import tqdm - -from transformers import PegasusConfig, PegasusForConditionalGeneration, PegasusTokenizer -from transformers.models.pegasus.configuration_pegasus import DEFAULTS, task_specific_params - - -PATTERNS = [ - # replace left string with right string to get the relevant state_dict key (identical state dict to bart) - ["memory_attention", "encoder_attn"], - ["attention", "attn"], - ["/", "."], - [".LayerNorm.gamma", "_layer_norm.weight"], - [".LayerNorm.beta", "_layer_norm.bias"], - ["r.layer_", "r.layers."], - ["output_proj", "out_proj"], - ["ffn.dense_1.", "fc2."], - ["ffn.dense.", "fc1."], - ["ffn_layer_norm", "final_layer_norm"], - ["kernel", "weight"], - ["encoder_layer_norm.", "encoder.layer_norm."], - ["decoder_layer_norm.", "decoder.layer_norm."], - ["embeddings.weights", "shared.weight"], -] - - -def rename_state_dict_key(k): - - for pegasus_name, hf_name in PATTERNS: - k = k.replace(pegasus_name, hf_name) - return k - - -# See appendix C of paper for all hyperparams - - -def convert_pegasus(tf_weights: dict, cfg_updates: dict) -> PegasusForConditionalGeneration: - cfg_kwargs = DEFAULTS.copy() - cfg_kwargs.update(cfg_updates) - cfg = PegasusConfig(**cfg_kwargs) - torch_model = PegasusForConditionalGeneration(cfg) - sd = torch_model.model.state_dict() - mapping = {} - for k, v in tf_weights.items(): - new_k = rename_state_dict_key(k) - if new_k not in sd: - raise ValueError(f"could not find new key {new_k} in state dict. 
(converted from {k})") - - if "dense" in k or "proj" in new_k: - v = v.T - mapping[new_k] = torch.tensor(v, dtype=sd[new_k].dtype) - assert v.shape == sd[new_k].shape, f"{new_k}, {k}, {v.shape}, {sd[new_k].shape}" - # make sure embedding.padding_idx is respected - mapping["shared.weight"][cfg.pad_token_id] = torch.zeros_like(mapping["shared.weight"][cfg.pad_token_id + 1]) - mapping["encoder.embed_tokens.weight"] = mapping["shared.weight"] - mapping["decoder.embed_tokens.weight"] = mapping["shared.weight"] - empty_biases = {k: torch.zeros_like(v) for k, v in sd.items() if k.endswith("bias") and k not in mapping} - mapping.update(**empty_biases) - missing, extra = torch_model.model.load_state_dict(mapping, strict=False) - unexpected_missing = [ - k for k in missing if k not in ["encoder.embed_positions.weight", "decoder.embed_positions.weight"] - ] - assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}" - assert extra == [], f"no matches found for the following tf keys {extra}" - return torch_model - - -def get_tf_weights_as_numpy(path="./ckpt/aeslc/model.ckpt-32000") -> Dict: - init_vars = tf.train.list_variables(path) - tf_weights = {} - ignore_name = ["Adafactor", "global_step"] - for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"): - skip_key = any([pat in name for pat in ignore_name]) - if skip_key: - continue - array = tf.train.load_variable(path, name) - tf_weights[name] = array - return tf_weights - - -def convert_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str): - # save tokenizer first - dataset = Path(ckpt_path).parent.name - desired_max_model_length = task_specific_params[f"summarization_{dataset}"]["max_position_embeddings"] - tok = PegasusTokenizer.from_pretrained("sshleifer/pegasus", model_max_length=desired_max_model_length) - assert tok.model_max_length == desired_max_model_length - tok.save_pretrained(save_dir) - - # convert model - tf_weights = get_tf_weights_as_numpy(ckpt_path) - cfg_updates = task_specific_params[f"summarization_{dataset}"] - if dataset == "large": - cfg_updates["task_specific_params"] = task_specific_params - torch_model = convert_pegasus(tf_weights, cfg_updates) - torch_model.save_pretrained(save_dir) - sd = torch_model.state_dict() - sd.pop("model.decoder.embed_positions.weight") - sd.pop("model.encoder.embed_positions.weight") - torch.save(sd, Path(save_dir) / "pytorch_model.bin") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("tf_ckpt_path", type=str, help="passed to tf.train.list_variables") - parser.add_argument("save_dir", default=None, type=str, help="Path to the output PyTorch model.") - args = parser.parse_args() - if args.save_dir is None: - dataset = Path(args.tf_ckpt_path).parent.name - args.save_dir = os.path.join("pegasus", dataset) - convert_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir) diff --git a/src/transformers/models/old_pegasus/modeling_pegasus.py b/src/transformers/models/old_pegasus/modeling_pegasus.py deleted file mode 100644 index c7fde4164330db..00000000000000 --- a/src/transformers/models/old_pegasus/modeling_pegasus.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Google and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
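The `PATTERNS` table above is applied by `rename_state_dict_key` as a chain of plain string replacements, in order. Tracing it on a made-up TF-style variable name (not necessarily an exact key from a real Pegasus checkpoint) shows how a name is mapped onto the BART state-dict convention::

    # Module path as in the released library; the function itself is shown above.
    from transformers.models.pegasus.convert_pegasus_tf_to_pytorch import rename_state_dict_key

    print(rename_state_dict_key("encoder/layer_0/ffn/dense_1/kernel"))
    # '/' -> '.', 'r.layer_' -> 'r.layers.', 'ffn.dense_1.' -> 'fc2.', 'kernel' -> 'weight'
    # => 'encoder.layers.0.fc2.weight'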
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch Pegasus model, ported from https://github.com/google-research/pegasus""" - - -from ...file_utils import add_start_docstrings -from ..bart.modeling_bart import BART_START_DOCSTRING, BartForConditionalGeneration, BartModel -from .configuration_pegasus import PegasusConfig - - -@add_start_docstrings( - "The bare Pegasus Model transformer outputting raw hidden-states without any specific head on top.", - BART_START_DOCSTRING, -) -class PegasusModel(BartModel): - r""" - This class overrides :class:`~transformers.BartModel`. Please check the superclass for the appropriate - documentation alongside usage examples. - """ - - config_class = PegasusConfig - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder\.version", - r"decoder\.version", - "encoder.embed_positions", - "decoder.embed_positions", - ] - _keys_to_ignore_on_save = [ - "encoder.embed_positions.weight", - "decoder.embed_positions.weight", - ] - - -@add_start_docstrings("The Pegasus Model for summarization ", BART_START_DOCSTRING) -class PegasusForConditionalGeneration(BartForConditionalGeneration): - r""" - Pytorch version of google's pegasus model for summarization. Available models are listed `here - `__. - - This class overrides :class:`~transformers.BartForConditionalGeneration`. Please check the superclass for the - appropriate documentation alongside usage examples. - - Examples:: - - >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration - >>> from typing import List - >>> PGE_ARTICLE = "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." - >>> mname = "google/pegasus-xsum" - - >>> model = PegasusForConditionalGeneration.from_pretrained(mname) - >>> tok = PegasusTokenizer.from_pretrained(mname) - >>> batch = tok.prepare_seq2seq_batch(src_texts=[PGE_ARTICLE], return_tensors="pt") # don't need tgt_text for inference - >>> gen = model.generate(**batch) # for forward pass: model(**batch) - >>> summary: List[str] = tok.batch_decode(gen, skip_special_tokens=True) - >>> assert summary == "California's largest electricity provider has turned off power to tens of thousands of customers." 
- - """ - # All the code is in src/transformers/models/bart/modeling_bart.py - config_class = PegasusConfig - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder\.version", - r"decoder\.version", - "model.encoder.embed_positions", - "model.decoder.embed_positions", - ] - _keys_to_ignore_on_save = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - ] diff --git a/src/transformers/models/old_pegasus/modeling_tf_pegasus.py b/src/transformers/models/old_pegasus/modeling_tf_pegasus.py deleted file mode 100644 index bec856575d191a..00000000000000 --- a/src/transformers/models/old_pegasus/modeling_tf_pegasus.py +++ /dev/null @@ -1,41 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""TF Pegasus model, ported from the fairseq repo.""" -from ...file_utils import add_start_docstrings -from ...utils import logging -from ..bart.modeling_tf_bart import BART_START_DOCSTRING, TFBartForConditionalGeneration -from .configuration_pegasus import PegasusConfig - - -_CONFIG_FOR_DOC = "PegasusConfig" - -START_DOCSTRING = BART_START_DOCSTRING.replace( - "inherits from :class:`~transformers.TFPreTrainedModel`", - "inherits from :class:`~transformers.TFBartForConditionalGeneration`", -).replace("BartConfig", _CONFIG_FOR_DOC) - - -logger = logging.get_logger(__name__) - - -@add_start_docstrings("Pegasus model for summarization", START_DOCSTRING) -class TFPegasusForConditionalGeneration(TFBartForConditionalGeneration): - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"model.encoder.embed_positions.weight", - r"model.decoder.embed_positions.weight", - ] - config_class = PegasusConfig - # All the code is in src/transformers/models/bart/modeling_tf_bart.py diff --git a/src/transformers/models/old_pegasus/tokenization_pegasus.py b/src/transformers/models/old_pegasus/tokenization_pegasus.py deleted file mode 100644 index 099bdf3e7b3136..00000000000000 --- a/src/transformers/models/old_pegasus/tokenization_pegasus.py +++ /dev/null @@ -1,294 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Google and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
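# A minimal sketch of the weight-conversion pattern used by the Pegasus
# conversion script deleted above, assuming `tf_weights` is a plain dict of
# NumPy arrays keyed by variable name (the helper below is illustrative and
# not part of the library API).
import numpy as np
import torch


def tf_dict_to_state_dict(tf_weights: dict) -> dict:
    state_dict = {}
    for name, array in tf_weights.items():
        if "embed_positions" in name:
            # Static sinusoidal position embeddings are recomputed at load
            # time, which is also why they appear in _keys_to_ignore_on_save.
            continue
        tensor = torch.from_numpy(np.asarray(array))
        if tensor.ndim == 2 and ("dense" in name or "proj" in name):
            # TF Dense kernels are stored as (in_features, out_features);
            # torch.nn.Linear expects (out_features, in_features).
            tensor = tensor.T
        state_dict[name] = tensor
    # Tie the shared token embedding into encoder and decoder, mirroring the
    # mapping built in convert_pegasus above.
    if "shared.weight" in state_dict:
        state_dict["encoder.embed_tokens.weight"] = state_dict["shared.weight"]
        state_dict["decoder.embed_tokens.weight"] = state_dict["shared.weight"]
    return state_dict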
-import os -from shutil import copyfile -from typing import Dict, List, Optional, Tuple - -import sentencepiece as spm - -from ...file_utils import add_start_docstrings -from ...tokenization_utils import PreTrainedTokenizer -from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding -from ...utils import logging - - -SPIECE_UNDERLINE = "▁" - -VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"} -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "google/pegasus-xsum": 512, -} - - -logger = logging.get_logger(__name__) - - -class PegasusTokenizer(PreTrainedTokenizer): - r""" - Construct a PEGASUS tokenizer. Based on `SentencePiece `__. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - `SentencePiece `__ file (generally has a `.spm` extension) that - contains the vocabulary necessary to instantiate a tokenizer. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end of - sequence. The token used is the :obj:`sep_token`. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking single token values. This is the token used when training this model with masked - language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. - It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive - Summarization `__. - mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking whole target sentences. This is the token used when training this model with gap - sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during - pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for - Abstractive Summarization `__. - additional_special_tokens (:obj:`List[str]`, `optional`): - Additional special tokens used by the tokenizer. 
If no additional_special_tokens are provided and - are used as additional special tokens corresponding to the `original PEGASUS - tokenizer - `__ - that uses the tokens 2 - 104 only for pretraining - """ - vocab_files_names = VOCAB_FILES_NAMES - - offset = 103 # entries 2 - 104 are only used for pretraining - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - - def __init__( - self, - vocab_file, - pad_token="", - eos_token="", - unk_token="", - mask_token="", - mask_token_sent="", - additional_special_tokens=None, - **kwargs - ): - if additional_special_tokens is not None: - assert isinstance( - additional_special_tokens, list - ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}" - - additional_special_tokens_extended = ( - ([mask_token_sent] + additional_special_tokens) - if mask_token_sent not in additional_special_tokens - else additional_special_tokens - ) - # fill additional tokens with ..., in case not all additional tokens are already taken - additional_special_tokens_extended += [ - f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) - ] - - if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): - raise ValueError( - f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}." - ) - additional_special_tokens = additional_special_tokens_extended - else: - additional_special_tokens = [mask_token_sent] - additional_special_tokens += [f"" for i in range(2, self.offset)] - - super().__init__( - eos_token=eos_token, - unk_token=unk_token, - mask_token=mask_token, - pad_token=pad_token, - mask_token_sent=mask_token_sent, - additional_special_tokens=additional_special_tokens, - **kwargs, - ) - self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() - self.sp_model.Load(vocab_file) - self.mask_token_sent = mask_token_sent - - # add special tokens to encoder dict - self.encoder: Dict[int, str] = { - 0: self.pad_token, - 1: self.eos_token, - 2: self.mask_token_sent, - 3: self.mask_token, - } - # entries 2-104 are only used for pretraining and called , , unk_2, ...unk_102 - # mask_token_sent is already added to list -> so start at 1 - self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)}) - self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()} - - @property - def vocab_size(self) -> int: - return len(self.sp_model) + self.offset - - def get_vocab(self) -> Dict[str, int]: - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() - self.sp_model.Load(self.vocab_file) - - def _tokenize(self, text, sample=False): - """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces - - def _convert_token_to_id(self, token: str) -> int: - """ Converts a token (str) to an id using the vocab. 
""" - if token in self.decoder: - return self.decoder[token] - elif token in self.added_tokens_decoder: - return self.added_tokens_decoder[token] - sp_id = self.sp_model.piece_to_id(token) - return sp_id + self.offset - - def _convert_id_to_token(self, index: int) -> str: - """Converts an index (integer) to a token (str) using the vocab.""" - if index in self.encoder: - return self.encoder[index] - elif index in self.added_tokens_encoder: - return self.added_tokens_encoder[index] - else: - token = self.sp_model.IdToPiece(index - self.offset) - return token - - def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. """ - out_string = self.sp_model.decode_pieces(tokens) - return out_string - - def num_special_tokens_to_add(self, pair=False): - """Just EOS""" - return 1 - - def _special_token_mask(self, seq): - all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp - all_special_ids.remove(self.unk_token_id) # is only sometimes special - - assert all_special_ids == set( - range(len(self.additional_special_tokens) + 3) - ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" - - return [1 if x in all_special_ids else 0 for x in seq] - - def get_special_tokens_mask( - self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" - if already_has_special_tokens: - return self._special_token_mask(token_ids_0) - elif token_ids_1 is None: - return self._special_token_mask(token_ids_0) + [1] - else: - return self._special_token_mask(token_ids_0 + token_ids_1) + [1] - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating - and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence: - - - single sequence: ``X `` - - pair of sequences: ``A B `` (not intended use) - - BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a - separator. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
- """ - if token_ids_1 is None: - return token_ids_0 + [self.eos_token_id] - # We don't expect to process pairs, but leave the pair logic for API consistency - return token_ids_0 + token_ids_1 + [self.eos_token_id] - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - return_tensors: str = None, - truncation=True, - padding="longest", - **unused, - ) -> BatchEncoding: - if "" in src_texts: - raise ValueError(f"found empty string in src_texts: {src_texts}") - tokenizer_kwargs = dict( - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - truncation=truncation, - padding=padding, - ) - model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) - if tgt_texts is None: - return model_inputs - if max_target_length is not None: - tokenizer_kwargs["max_length"] = max_target_length - labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] - model_inputs["labels"] = labels - return model_inputs - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) diff --git a/src/transformers/models/old_pegasus/tokenization_pegasus_fast.py b/src/transformers/models/old_pegasus/tokenization_pegasus_fast.py deleted file mode 100644 index c9b0d076314057..00000000000000 --- a/src/transformers/models/old_pegasus/tokenization_pegasus_fast.py +++ /dev/null @@ -1,232 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Google and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
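# A small, self-contained illustration of the id layout implemented by the
# PegasusTokenizer above: ids 0 and 1 are the pad and eos tokens, ids 2-104
# are mask/unk tokens reserved for pretraining, and every SentencePiece piece
# is shifted by `offset`. The `reserved`/`reserved_inv` dicts are hypothetical
# stand-ins for the encoder/decoder maps built in __init__.
import sentencepiece as spm

OFFSET = 103  # entries 2-104 are only used for pretraining


def token_to_id(token: str, reserved: dict, sp_model: spm.SentencePieceProcessor) -> int:
    if token in reserved:  # the pad, eos, sentence-mask, token-mask and unk_N tokens
        return reserved[token]
    return sp_model.piece_to_id(token) + OFFSET


def id_to_token(index: int, reserved_inv: dict, sp_model: spm.SentencePieceProcessor) -> str:
    if index in reserved_inv:
        return reserved_inv[index]
    return sp_model.IdToPiece(index - OFFSET)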
-""" Tokenization class for model PEGASUS.""" - - -import os -from shutil import copyfile -from typing import List, Optional, Tuple - -from ...file_utils import add_start_docstrings, is_sentencepiece_available -from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging - - -if is_sentencepiece_available(): - from .tokenization_pegasus import PegasusTokenizer -else: - PegasusTokenizer = None - - -logger = logging.get_logger(__name__) - - -SPIECE_UNDERLINE = "▁" - -VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}, - "tokenizer_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/tokenizer.json"}, -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "google/pegasus-xsum": 512, -} - - -class PegasusTokenizerFast(PreTrainedTokenizerFast): - r""" - Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram - `__. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - `SentencePiece `__ file (generally has a `.spm` extension) that - contains the vocabulary necessary to instantiate a tokenizer. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end of - sequence. The token used is the :obj:`sep_token`. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking single token values. This is the token used when training this model with masked - language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining. - It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive - Summarization `__. - mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking whole target sentences. This is the token used when training this model with gap - sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during - pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for - Abstractive Summarization `__. - additional_special_tokens (:obj:`List[str]`, `optional`): - Additional special tokens used by the tokenizer. 
If no additional_special_tokens are provided and - are used as additional special tokens corresponding to the `original PEGASUS - tokenizer - `__ - that uses the tokens 2 - 104 only for pretraining - """ - offset = 103 # entries 2-104 are only used for pretraining - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = PegasusTokenizer - model_input_names = ["attention_mask"] - - def __init__( - self, - vocab_file, - tokenizer_file=None, - pad_token="", - eos_token="", - unk_token="", - mask_token="", - mask_token_sent="", - additional_special_tokens=None, - **kwargs - ): - if additional_special_tokens is not None: - assert isinstance( - additional_special_tokens, list - ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}" - - additional_special_tokens_extended = ( - ([mask_token_sent] + additional_special_tokens) - if mask_token_sent not in additional_special_tokens - else additional_special_tokens - ) - # fill additional tokens with ..., in case not all additional tokens are already taken - additional_special_tokens_extended += [ - f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) - ] - - if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended): - raise ValueError( - f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of tokens. Found {additional_special_tokens_extended}." - ) - additional_special_tokens = additional_special_tokens_extended - else: - additional_special_tokens = [mask_token_sent] - additional_special_tokens += [f"" for i in range(2, self.offset)] - - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - pad_token=pad_token, - eos_token=eos_token, - unk_token=unk_token, - mask_token=mask_token, - mask_token_sent=mask_token_sent, - additional_special_tokens=additional_special_tokens, - **kwargs, - ) - - self.vocab_file = vocab_file - - def _special_token_mask(self, seq): - all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp - all_special_ids.remove(self.unk_token_id) # is only sometimes special - - assert all_special_ids == set( - range(len(self.additional_special_tokens) + 3) - ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}" - - return [1 if x in all_special_ids else 0 for x in seq] - - def get_special_tokens_mask( - self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" - if already_has_special_tokens: - return self._special_token_mask(token_ids_0) - elif token_ids_1 is None: - return self._special_token_mask(token_ids_0) + [1] - else: - return self._special_token_mask(token_ids_0 + token_ids_1) + [1] - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: - """ - Build model inputs from a sequence by adding eos to the end. no bos token is added to the front. - - - single sequence: ``X `` - - pair of sequences: ``A B `` (not intended use) - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. 
- - Returns: - :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - if token_ids_1 is None: - return token_ids_0 + [self.eos_token_id] - # We don't expect to process pairs, but leave the pair logic for API consistency - return token_ids_0 + token_ids_1 + [self.eos_token_id] - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - return_tensors: str = None, - truncation=True, - padding="longest", - **unused, - ) -> BatchEncoding: - if "" in src_texts: - raise ValueError(f"found empty string in src_texts: {src_texts}") - tokenizer_kwargs = dict( - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - truncation=truncation, - padding=padding, - ) - model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) - if tgt_texts is None: - return model_inputs - if max_target_length is not None: - tokenizer_kwargs["max_length"] = max_target_length - labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] - model_inputs["labels"] = labels - return model_inputs - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) diff --git a/tests/test_modeling_old_blenderbot.py b/tests/test_modeling_old_blenderbot.py deleted file mode 100644 index 668569a59553b9..00000000000000 --- a/tests/test_modeling_old_blenderbot.py +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env python3 -# coding=utf-8 -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
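# A hedged usage sketch for prepare_seq2seq_batch as defined above; the
# checkpoint name and texts are taken from examples elsewhere in this patch
# and are illustrative only.
from transformers import PegasusTokenizerFast

tok = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")
batch = tok.prepare_seq2seq_batch(
    src_texts=["PG&E stated it scheduled the blackouts in response to forecasts for high winds."],
    tgt_texts=["California's largest electricity provider has turned off power to tens of thousands of customers."],
    max_length=512,
    return_tensors="pt",
)
# `batch` now holds "input_ids" and "attention_mask" for the source text plus
# "labels" containing the tokenized target, ready for a training forward pass
# through PegasusForConditionalGeneration.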
-"""Tests for BlenderBot""" -import unittest - -from transformers import is_torch_available -from transformers.file_utils import cached_property -from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_torch_available(): - import torch - - from transformers import ( - AutoModelForSeq2SeqLM, - AutoTokenizer, - BlenderbotConfig, - BlenderbotForConditionalGeneration, - BlenderbotModel, - BlenderbotSmallTokenizer, - BlenderbotTokenizer, - ) - -TOK_DECODE_KW = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True) -FASTER_GEN_KWARGS = dict(num_beams=1, early_stopping=True, min_length=15, max_length=25) - - -@require_torch -class BlenderbotModelTester: - # Required attributes - vocab_size = 99 - batch_size = 13 - seq_length = 7 - num_hidden_layers = 2 - hidden_size = 16 - num_attention_heads = 4 - is_training = True - - def __init__(self, parent): - torch.manual_seed(0) - self.parent = parent - self.config = BlenderbotConfig( - d_model=self.hidden_size, - dropout=0.0, - activation_function="gelu", - vocab_size=self.vocab_size, - encoder_layers=self.num_hidden_layers, - decoder_layers=self.num_hidden_layers, - encoder_attention_heads=self.num_attention_heads, - decoder_attention_heads=self.num_attention_heads, - attention_dropout=0.0, - encoder_ffn_dim=4, - decoder_ffn_dim=4, - do_blenderbot_90_layernorm=False, - normalize_before=True, - max_position_embeddings=50, - static_position_embeddings=False, - scale_embedding=True, - bos_token_id=0, - eos_token_id=2, - pad_token_id=1, - num_beams=1, - min_length=3, - max_length=10, - ) - - def prepare_config_and_inputs_for_common(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - return self.config, inputs_dict - - -@require_torch -class BlenderbotTesterMixin(ModelTesterMixin, unittest.TestCase): - if is_torch_available(): - all_generative_model_classes = (BlenderbotForConditionalGeneration,) - all_model_classes = (BlenderbotForConditionalGeneration, BlenderbotModel) - else: - all_generative_model_classes = () - all_model_classes = () - is_encoder_decoder = True - test_head_masking = False - test_pruning = False - test_missing_keys = False - test_torchscript = False - - def setUp(self): - self.model_tester = BlenderbotModelTester(self) - self.config_tester = ConfigTester(self, config_class=BlenderbotConfig) - - def test_initialization_module(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = BlenderbotForConditionalGeneration(config).model - model.to(torch_device) - model.eval() - enc_embeds = model.encoder.embed_tokens.weight - assert (enc_embeds == model.shared.weight).all().item() - self.assertAlmostEqual(torch.std(enc_embeds).item(), config.init_std, 2) - - def test_embed_pos_shape(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = BlenderbotForConditionalGeneration(config) - expected_shape = (config.max_position_embeddings + config.extra_pos_embeddings, config.d_model) - assert model.model.encoder.embed_positions.weight.shape == expected_shape - model.model.decoder.embed_positions.weight.shape == expected_shape - - @unittest.skip("This test is flaky") - def 
test_feed_forward_chunking(self): - pass - - @unittest.skip("TODO: Decoder embeddings cannot be resized at the moment") - def test_resize_embeddings_untied(self): - pass - - -@unittest.skipUnless(torch_device != "cpu", "3B test too slow on CPU.") -@require_torch -@require_sentencepiece -@require_tokenizers -class Blenderbot3BIntegrationTests(unittest.TestCase): - ckpt = "facebook/blenderbot-3B" - - @cached_property - def tokenizer(self): - return BlenderbotTokenizer.from_pretrained(self.ckpt) - - @slow - def test_generation_from_short_input_same_as_parlai_3B(self): - torch.cuda.empty_cache() - model = BlenderbotForConditionalGeneration.from_pretrained(self.ckpt).half().to(torch_device) - - src_text = ["Sam"] - model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) - - generated_utterances = model.generate(**model_inputs, **FASTER_GEN_KWARGS) - tgt_text = 'Sam is a great name. It means "sun" in Gaelic.' - - generated_txt = self.tokenizer.batch_decode(generated_utterances, **TOK_DECODE_KW) - assert generated_txt[0].strip() == tgt_text - - src_text = "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like i'm going to throw up.\nand why is that?" - - model_inputs = self.tokenizer([src_text], return_tensors="pt").to(torch_device) - - generated_ids = model.generate(**model_inputs, **FASTER_GEN_KWARGS)[0] - reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW) - - assert "I think it's because we are so worried about what people think of us." == reply.strip() - del model - - -@require_torch -class Blenderbot90MIntegrationTests(unittest.TestCase): - ckpt = "facebook/blenderbot-90M" - - @cached_property - def model(self): - model = AutoModelForSeq2SeqLM.from_pretrained(self.ckpt).to(torch_device) - if torch_device == "cuda": - model = model.half() - return model - - @cached_property - def tokenizer(self): - return AutoTokenizer.from_pretrained(self.ckpt) - - @slow - def test_90_generation_from_long_input(self): - - src_text = [ - "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel like\ - i'm going to throw up.\nand why is that?" - ] - - model_inputs = self.tokenizer(src_text, return_tensors="pt").to(torch_device) - - # model does not have "token_type_ids" - model_inputs.pop("token_type_ids") - assert isinstance(self.tokenizer, BlenderbotSmallTokenizer) - generated_ids = self.model.generate(**model_inputs)[0] - reply = self.tokenizer.decode(generated_ids, **TOK_DECODE_KW) - - assert reply in ( - "i don't know. i just feel like i'm going to throw up. it's not fun.", - "i'm not sure. i just feel like i've been feeling like i have to be in a certain place", - ) - - def test_90_generation_from_short_input(self): - model_inputs = self.tokenizer(["sam"], return_tensors="pt").to(torch_device) - - # model does not have "token_type_ids" - model_inputs.pop("token_type_ids") - generated_utterances = self.model.generate(**model_inputs) - - clean_txt = self.tokenizer.decode(generated_utterances[0], **TOK_DECODE_KW) - assert clean_txt in ( - "have you ever been to a sam club? it's a great club in the south.", - "have you ever heard of sam harris? he's an american singer, songwriter, and actor.", - ) diff --git a/tests/test_modeling_old_marian.py b/tests/test_modeling_old_marian.py deleted file mode 100644 index 3fc3338fec6f49..00000000000000 --- a/tests/test_modeling_old_marian.py +++ /dev/null @@ -1,334 +0,0 @@ -# coding=utf-8 -# Copyright 2020 HuggingFace Inc. team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from transformers import AutoConfig, AutoTokenizer, MarianConfig, MarianTokenizer, is_torch_available -from transformers.file_utils import cached_property -from transformers.hf_api import HfApi -from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device - -from .test_modeling_common import ModelTesterMixin - - -if is_torch_available(): - import torch - - from transformers import AutoModelWithLMHead, MarianMTModel - from transformers.models.bart.modeling_bart import shift_tokens_right - from transformers.models.marian.convert_marian_to_pytorch import ( - ORG_NAME, - convert_hf_name_to_opus_name, - convert_opus_name_to_hf_name, - ) - from transformers.pipelines import TranslationPipeline - - -class ModelTester: - def __init__(self, parent): - self.config = MarianConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, - ) - - def prepare_config_and_inputs_for_common(self): - return self.config, {} - - -@require_torch -class SelectiveCommonTest(unittest.TestCase): - all_model_classes = (MarianMTModel,) if is_torch_available() else () - - test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save - - def setUp(self): - self.model_tester = ModelTester(self) - - -class ModelManagementTests(unittest.TestCase): - @slow - @require_torch - def test_model_names(self): - model_list = HfApi().model_list() - model_ids = [x.modelId for x in model_list if x.modelId.startswith(ORG_NAME)] - bad_model_ids = [mid for mid in model_ids if "+" in model_ids] - self.assertListEqual([], bad_model_ids) - self.assertGreater(len(model_ids), 500) - - -@require_torch -@require_sentencepiece -@require_tokenizers -class MarianIntegrationTest(unittest.TestCase): - src = "en" - tgt = "de" - src_text = [ - "I am a small frog.", - "Now I can forget the 100 words of german that I know.", - "Tom asked his teacher for advice.", - "That's how I would do it.", - "Tom really admired Mary's courage.", - "Turn around and close your eyes.", - ] - expected_text = [ - "Ich bin ein kleiner Frosch.", - "Jetzt kann ich die 100 Wörter des Deutschen vergessen, die ich kenne.", - "Tom bat seinen Lehrer um Rat.", - "So würde ich das machen.", - "Tom bewunderte Marias Mut wirklich.", - "Drehen Sie sich um und schließen Sie die Augen.", - ] - # ^^ actual C++ output differs slightly: (1) des Deutschen removed, (2) ""-> "O", (3) tun -> machen - - @classmethod - def setUpClass(cls) -> None: - cls.model_name = f"Helsinki-NLP/opus-mt-{cls.src}-{cls.tgt}" - return cls - - @cached_property - def tokenizer(self) -> MarianTokenizer: - return AutoTokenizer.from_pretrained(self.model_name) - - @property - def eos_token_id(self) -> int: - return self.tokenizer.eos_token_id - - @cached_property - def model(self): - 
model: MarianMTModel = AutoModelWithLMHead.from_pretrained(self.model_name).to(torch_device) - c = model.config - self.assertListEqual(c.bad_words_ids, [[c.pad_token_id]]) - self.assertEqual(c.max_length, 512) - self.assertEqual(c.decoder_start_token_id, c.pad_token_id) - - if torch_device == "cuda": - return model.half() - else: - return model - - def _assert_generated_batch_equal_expected(self, **tokenizer_kwargs): - generated_words = self.translate_src_text(**tokenizer_kwargs) - self.assertListEqual(self.expected_text, generated_words) - - def translate_src_text(self, **tokenizer_kwargs): - model_inputs = self.tokenizer.prepare_seq2seq_batch( - src_texts=self.src_text, return_tensors="pt", **tokenizer_kwargs - ).to(torch_device) - self.assertEqual(self.model.device, model_inputs.input_ids.device) - generated_ids = self.model.generate( - model_inputs.input_ids, attention_mask=model_inputs.attention_mask, num_beams=2, max_length=128 - ) - generated_words = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - return generated_words - - -@require_sentencepiece -@require_tokenizers -class TestMarian_EN_DE_More(MarianIntegrationTest): - @slow - def test_forward(self): - src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."] - expected_ids = [38, 121, 14, 697, 38848, 0] - - model_inputs: dict = self.tokenizer.prepare_seq2seq_batch(src, tgt_texts=tgt, return_tensors="pt").to( - torch_device - ) - - self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist()) - - desired_keys = { - "input_ids", - "attention_mask", - "labels", - } - self.assertSetEqual(desired_keys, set(model_inputs.keys())) - model_inputs["decoder_input_ids"] = shift_tokens_right(model_inputs.labels, self.tokenizer.pad_token_id) - model_inputs["return_dict"] = True - model_inputs["use_cache"] = False - with torch.no_grad(): - outputs = self.model(**model_inputs) - max_indices = outputs.logits.argmax(-1) - self.tokenizer.batch_decode(max_indices) - - def test_unk_support(self): - t = self.tokenizer - ids = t.prepare_seq2seq_batch(["||"], return_tensors="pt").to(torch_device).input_ids[0].tolist() - expected = [t.unk_token_id, t.unk_token_id, t.eos_token_id] - self.assertEqual(expected, ids) - - def test_pad_not_split(self): - input_ids_w_pad = ( - self.tokenizer.prepare_seq2seq_batch(["I am a small frog "], return_tensors="pt") - .input_ids[0] - .tolist() - ) - expected_w_pad = [38, 121, 14, 697, 38848, self.tokenizer.pad_token_id, 0] # pad - self.assertListEqual(expected_w_pad, input_ids_w_pad) - - @slow - def test_batch_generation_en_de(self): - self._assert_generated_batch_equal_expected() - - def test_auto_config(self): - config = AutoConfig.from_pretrained(self.model_name) - self.assertIsInstance(config, MarianConfig) - - -@require_sentencepiece -@require_tokenizers -class TestMarian_EN_FR(MarianIntegrationTest): - src = "en" - tgt = "fr" - src_text = [ - "I am a small frog.", - "Now I can forget the 100 words of german that I know.", - ] - expected_text = [ - "Je suis une petite grenouille.", - "Maintenant, je peux oublier les 100 mots d'allemand que je connais.", - ] - - @slow - def test_batch_generation_en_fr(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_FR_EN(MarianIntegrationTest): - src = "fr" - tgt = "en" - src_text = [ - "Donnez moi le micro.", - "Tom et Mary étaient assis à une table.", # Accents - ] - expected_text = [ - "Give me the microphone.", - "Tom and Mary were sitting at a table.", - ] - - 
@slow - def test_batch_generation_fr_en(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_RU_FR(MarianIntegrationTest): - src = "ru" - tgt = "fr" - src_text = ["Он показал мне рукопись своей новой пьесы."] - expected_text = ["Il m'a montré le manuscrit de sa nouvelle pièce."] - - @slow - def test_batch_generation_ru_fr(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_MT_EN(MarianIntegrationTest): - """Cover low resource/high perplexity setting. This breaks without adjust_logits_generation overwritten""" - - src = "mt" - tgt = "en" - src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."] - expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."] - - @slow - def test_batch_generation_mt_en(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_en_zh(MarianIntegrationTest): - src = "en" - tgt = "zh" - src_text = ["My name is Wolfgang and I live in Berlin"] - expected_text = ["我叫沃尔夫冈 我住在柏林"] - - @slow - def test_batch_generation_eng_zho(self): - self._assert_generated_batch_equal_expected() - - -@require_sentencepiece -@require_tokenizers -class TestMarian_en_ROMANCE(MarianIntegrationTest): - """Multilingual on target side.""" - - src = "en" - tgt = "ROMANCE" - src_text = [ - ">>fr<< Don't spend so much time watching TV.", - ">>pt<< Your message has been sent.", - ">>es<< He's two years older than me.", - ] - expected_text = [ - "Ne passez pas autant de temps à regarder la télé.", - "A sua mensagem foi enviada.", - "Es dos años más viejo que yo.", - ] - - @slow - def test_batch_generation_en_ROMANCE_multi(self): - self._assert_generated_batch_equal_expected() - - def test_tokenizer_handles_empty(self): - normalized = self.tokenizer.normalize("") - self.assertIsInstance(normalized, str) - with self.assertRaises(ValueError): - self.tokenizer.prepare_seq2seq_batch([""], return_tensors="pt") - - @slow - def test_pipeline(self): - device = 0 if torch_device == "cuda" else -1 - pipeline = TranslationPipeline(self.model, self.tokenizer, framework="pt", device=device) - output = pipeline(self.src_text) - self.assertEqual(self.expected_text, [x["translation_text"] for x in output]) - - -@require_torch -class TestConversionUtils(unittest.TestCase): - def test_renaming_multilingual(self): - old_names = [ - "opus-mt-cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", - "opus-mt-cmn+cn-fi", # no group - "opus-mt-en-de", # standard name - "opus-mt-en-de", # standard name - ] - expected = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"] - self.assertListEqual(expected, [convert_opus_name_to_hf_name(x) for x in old_names]) - - def test_undoing_renaming(self): - hf_names = ["opus-mt-ZH-fi", "opus-mt-cmn_cn-fi", "opus-mt-en-de", "opus-mt-en-de"] - converted_opus_names = [convert_hf_name_to_opus_name(x) for x in hf_names] - expected_opus_names = [ - "cmn+cn+yue+ze_zh+zh_cn+zh_CN+zh_HK+zh_tw+zh_TW+zh_yue+zhs+zht+zh-fi", - "cmn+cn-fi", - "en-de", # standard name - "en-de", - ] - self.assertListEqual(expected_opus_names, converted_opus_names) diff --git a/tests/test_modeling_old_mbart.py b/tests/test_modeling_old_mbart.py deleted file mode 100644 index 2a43650febbd25..00000000000000 --- a/tests/test_modeling_old_mbart.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2020 
The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from transformers import is_torch_available -from transformers.file_utils import cached_property -from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device - -from .test_modeling_common import ModelTesterMixin - - -if is_torch_available(): - import torch - - from transformers import ( - AutoModelForSeq2SeqLM, - AutoTokenizer, - BatchEncoding, - MBartConfig, - MBartForConditionalGeneration, - MBartModel, - ) - - -EN_CODE = 250004 -RO_CODE = 250020 - - -@require_torch -class ModelTester: - def __init__(self, parent): - self.config = MBartConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, - ) - - def prepare_config_and_inputs_for_common(self): - return self.config, {} - - -@require_torch -class SelectiveCommonTest(unittest.TestCase): - all_model_classes = (MBartForConditionalGeneration, MBartModel) if is_torch_available() else () - - test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save - - def setUp(self): - self.model_tester = ModelTester(self) - - -@require_torch -@require_sentencepiece -@require_tokenizers -class AbstractSeq2SeqIntegrationTest(unittest.TestCase): - maxDiff = 1000 # longer string compare tracebacks - checkpoint_name = None - - @classmethod - def setUpClass(cls): - cls.tokenizer = AutoTokenizer.from_pretrained(cls.checkpoint_name, use_fast=False) - return cls - - @cached_property - def model(self): - """Only load the model if needed.""" - model = AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) - if "cuda" in torch_device: - model = model.half() - return model - - -@require_torch -@require_sentencepiece -@require_tokenizers -class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "facebook/mbart-large-en-ro" - src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", - ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - 'Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor face decât să înrăutăţească violenţa şi mizeria pentru milioane de oameni.', - ] - expected_src_tokens = [8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2, EN_CODE] - - @slow - def test_enro_generate_one(self): - batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch( - 
["UN Chief Says There Is No Military Solution in Syria"], return_tensors="pt" - ).to(torch_device) - translated_tokens = self.model.generate(**batch) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - self.assertEqual(self.tgt_text[0], decoded[0]) - # self.assertEqual(self.tgt_text[1], decoded[1]) - - @slow - def test_enro_generate_batch(self): - batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch(self.src_text, return_tensors="pt").to( - torch_device - ) - translated_tokens = self.model.generate(**batch) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - assert self.tgt_text == decoded - - def test_mbart_enro_config(self): - mbart_models = ["facebook/mbart-large-en-ro"] - expected = {"scale_embedding": True, "output_past": True} - for name in mbart_models: - config = MBartConfig.from_pretrained(name) - self.assertTrue(config.is_valid_mbart()) - for k, v in expected.items(): - try: - self.assertEqual(v, getattr(config, k)) - except AssertionError as e: - e.args += (name, k) - raise - - def test_mbart_fast_forward(self): - config = MBartConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, - ) - lm_model = MBartForConditionalGeneration(config).to(torch_device) - context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device) - summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device) - result = lm_model(input_ids=context, decoder_input_ids=summary, labels=summary) - expected_shape = (*summary.shape, config.vocab_size) - self.assertEqual(result.logits.shape, expected_shape) - - -@require_torch -@require_sentencepiece -@require_tokenizers -class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "facebook/mbart-large-cc25" - src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - " I ate lunch twice yesterday", - ] - tgt_text = ["Şeful ONU declară că nu există o soluţie militară în Siria", "to be padded"] - - @unittest.skip("This test is broken, still generates english") - def test_cc25_generate(self): - inputs = self.tokenizer.prepare_seq2seq_batch([self.src_text[0]], return_tensors="pt").to(torch_device) - translated_tokens = self.model.generate( - input_ids=inputs["input_ids"].to(torch_device), - decoder_start_token_id=self.tokenizer.lang_code_to_id["ro_RO"], - ) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - self.assertEqual(self.tgt_text[0], decoded[0]) - - @slow - def test_fill_mask(self): - inputs = self.tokenizer.prepare_seq2seq_batch(["One of the best I ever read!"], return_tensors="pt").to( - torch_device - ) - outputs = self.model.generate( - inputs["input_ids"], decoder_start_token_id=self.tokenizer.lang_code_to_id["en_XX"], num_beams=1 - ) - prediction: str = self.tokenizer.batch_decode( - outputs, clean_up_tokenization_spaces=True, skip_special_tokens=True - )[0] - self.assertEqual(prediction, "of the best books I ever read!") diff --git a/tests/test_modeling_old_pegasus.py b/tests/test_modeling_old_pegasus.py deleted file mode 100644 index dc9fdf522547cf..00000000000000 --- a/tests/test_modeling_old_pegasus.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from transformers import AutoConfig, AutoTokenizer, is_torch_available -from transformers.file_utils import cached_property -from transformers.models.pegasus.configuration_pegasus import task_specific_params -from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device -from transformers.utils.logging import ERROR, set_verbosity - -from .test_modeling_bart import PGE_ARTICLE -from .test_modeling_common import ModelTesterMixin -from .test_modeling_mbart import AbstractSeq2SeqIntegrationTest - - -if is_torch_available(): - from transformers import AutoModelForSeq2SeqLM, PegasusConfig, PegasusForConditionalGeneration, PegasusModel - -XSUM_ENTRY_LONGER = """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" 
""" - -set_verbosity(ERROR) - - -@require_torch -class ModelTester: - def __init__(self, parent): - self.config = PegasusConfig( - vocab_size=99, - d_model=24, - encoder_layers=2, - decoder_layers=2, - encoder_attention_heads=2, - decoder_attention_heads=2, - encoder_ffn_dim=32, - decoder_ffn_dim=32, - max_position_embeddings=48, - add_final_layer_norm=True, - ) - - def prepare_config_and_inputs_for_common(self): - return self.config, {} - - -@require_torch -class SelectiveCommonTest(unittest.TestCase): - all_model_classes = (PegasusForConditionalGeneration, PegasusModel) if is_torch_available() else () - - test_save_load__keys_to_ignore_on_save = ModelTesterMixin.test_save_load__keys_to_ignore_on_save - - def setUp(self): - self.model_tester = ModelTester(self) - - -@require_torch -@require_sentencepiece -@require_tokenizers -class PegasusXSUMIntegrationTest(AbstractSeq2SeqIntegrationTest): - checkpoint_name = "google/pegasus-xsum" - src_text = [PGE_ARTICLE, XSUM_ENTRY_LONGER] - tgt_text = [ - "California's largest electricity provider has turned off power to hundreds of thousands of customers.", - "Pop group N-Dubz have revealed they were surprised to get four nominations for this year's Mobo Awards.", - ] - - @cached_property - def model(self): - return AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint_name).to(torch_device) - - @slow - def test_pegasus_xsum_summary(self): - assert self.tokenizer.model_max_length == 512 - inputs = self.tokenizer(self.src_text, return_tensors="pt", truncation=True, max_length=512, padding=True).to( - torch_device - ) - assert inputs.input_ids.shape == (2, 421) - translated_tokens = self.model.generate(**inputs, num_beams=2) - decoded = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True) - assert self.tgt_text == decoded - - if "cuda" not in torch_device: - return - # Demonstrate fp16 issue, Contributions welcome! 
- self.model.half() - translated_tokens_fp16 = self.model.generate(**inputs, max_length=10) - decoded_fp16 = self.tokenizer.batch_decode(translated_tokens_fp16, skip_special_tokens=True) - assert decoded_fp16 == [ - "California's largest electricity provider has begun", - "N-Dubz have revealed they were", - ] - - -class PegasusConfigTests(unittest.TestCase): - @slow - def test_task_specific_params(self): - """Test that task_specific params['summarization_xsum'] == config['pegasus_xsum'] """ - failures = [] - pegasus_prefix = "google/pegasus" - n_prefix_chars = len("summarization_") - for task, desired_settings in task_specific_params.items(): - dataset = task[n_prefix_chars:] - mname = f"{pegasus_prefix}-{dataset}" - cfg = AutoConfig.from_pretrained(mname) - for k, v in desired_settings.items(): - actual_value = getattr(cfg, k) - if actual_value != v: - failures.append(f"config for {mname} had {k}: {actual_value}, expected {v}") - tokenizer = AutoTokenizer.from_pretrained(mname) - n_pos_embeds = desired_settings["max_position_embeddings"] - if n_pos_embeds != tokenizer.model_max_length: - failures.append(f"tokenizer.model_max_length {tokenizer.model_max_length} expected {n_pos_embeds}") - - # error - all_fails = "\n".join(failures) - assert not failures, f"The following configs have unexpected settings: {all_fails}" From 8ebab5e5e389f5400fe355d16461a9ce5fbc4db0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 19:25:37 +0000 Subject: [PATCH 16/51] correct "add statement" --- src/transformers/models/auto/modeling_auto.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 24cf466732f7df..4396b0796f8619 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -21,6 +21,8 @@ from ...configuration_utils import PretrainedConfig from ...file_utils import add_start_docstrings from ...utils import logging + +# Add modeling imports here from ..albert.modeling_albert import ( AlbertForMaskedLM, AlbertForMultipleChoice, @@ -49,8 +51,6 @@ ) from ..bert_generation.modeling_bert_generation import BertGenerationDecoder, BertGenerationEncoder from ..blenderbot.modeling_blenderbot import BlenderbotForConditionalGeneration, BlenderbotModel - -# Add modeling imports here from ..blenderbot_small.modeling_blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel from ..camembert.modeling_camembert import ( CamembertForCausalLM, From c206f6de6065a3d54f5597a04beef7957ccadd37 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 2 Jan 2021 20:55:05 +0000 Subject: [PATCH 17/51] adapt config for tf comp --- .../models/bart/configuration_bart.py | 30 ++++++++++++++++ .../blenderbot/configuration_blenderbot.py | 34 +++++++++++++++++++ .../models/marian/configuration_marian.py | 34 +++++++++++++++++++ .../models/mbart/configuration_mbart.py | 34 +++++++++++++++++++ .../models/pegasus/configuration_pegasus.py | 34 +++++++++++++++++++ tests/test_modeling_tf_pegasus.py | 6 ++-- 6 files changed, 170 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 0c5c7a395932dd..9565a3cb3b9c53 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -169,3 +169,33 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> 
int: return self.d_model + + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + @property + def extra_pos_embeddings(self) -> int: + return 2 + + @property + def normalize_before(self) -> bool: + return False + + @property + def add_final_layer_norm(self) -> bool: + return False + + @property + def do_blenderbot_90_layernorm(self) -> bool: + return False + + @property + def normalize_embedding(self) -> bool: + return True + + @property + def static_position_embeddings(self) -> bool: + return False + + @property + def add_bias_logits(self) -> bool: + return False diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 52304b6ea4f7e5..6bd583315f5b4b 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -163,3 +163,37 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model + + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + @property + def extra_pos_embeddings(self) -> int: + return 0 + + @property + def normalize_before(self) -> bool: + return True + + @property + def add_final_layer_norm(self) -> bool: + return True + + @property + def do_blenderbot_90_layernorm(self) -> bool: + return True + + @property + def normalize_embedding(self) -> bool: + return False + + @property + def static_position_embeddings(self) -> bool: + return False + + @property + def add_bias_logits(self) -> bool: + return False + + @property + def force_bos_token_to_be_generated(self) -> bool: + return False diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 885ef5e8a67a85..411d8a582b0441 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -163,3 +163,37 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model + + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + @property + def extra_pos_embeddings(self) -> int: + return 0 + + @property + def normalize_before(self) -> bool: + return False + + @property + def add_final_layer_norm(self) -> bool: + return False + + @property + def do_blenderbot_90_layernorm(self) -> bool: + return False + + @property + def normalize_embedding(self) -> bool: + return False + + @property + def static_position_embeddings(self) -> bool: + return True + + @property + def add_bias_logits(self) -> bool: + return False + + @property + def force_bos_token_to_be_generated(self) -> bool: + return False diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index c65823585bc9c5..0705cb6266130a 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -163,3 +163,37 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model + + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + @property + def extra_pos_embeddings(self) -> int: + return 2 + + @property + def normalize_before(self) -> bool: + return True + + @property + def add_final_layer_norm(self) -> bool: + return True + + @property + def do_blenderbot_90_layernorm(self) -> bool: + return False + + @property + def normalize_embedding(self) -> bool: 
+ return True + + @property + def static_position_embeddings(self) -> bool: + return False + + @property + def add_bias_logits(self) -> bool: + return False + + @property + def force_bos_token_to_be_generated(self) -> bool: + return False diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 2bfbc40d55b2f2..708875b784f1e1 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -163,3 +163,37 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model + + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + @property + def extra_pos_embeddings(self) -> int: + return 0 + + @property + def normalize_before(self) -> bool: + return True + + @property + def add_final_layer_norm(self) -> bool: + return True + + @property + def do_blenderbot_90_layernorm(self) -> bool: + return False + + @property + def normalize_embedding(self) -> bool: + return False + + @property + def static_position_embeddings(self) -> bool: + return True + + @property + def add_bias_logits(self) -> bool: + return False + + @property + def force_bos_token_to_be_generated(self) -> bool: + return False diff --git a/tests/test_modeling_tf_pegasus.py b/tests/test_modeling_tf_pegasus.py index b6e16f75d57d5f..774b881631f57d 100644 --- a/tests/test_modeling_tf_pegasus.py +++ b/tests/test_modeling_tf_pegasus.py @@ -21,7 +21,6 @@ from transformers.testing_utils import is_pt_tf_cross_test, require_sentencepiece, require_tf, require_tokenizers, slow from .test_configuration_common import ConfigTester -from .test_modeling_pegasus import PGE_ARTICLE, XSUM_ENTRY_LONGER from .test_modeling_tf_bart import TFBartModelTester from .test_modeling_tf_common import TFModelTesterMixin @@ -115,7 +114,10 @@ def test_model_common_attributes(self): @require_sentencepiece @require_tokenizers class TFPegasusIntegrationTests(unittest.TestCase): - src_text = [PGE_ARTICLE, XSUM_ENTRY_LONGER] + src_text = [ + """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""", + """ The London trio are up for best UK act and best album, as well as getting two nominations in the best song category."We got told like this morning 'Oh I think you're nominated'", said Dappy."And I was like 'Oh yeah, which one?' And now we've got nominated for four awards. I mean, wow!"Bandmate Fazer added: "We thought it's best of us to come down and mingle with everyone and say hello to the cameras. 
And now we find we've got four nominations."The band have two shots at the best song prize, getting the nod for their Tynchy Stryder collaboration Number One, and single Strong Again.Their album Uncle B will also go up against records by the likes of Beyonce and Kanye West.N-Dubz picked up the best newcomer Mobo in 2007, but female member Tulisa said they wouldn't be too disappointed if they didn't win this time around."At the end of the day we're grateful to be where we are in our careers."If it don't happen then it don't happen - live to fight another day and keep on making albums and hits for the fans."Dappy also revealed they could be performing live several times on the night.The group will be doing Number One and also a possible rendition of the War Child single, I Got Soul.The charity song is a re-working of The Killers' All These Things That I've Done and is set to feature artists like Chipmunk, Ironik and Pixie Lott.This year's Mobos will be held outside of London for the first time, in Glasgow on 30 September.N-Dubz said they were looking forward to performing for their Scottish fans and boasted about their recent shows north of the border."We just done Edinburgh the other day," said Dappy."We smashed up an N-Dubz show over there. We done Aberdeen about three or four months ago - we smashed up that show over there! Everywhere we go we smash it up!" """, + ] expected_text = [ "California's largest electricity provider has cut power to hundreds of thousands of customers in an effort to reduce the risk of wildfires.", 'N-Dubz have revealed they\'re "grateful" to have been nominated for four Mobo Awards.', From 0b3ba485433ec439cdff2cc99ebe9fcfd5f911f3 Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 22:24:50 +0000 Subject: [PATCH 18/51] correct configs for tf --- .../models/bart/configuration_bart.py | 40 +++++------------ .../blenderbot/configuration_blenderbot.py | 45 +++++-------------- .../models/marian/configuration_marian.py | 45 +++++-------------- .../models/mbart/configuration_mbart.py | 45 +++++-------------- .../models/pegasus/configuration_pegasus.py | 45 +++++-------------- tests/test_modeling_tf_bart.py | 7 ++- 6 files changed, 59 insertions(+), 168 deletions(-) diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 9565a3cb3b9c53..0112ea92db0698 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -162,6 +162,16 @@ def __init__( self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True self.force_bos_token_to_be_generated = force_bos_token_to_be_generated # only relevant for CNN + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + self.extra_pos_embeddings = 2 + self.normalize_before = False + self.add_final_layer_norm = False + self.do_blenderbot_90_layernorm = False + self.normalize_embedding = True + self.static_position_embeddings = False + self.add_bias_logits = False + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -169,33 +179,3 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model - - # IMPORTANT - # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY - @property - def extra_pos_embeddings(self) -> int: - return 2 - - @property - def normalize_before(self) -> bool: - return False - - @property - def add_final_layer_norm(self) -> bool: - return False - - @property - 
def do_blenderbot_90_layernorm(self) -> bool: - return False - - @property - def normalize_embedding(self) -> bool: - return True - - @property - def static_position_embeddings(self) -> bool: - return False - - @property - def add_bias_logits(self) -> bool: - return False diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 6bd583315f5b4b..88753bf8e83def 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -156,6 +156,17 @@ def __init__( self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + self.extra_pos_embeddings = 0 + self.normalize_before = True + self.add_final_layer_norm = True + self.do_blenderbot_90_layernorm = True + self.normalize_embedding = False + self.static_position_embeddings = False + self.add_bias_logits = False + self.force_bos_token_to_be_generated = False + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -163,37 +174,3 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model - - # IMPORTANT - # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY - @property - def extra_pos_embeddings(self) -> int: - return 0 - - @property - def normalize_before(self) -> bool: - return True - - @property - def add_final_layer_norm(self) -> bool: - return True - - @property - def do_blenderbot_90_layernorm(self) -> bool: - return True - - @property - def normalize_embedding(self) -> bool: - return False - - @property - def static_position_embeddings(self) -> bool: - return False - - @property - def add_bias_logits(self) -> bool: - return False - - @property - def force_bos_token_to_be_generated(self) -> bool: - return False diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 411d8a582b0441..54a12c1710f367 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -156,6 +156,17 @@ def __init__( self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + self.extra_pos_embeddings = 0 + self.normalize_before = False + self.add_final_layer_norm = False + self.do_blenderbot_90_layernorm = False + self.normalize_embedding = False + self.static_position_embeddings = True + self.add_bias_logits = False + self.force_bos_token_to_be_generated = False + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -163,37 +174,3 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model - - # IMPORTANT - # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY - @property - def extra_pos_embeddings(self) -> int: - return 0 - - @property - def normalize_before(self) -> bool: - return False - - @property - def add_final_layer_norm(self) -> bool: - return False - - @property - def do_blenderbot_90_layernorm(self) -> bool: - return False - - @property - def normalize_embedding(self) -> bool: - return False - - @property - def static_position_embeddings(self) -> bool: - return True - - @property - def 
add_bias_logits(self) -> bool: - return False - - @property - def force_bos_token_to_be_generated(self) -> bool: - return False diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 0705cb6266130a..d157c5d295198b 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -156,6 +156,17 @@ def __init__( self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + self.extra_pos_embeddings = 2 + self.normalize_before = True + self.add_final_layer_norm = True + self.do_blenderbot_90_layernorm = False + self.normalize_embedding = True + self.static_position_embeddings = False + self.add_bias_logits = False + self.force_bos_token_to_be_generated = False + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -163,37 +174,3 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model - - # IMPORTANT - # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY - @property - def extra_pos_embeddings(self) -> int: - return 2 - - @property - def normalize_before(self) -> bool: - return True - - @property - def add_final_layer_norm(self) -> bool: - return True - - @property - def do_blenderbot_90_layernorm(self) -> bool: - return False - - @property - def normalize_embedding(self) -> bool: - return True - - @property - def static_position_embeddings(self) -> bool: - return False - - @property - def add_bias_logits(self) -> bool: - return False - - @property - def force_bos_token_to_be_generated(self) -> bool: - return False diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 708875b784f1e1..17f902b4aad39d 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -156,6 +156,17 @@ def __init__( self.gradient_checkpointing = gradient_checkpointing self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + # IMPORTANT + # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY + self.extra_pos_embeddings = 0 + self.normalize_before = True + self.add_final_layer_norm = True + self.do_blenderbot_90_layernorm = False + self.normalize_embedding = False + self.static_position_embeddings = True + self.add_bias_logits = False + self.force_bos_token_to_be_generated = False + @property def num_attention_heads(self) -> int: return self.encoder_attention_heads @@ -163,37 +174,3 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model - - # IMPORTANT - # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY - @property - def extra_pos_embeddings(self) -> int: - return 0 - - @property - def normalize_before(self) -> bool: - return True - - @property - def add_final_layer_norm(self) -> bool: - return True - - @property - def do_blenderbot_90_layernorm(self) -> bool: - return False - - @property - def normalize_embedding(self) -> bool: - return False - - @property - def static_position_embeddings(self) -> bool: - return True - - @property - def add_bias_logits(self) -> bool: - return False - - @property - def force_bos_token_to_be_generated(self) -> bool: - return False diff --git a/tests/test_modeling_tf_bart.py 
b/tests/test_modeling_tf_bart.py index 5e3377db94a2e9..80d3bff416023c 100644 --- a/tests/test_modeling_tf_bart.py +++ b/tests/test_modeling_tf_bart.py @@ -246,13 +246,16 @@ def _long_tensor(tok_lst): class TFBartModelIntegrationTest(unittest.TestCase): def test_inference_no_head(self): model = TFBartModel.from_pretrained("facebook/bart-large", from_pt=True) + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) inputs_dict = prepare_bart_inputs_dict(model.config, input_ids) - # with torch.no_grad(): output = model(**inputs_dict)[0] expected_shape = (1, 11, 1024) self.assertEqual(output.shape, expected_shape) - expected_slice = tf.Tensor( + import ipdb + + ipdb.set_trace() + expected_slice = tf.convert_to_tensor( [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], ) self.assertTrue(tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)) From a9757e2b5cbd33ea91586c663a049e4a16adbf77 Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 22:30:07 +0000 Subject: [PATCH 19/51] remove ipdb --- tests/test_modeling_tf_bart.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_modeling_tf_bart.py b/tests/test_modeling_tf_bart.py index 80d3bff416023c..6e1f794a51bcb5 100644 --- a/tests/test_modeling_tf_bart.py +++ b/tests/test_modeling_tf_bart.py @@ -252,10 +252,7 @@ def test_inference_no_head(self): output = model(**inputs_dict)[0] expected_shape = (1, 11, 1024) self.assertEqual(output.shape, expected_shape) - import ipdb - - ipdb.set_trace() - expected_slice = tf.convert_to_tensor( + expected_slice = tf.Tensor( [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], ) self.assertTrue(tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)) From 953b11033c96336110e336ec95e0e658f6fc24ac Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 22:45:21 +0000 Subject: [PATCH 20/51] fix more stuff --- docs/source/index.rst | 2 + docs/source/model_doc/blenderbot.rst | 6 -- src/transformers/models/marian/__init__.py | 6 +- src/transformers/utils/dummy_pt_objects.py | 66 ++++++++++++++++ tests/test_tokenization_mbart.py | 13 +++- tests/test_tokenization_small_blenderbot.py | 83 +++++++++++++++++++++ 6 files changed, 165 insertions(+), 11 deletions(-) create mode 100644 tests/test_tokenization_small_blenderbot.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 1ada9c18d71c1a..80b1f2cc391cf5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -220,6 +220,8 @@ TensorFlow and/or Flax. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| BlenderbotSmall | ✅ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst index df43c90ef07661..c14c004e1dbe87 100644 --- a/docs/source/model_doc/blenderbot.rst +++ b/docs/source/model_doc/blenderbot.rst @@ -93,12 +93,6 @@ BlenderbotTokenizer .. 
autoclass:: transformers.BlenderbotTokenizer :members: build_inputs_with_special_tokens -BlenderbotSmallTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.BlenderbotSmallTokenizer - :members: - BlenderbotModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/models/marian/__init__.py b/src/transformers/models/marian/__init__.py index e9e2ee4cc7d112..d4c551fbf87ba2 100644 --- a/src/transformers/models/marian/__init__.py +++ b/src/transformers/models/marian/__init__.py @@ -15,11 +15,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available +from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig -from .tokenization_marian import MarianTokenizer +if is_sentencepiece_available(): + from .tokenization_marian import MarianTokenizer + if is_torch_available(): from .modeling_marian import ( MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c2ed563ef0502b..0f1bc511080711 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -609,6 +609,27 @@ def from_pretrained(self, *args, **kwargs): requires_pytorch(self) +BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class BlenderbotSmallForConditionalGeneration: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class BlenderbotSmallModel: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None @@ -1288,6 +1309,15 @@ def __init__(self, *args, **kwargs): requires_pytorch(self) +class MarianModel: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + class MarianMTModel: def __init__(self, *args, **kwargs): requires_pytorch(self) @@ -1306,6 +1336,24 @@ def from_pretrained(self, *args, **kwargs): requires_pytorch(self) +class MBartForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class MBartForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + class MBartModel: def __init__(self, *args, **kwargs): requires_pytorch(self) @@ -1578,6 +1626,24 @@ def from_pretrained(self, *args, **kwargs): requires_pytorch(self) +class PegasusForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class PegasusForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, 
*args, **kwargs): + requires_pytorch(self) + + class PegasusModel: def __init__(self, *args, **kwargs): requires_pytorch(self) diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py index 0df2490099898d..fb6d10c8c3dbc2 100644 --- a/tests/test_tokenization_mbart.py +++ b/tests/test_tokenization_mbart.py @@ -195,7 +195,10 @@ def test_batch_fairseq_parity(self): batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch( self.src_text, tgt_texts=self.tgt_text, return_tensors="pt" ) - batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) + batch["decoder_input_ids"] = shift_tokens_right( + batch.labels, self.tokenizer.pad_token_id, self.tokenizer.eos_token_id + ) + for k in batch: batch[k] = batch[k].tolist() # batch = {k: v.tolist() for k,v in batch.items()} @@ -227,13 +230,17 @@ def test_seq2seq_max_target_length(self): batch = self.tokenizer.prepare_seq2seq_batch( self.src_text, tgt_texts=self.tgt_text, max_length=3, max_target_length=10, return_tensors="pt" ) - batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) + batch["decoder_input_ids"] = shift_tokens_right( + batch.labels, self.tokenizer.pad_token_id, self.tokenizer.eos_token_id + ) self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.decoder_input_ids.shape[1], 10) # max_target_length will default to max_length if not specified batch = self.tokenizer.prepare_seq2seq_batch( self.src_text, tgt_texts=self.tgt_text, max_length=3, return_tensors="pt" ) - batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) + batch["decoder_input_ids"] = shift_tokens_right( + batch.labels, self.tokenizer.pad_token_id, self.tokenizer.eos_token_id + ) self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.decoder_input_ids.shape[1], 3) diff --git a/tests/test_tokenization_small_blenderbot.py b/tests/test_tokenization_small_blenderbot.py new file mode 100644 index 00000000000000..40a23daa62858f --- /dev/null +++ b/tests/test_tokenization_small_blenderbot.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer.""" +import json +import os +import unittest + +from transformers.models.blenderbot.tokenization_blenderbot import VOCAB_FILES_NAMES, BlenderbotSmallTokenizer + +from .test_tokenization_common import TokenizerTesterMixin + + +class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = BlenderbotSmallTokenizer + + def setUp(self): + super().setUp() + + vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"] + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + + merges = ["#version: 0.2", "a p", "t e", "ap t", "a d", "ad apt", "a c", "ac t", ""] + self.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"} + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + with open(self.merges_file, "w", encoding="utf-8") as fp: + fp.write("\n".join(merges)) + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self, tokenizer): + input_text = "adapt act apte" + output_text = "adapt act apte" + return input_text, output_text + + def test_full_blenderbot_small_tokenizer(self): + tokenizer = BlenderbotSmallTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) + text = "adapt act apte" + bpe_tokens = ["adapt", "act", "ap@@", "te"] + tokens = tokenizer.tokenize(text) + self.assertListEqual(tokens, bpe_tokens) + + input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token] + + input_bpe_tokens = [0, 1, 2, 3, 4, 5] + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_special_tokens_small_tok(self): + tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") + assert tok("sam").input_ids == [1384] + src_text = "I am a small frog." + encoded = tok([src_text], padding=False, truncation=False)["input_ids"] + decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + assert src_text != decoded # I wish it did! + assert decoded == "i am a small frog ." + + def test_empty_word_small_tok(self): + tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") + src_text = "I am a small frog ." + src_text_dot = "." 
+ encoded = tok(src_text)["input_ids"] + encoded_dot = tok(src_text_dot)["input_ids"] + + assert encoded[-1] == encoded_dot[0] From 165f27102342fd41ab83c84cf5f64af20cea5ab1 Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 23:09:24 +0000 Subject: [PATCH 21/51] fix mbart --- docs/source/index.rst | 1 + docs/source/model_doc/marian.rst | 6 +++++ docs/source/model_doc/mbart.rst | 13 +++++++++++ docs/source/model_doc/pegasus.rst | 12 ++++++++++ src/transformers/models/mbart/__init__.py | 1 - .../models/mbart/configuration_mbart.py | 2 -- .../models/mbart/modeling_mbart.py | 22 ++++++++++--------- tests/test_tokenization_mbart.py | 12 +++------- utils/check_repo.py | 20 +++++++++++++---- 9 files changed, 63 insertions(+), 26 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 80b1f2cc391cf5..d3fdce6ac8131b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -360,6 +360,7 @@ TensorFlow and/or Flax. model_doc/bert model_doc/bertgeneration model_doc/blenderbot + model_doc/blenderbot_small model_doc/camembert model_doc/ctrl model_doc/deberta diff --git a/docs/source/model_doc/marian.rst b/docs/source/model_doc/marian.rst index d6384bbdf3e2a3..d637f5ce41ba09 100644 --- a/docs/source/model_doc/marian.rst +++ b/docs/source/model_doc/marian.rst @@ -182,6 +182,12 @@ MarianTokenizer :members: prepare_seq2seq_batch +MarianModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MarianModel + + MarianMTModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/mbart.rst b/docs/source/model_doc/mbart.rst index 4ac391255eb582..edd37b4969bf7d 100644 --- a/docs/source/model_doc/mbart.rst +++ b/docs/source/model_doc/mbart.rst @@ -111,6 +111,19 @@ MBartForConditionalGeneration :members: +MBartForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartForQuestionAnswering + :members: + + +MBartForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MBartForSequenceClassification + + TFMBartForConditionalGeneration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/pegasus.rst b/docs/source/model_doc/pegasus.rst index 3fab320ebcbc5d..47886bd97b0116 100644 --- a/docs/source/model_doc/pegasus.rst +++ b/docs/source/model_doc/pegasus.rst @@ -131,6 +131,18 @@ PegasusForConditionalGeneration .. autoclass:: transformers.PegasusForConditionalGeneration +PegasusForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PegasusForQuestionAnswering + + +PegasusForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.PegasusForSequenceClassification
+
+
 TFPegasusForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/src/transformers/models/mbart/__init__.py b/src/transformers/models/mbart/__init__.py
index 7dab087f6fd425..e5e10659178b30 100644
--- a/src/transformers/models/mbart/__init__.py
+++ b/src/transformers/models/mbart/__init__.py
@@ -17,7 +17,6 @@
 # limitations under the License.
 from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available
 from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig
-from .tokenization_mbart import MBartTokenizer


 if is_sentencepiece_available():
diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py
index d157c5d295198b..adf47f31f825a3 100644
--- a/src/transformers/models/mbart/configuration_mbart.py
+++ b/src/transformers/models/mbart/configuration_mbart.py
@@ -116,7 +116,6 @@ def __init__(
         attention_dropout=0.0,
         activation_dropout=0.0,
         init_std=0.02,
-        decoder_start_token_id=2,
         classifier_dropout=0.0,
         scale_embedding=False,
         gradient_checkpointing=False,
@@ -130,7 +129,6 @@ def __init__(
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             is_encoder_decoder=is_encoder_decoder,
-            decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )

diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index af2b2fa9d807b7..8a6aa90b4e303d 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -57,19 +57,23 @@
 ]


-def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
     """
-    Shift input ids one token to the right.
+    Shift input ids one token to the right, and wrap the last non pad token (the <LID> token). Note that MBart does not
+    have a single `decoder_start_token_id` in contrast to other Bart-like models.
     """
-    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
-    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
-    shifted_input_ids[:, 0] = decoder_start_token_id
+    prev_output_tokens = input_ids.clone()

     assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
# replace possible -100 values in labels by `pad_token_id` - shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id) - return shifted_input_ids + index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) + decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze() + prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone() + prev_output_tokens[:, 0] = decoder_start_tokens + + return prev_output_tokens def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): @@ -1150,9 +1154,7 @@ def forward( if labels is not None: if decoder_input_ids is None: - decoder_input_ids = shift_tokens_right( - labels, self.config.pad_token_id, self.config.decoder_start_token_id - ) + decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id) outputs = self.model( input_ids, diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py index fb6d10c8c3dbc2..59877e2dd9924e 100644 --- a/tests/test_tokenization_mbart.py +++ b/tests/test_tokenization_mbart.py @@ -195,9 +195,7 @@ def test_batch_fairseq_parity(self): batch: BatchEncoding = self.tokenizer.prepare_seq2seq_batch( self.src_text, tgt_texts=self.tgt_text, return_tensors="pt" ) - batch["decoder_input_ids"] = shift_tokens_right( - batch.labels, self.tokenizer.pad_token_id, self.tokenizer.eos_token_id - ) + batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) for k in batch: batch[k] = batch[k].tolist() @@ -230,17 +228,13 @@ def test_seq2seq_max_target_length(self): batch = self.tokenizer.prepare_seq2seq_batch( self.src_text, tgt_texts=self.tgt_text, max_length=3, max_target_length=10, return_tensors="pt" ) - batch["decoder_input_ids"] = shift_tokens_right( - batch.labels, self.tokenizer.pad_token_id, self.tokenizer.eos_token_id - ) + batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.decoder_input_ids.shape[1], 10) # max_target_length will default to max_length if not specified batch = self.tokenizer.prepare_seq2seq_batch( self.src_text, tgt_texts=self.tgt_text, max_length=3, return_tensors="pt" ) - batch["decoder_input_ids"] = shift_tokens_right( - batch.labels, self.tokenizer.pad_token_id, self.tokenizer.eos_token_id - ) + batch["decoder_input_ids"] = shift_tokens_right(batch.labels, self.tokenizer.pad_token_id) self.assertEqual(batch.input_ids.shape[1], 3) self.assertEqual(batch.decoder_input_ids.shape[1], 3) diff --git a/utils/check_repo.py b/utils/check_repo.py index 596113d9ed84c1..43c51bdc3802e5 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -30,11 +30,17 @@ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = [ # models to ignore for not tested - "BlenderbotSmallEncoder", # Building part of bigger (tested) model. - "BlenderbotSmallDecoder", # Building part of bigger (tested) model. "BartDecoder", # Building part of bigger (tested) model. "BartEncoder", # Building part of bigger (tested) model. "BertLMHeadModel", # Needs to be setup as decoder. + "BlenderbotSmallEncoder", # Building part of bigger (tested) model. + "BlenderbotSmallDecoder", # Building part of bigger (tested) model. + "BlenderbotEncoder", # Building part of bigger (tested) model. + "BlenderbotDecoder", # Building part of bigger (tested) model. 
+ "MBartEncoder", # Building part of bigger (tested) model. + "MBartDecoder", # Building part of bigger (tested) model. + "PegasusEncoder", # Building part of bigger (tested) model. + "PegasusDecoder", # Building part of bigger (tested) model. "DPREncoder", # Building part of bigger (tested) model. "DPRSpanPredictor", # Building part of bigger (tested) model. "ProphetNetDecoderWrapper", # Building part of bigger (tested) model. @@ -66,10 +72,12 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = [ # models to ignore for model xxx mapping - "BlenderbotSmallEncoder", - "BlenderbotSmallDecoder", "BartDecoder", "BartEncoder", + "BlenderbotSmallEncoder", + "BlenderbotSmallDecoder", + "BlenderbotEncoder", + "BlenderbotDecoder", "DPRContextEncoder", "DPREncoder", "DPRReader", @@ -78,7 +86,11 @@ "FunnelBaseModel", "GPT2DoubleHeadsModel", "MT5EncoderModel", + "MBartEncoder", + "MBartDecoder", "OpenAIGPTDoubleHeadsModel", + "PegasusEncoder", + "PegasusDecoder", "ProphetNetDecoder", "ProphetNetEncoder", "ProphetNetDecoderWrapper", From e8afa3efe97abdd70f7b5bdf000f6e5f16a89038 Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 23:13:09 +0000 Subject: [PATCH 22/51] push pegasus fix --- src/transformers/models/pegasus/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/pegasus/__init__.py b/src/transformers/models/pegasus/__init__.py index 2ea286ab201ee9..c8efb9d64b1322 100644 --- a/src/transformers/models/pegasus/__init__.py +++ b/src/transformers/models/pegasus/__init__.py @@ -15,11 +15,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available +from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig -from .tokenization_pegasus import PegasusTokenizer +if is_sentencepiece_available(): + from .tokenization_pegasus import PegasusTokenizer + if is_tokenizers_available(): from .tokenization_pegasus_fast import PegasusTokenizerFast From 441446dcf571ee0e39694727f26c629747360b5c Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 23:15:24 +0000 Subject: [PATCH 23/51] fix mbart --- tests/test_tokenization_mbart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py index 59877e2dd9924e..5bd987eaba52f3 100644 --- a/tests/test_tokenization_mbart.py +++ b/tests/test_tokenization_mbart.py @@ -31,7 +31,7 @@ if is_torch_available(): - from transformers.models.bart.modeling_bart import shift_tokens_right + from transformers.models.mbart.modeling_mbart import shift_tokens_right EN_CODE = 250004 RO_CODE = 250020 From 3fd722bddb6625206f58364f36339122ac905b21 Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 23:25:21 +0000 Subject: [PATCH 24/51] more fixes --- examples/seq2seq/utils.py | 9 +-------- tests/test_modeling_marian.py | 3 +-- tests/test_modeling_tf_bart.py | 8 ++++---- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/examples/seq2seq/utils.py b/examples/seq2seq/utils.py index 437cdf2e63239c..9df658f0218bed 100644 --- a/examples/seq2seq/utils.py +++ b/examples/seq2seq/utils.py @@ -33,9 +33,8 @@ from torch.utils.data import Dataset, Sampler 
from sentence_splitter import add_newline_to_end_of_each_sentence -from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer +from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer from transformers.file_utils import cached_property -from transformers.models.bart.modeling_bart import shift_tokens_right try: @@ -305,15 +304,9 @@ def __call__(self, batch) -> Dict[str, torch.Tensor]: labels = trim_batch(labels, self.pad_token_id) input_ids, attention_mask = trim_batch(input_ids, self.pad_token_id, attention_mask=attention_mask) - if isinstance(self.tokenizer, T5Tokenizer): - decoder_input_ids = self._shift_right_t5(labels) - else: - decoder_input_ids = shift_tokens_right(labels, self.pad_token_id) - batch = { "input_ids": input_ids, "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, "labels": labels, } return batch diff --git a/tests/test_modeling_marian.py b/tests/test_modeling_marian.py index e877817394dac6..3bed7f67e986e1 100644 --- a/tests/test_modeling_marian.py +++ b/tests/test_modeling_marian.py @@ -40,7 +40,6 @@ MarianConfig, MarianModel, MarianMTModel, - MarianTokenizer, TranslationPipeline, ) from transformers.models.marian.convert_marian_to_pytorch import ( @@ -320,7 +319,7 @@ def setUpClass(cls) -> None: return cls @cached_property - def tokenizer(self) -> MarianTokenizer: + def tokenizer(self): return AutoTokenizer.from_pretrained(self.model_name) @property diff --git a/tests/test_modeling_tf_bart.py b/tests/test_modeling_tf_bart.py index 6e1f794a51bcb5..d937fa1ac0d206 100644 --- a/tests/test_modeling_tf_bart.py +++ b/tests/test_modeling_tf_bart.py @@ -248,14 +248,14 @@ def test_inference_no_head(self): model = TFBartModel.from_pretrained("facebook/bart-large", from_pt=True) input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - inputs_dict = prepare_bart_inputs_dict(model.config, input_ids) - output = model(**inputs_dict)[0] + attention_mask = tf.cast(tf.math.not_equal(input_ids, model.config.pad_token_id), tf.int8) + output = model(input_ids=input_ids, attention_mask=attention_mask)[0] expected_shape = (1, 11, 1024) self.assertEqual(output.shape, expected_shape) - expected_slice = tf.Tensor( + expected_slice = tf.convert_to_tensor( [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], ) - self.assertTrue(tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE) def test_cnn_summarization_same_as_fairseq_hard(self): hf = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn", from_pt=True) From 85f36c450896c2c26e0e659c67de3f74176b3a91 Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Sat, 2 Jan 2021 23:34:46 +0000 Subject: [PATCH 25/51] fix research projects code --- .../seq2seq-distillation/distillation.py | 2 +- .../seq2seq-distillation/finetune.py | 2 +- .../seq2seq-distillation/utils copy.py | 645 ------------------ .../seq2seq-distillation/utils.py | 2 +- examples/seq2seq/test_datasets.py | 2 +- 5 files changed, 4 insertions(+), 649 deletions(-) delete mode 100644 examples/research_projects/seq2seq-distillation/utils copy.py diff --git a/examples/research_projects/seq2seq-distillation/distillation.py b/examples/research_projects/seq2seq-distillation/distillation.py index 3b3bd805894151..e2b2ad22d60d11 100755 --- a/examples/research_projects/seq2seq-distillation/distillation.py +++ 
b/examples/research_projects/seq2seq-distillation/distillation.py @@ -16,7 +16,7 @@ from finetune import main as ft_main from make_student import create_student_by_copying_alternating_layers, get_layers_to_supervise from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5ForConditionalGeneration -from transformers.models.bart.modeling_bart import shift_tokens_right +from transformers.models.mbart.modeling_mbart import shift_tokens_right from utils import calculate_bleu, check_output_dir, freeze_params, label_smoothed_nll_loss, use_task_specific_params diff --git a/examples/research_projects/seq2seq-distillation/finetune.py b/examples/research_projects/seq2seq-distillation/finetune.py index 156b4695a67e72..0ca4e6f9bccdaa 100755 --- a/examples/research_projects/seq2seq-distillation/finetune.py +++ b/examples/research_projects/seq2seq-distillation/finetune.py @@ -17,7 +17,7 @@ from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback from transformers import MBartTokenizer, T5ForConditionalGeneration -from transformers.models.bart.modeling_bart import shift_tokens_right +from transformers.models.mbart.modeling_mbart import shift_tokens_right from utils import ( ROUGE_KEYS, LegacySeq2SeqDataset, diff --git a/examples/research_projects/seq2seq-distillation/utils copy.py b/examples/research_projects/seq2seq-distillation/utils copy.py deleted file mode 100644 index b6994a1831da0a..00000000000000 --- a/examples/research_projects/seq2seq-distillation/utils copy.py +++ /dev/null @@ -1,645 +0,0 @@ -import itertools -import json -import linecache -import math -import os -import pickle -import socket -from logging import getLogger -from pathlib import Path -from typing import Callable, Dict, Iterable, List, Tuple, Union - -import git -import numpy as np -import torch -import torch.distributed as dist -from rouge_score import rouge_scorer, scoring -from sacrebleu import corpus_bleu -from torch import nn -from torch.utils.data import Dataset, Sampler - -from sentence_splitter import add_newline_to_end_of_each_sentence -from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer -from transformers.file_utils import cached_property -from transformers.models.bart.modeling_bart import shift_tokens_right - - -try: - from fairseq.data.data_utils import batch_by_size - - FAIRSEQ_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - FAIRSEQ_AVAILABLE = False - - -def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100): - """From fairseq""" - if target.dim() == lprobs.dim() - 1: - target = target.unsqueeze(-1) - nll_loss = -lprobs.gather(dim=-1, index=target) - smooth_loss = -lprobs.sum(dim=-1, keepdim=True) - if ignore_index is not None: - pad_mask = target.eq(ignore_index) - nll_loss.masked_fill_(pad_mask, 0.0) - smooth_loss.masked_fill_(pad_mask, 0.0) - else: - nll_loss = nll_loss.squeeze(-1) - smooth_loss = smooth_loss.squeeze(-1) - - nll_loss = nll_loss.sum() # mean()? Scared to break other math. 
- smooth_loss = smooth_loss.sum() - eps_i = epsilon / lprobs.size(-1) - loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss - return loss, nll_loss - - -def lmap(f: Callable, x: Iterable) -> List: - """list(map(f, x))""" - return list(map(f, x)) - - -def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict: - """Uses sacrebleu's corpus_bleu implementation.""" - return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)} - - -def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], Dict]: - def non_pad_len(tokens: np.ndarray) -> int: - return np.count_nonzero(tokens != tokenizer.pad_token_id) - - def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]: - pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True) - label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) - pred_str = lmap(str.strip, pred_str) - label_str = lmap(str.strip, label_str) - return pred_str, label_str - - def summarization_metrics(pred: EvalPrediction) -> Dict: - pred_str, label_str = decode_pred(pred) - rouge: Dict = calculate_rouge(pred_str, label_str) - summ_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1) - rouge.update({"gen_len": summ_len}) - return rouge - - def translation_metrics(pred: EvalPrediction) -> Dict: - pred_str, label_str = decode_pred(pred) - bleu: Dict = calculate_bleu(pred_str, label_str) - gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1) - bleu.update({"gen_len": gen_len}) - return bleu - - compute_metrics_fn = summarization_metrics if "summarization" in task_name else translation_metrics - return compute_metrics_fn - - -def trim_batch( - input_ids, - pad_token_id, - attention_mask=None, -): - """Remove columns that are populated exclusively by pad_token_id""" - keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) - if attention_mask is None: - return input_ids[:, keep_column_mask] - else: - return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]) - - -class AbstractSeq2SeqDataset(Dataset): - def __init__( - self, - tokenizer, - data_dir, - max_source_length, - max_target_length, - type_path="train", - n_obs=None, - prefix="", - **dataset_kwargs - ): - super().__init__() - self.src_file = Path(data_dir).joinpath(type_path + ".source") - self.tgt_file = Path(data_dir).joinpath(type_path + ".target") - self.len_file = Path(data_dir).joinpath(type_path + ".len") - if os.path.exists(self.len_file): - self.src_lens = pickle_load(self.len_file) - self.used_char_len = False - else: - self.src_lens = self.get_char_lens(self.src_file) - self.used_char_len = True - self.max_source_length = max_source_length - self.max_target_length = max_target_length - assert min(self.src_lens) > 0, f"found empty line in {self.src_file}" - self.tokenizer = tokenizer - self.prefix = prefix if prefix is not None else "" - - if n_obs is not None: - self.src_lens = self.src_lens[:n_obs] - self.pad_token_id = self.tokenizer.pad_token_id - self.dataset_kwargs = dataset_kwargs - dataset_kwargs.update({"add_prefix_space": True} if isinstance(self.tokenizer, BartTokenizer) else {}) - - def __len__(self): - return len(self.src_lens) - - @staticmethod - def get_char_lens(data_file): - return [len(x) for x in Path(data_file).open().readlines()] - - @cached_property - def tgt_lens(self): - """Length in characters of target documents""" - return self.get_char_lens(self.tgt_file) - - def make_sortish_sampler(self, batch_size, distributed=False, 
shuffle=True, **kwargs): - if distributed: - return DistributedSortishSampler(self, batch_size, shuffle=shuffle, **kwargs) - else: - return SortishSampler(self.src_lens, batch_size, shuffle=shuffle) - - def make_dynamic_sampler(self, max_tokens_per_batch=1024, **kwargs): - assert FAIRSEQ_AVAILABLE, "Dynamic batch size requires `pip install fairseq`" - assert not self.used_char_len, "You must call python make_len_file.py before calling make_dynamic_sampler" - sorted_indices = list(self.make_sortish_sampler(1024, shuffle=False)) - - def num_tokens_in_example(i): - return min(self.src_lens[i], self.max_target_length) - - # call fairseq cython function - batch_sampler: List[List[int]] = batch_by_size( - sorted_indices, - num_tokens_fn=num_tokens_in_example, - max_tokens=max_tokens_per_batch, - required_batch_size_multiple=64, - ) - shuffled_batches = [batch_sampler[i] for i in np.random.permutation(range(len(batch_sampler)))] - # move the largest batch to the front to OOM quickly (uses an approximation for padding) - approximate_toks_per_batch = [max(self.src_lens[i] for i in batch) * len(batch) for batch in shuffled_batches] - largest_batch_idx = np.argmax(approximate_toks_per_batch) - shuffled_batches[0], shuffled_batches[largest_batch_idx] = ( - shuffled_batches[largest_batch_idx], - shuffled_batches[0], - ) - return shuffled_batches - - def __getitem__(self, item): - raise NotImplementedError("You must implement this") - - def collate_fn(self, batch): - raise NotImplementedError("You must implement this") - - -class LegacySeq2SeqDataset(AbstractSeq2SeqDataset): - def __getitem__(self, index) -> Dict[str, torch.Tensor]: - """Call tokenizer on src and tgt_lines""" - index = index + 1 # linecache starts at 1 - source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n") - tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") - assert source_line, f"empty source line for index {index}" - assert tgt_line, f"empty tgt line for index {index}" - source_inputs = self.encode_line(self.tokenizer, source_line, self.max_source_length) - target_inputs = self.encode_line(self.tokenizer, tgt_line, self.max_target_length) - - source_ids = source_inputs["input_ids"].squeeze() - target_ids = target_inputs["input_ids"].squeeze() - src_mask = source_inputs["attention_mask"].squeeze() - return { - "input_ids": source_ids, - "attention_mask": src_mask, - "labels": target_ids, - } - - def encode_line(self, tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"): - """Only used by LegacyDataset""" - return tokenizer( - [line], - max_length=max_length, - padding="max_length" if pad_to_max_length else None, - truncation=True, - return_tensors=return_tensors, - **self.dataset_kwargs, - ) - - def collate_fn(self, batch) -> Dict[str, torch.Tensor]: - input_ids = torch.stack([x["input_ids"] for x in batch]) - masks = torch.stack([x["attention_mask"] for x in batch]) - target_ids = torch.stack([x["labels"] for x in batch]) - pad_token_id = self.pad_token_id - y = trim_batch(target_ids, pad_token_id) - source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks) - batch = { - "input_ids": source_ids, - "attention_mask": source_mask, - "labels": y, - } - return batch - - -class Seq2SeqDataset(AbstractSeq2SeqDataset): - """A dataset that calls prepare_seq2seq_batch.""" - - def __getitem__(self, index) -> Dict[str, str]: - index = index + 1 # linecache starts at 1 - source_line = self.prefix + linecache.getline(str(self.src_file), 
index).rstrip("\n") - tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n") - assert source_line, f"empty source line for index {index}" - assert tgt_line, f"empty tgt line for index {index}" - return {"tgt_texts": tgt_line, "src_texts": source_line, "id": index - 1} - - def collate_fn(self, batch) -> Dict[str, torch.Tensor]: - """Call prepare_seq2seq_batch.""" - batch_encoding: Dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch( - [x["src_texts"] for x in batch], - tgt_texts=[x["tgt_texts"] for x in batch], - max_length=self.max_source_length, - max_target_length=self.max_target_length, - return_tensors="pt", - **self.dataset_kwargs, - ).data - batch_encoding["ids"] = torch.tensor([x["id"] for x in batch]) - return batch_encoding - - -class Seq2SeqDataCollator: - def __init__(self, tokenizer, data_args, tpu_num_cores=None): - self.tokenizer = tokenizer - self.pad_token_id = tokenizer.pad_token_id - assert ( - self.pad_token_id is not None - ), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined." - self.data_args = data_args - self.tpu_num_cores = tpu_num_cores - self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {} - if data_args.src_lang is not None: - self.dataset_kwargs["src_lang"] = data_args.src_lang - if data_args.tgt_lang is not None: - self.dataset_kwargs["tgt_lang"] = data_args.tgt_lang - - def __call__(self, batch) -> Dict[str, torch.Tensor]: - if hasattr(self.tokenizer, "prepare_seq2seq_batch"): - batch = self._encode(batch) - input_ids, attention_mask, labels = ( - batch["input_ids"], - batch["attention_mask"], - batch["labels"], - ) - else: - input_ids = torch.stack([x["input_ids"] for x in batch]) - attention_mask = torch.stack([x["attention_mask"] for x in batch]) - labels = torch.stack([x["labels"] for x in batch]) - - labels = trim_batch(labels, self.pad_token_id) - input_ids, attention_mask = trim_batch(input_ids, self.pad_token_id, attention_mask=attention_mask) - - if isinstance(self.tokenizer, T5Tokenizer): - decoder_input_ids = self._shift_right_t5(labels) - else: - decoder_input_ids = shift_tokens_right(labels, self.pad_token_id) - - batch = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "decoder_input_ids": decoder_input_ids, - "labels": labels, - } - return batch - - def _shift_right_t5(self, input_ids): - # shift inputs to the right - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = self.pad_token_id - return shifted_input_ids - - def _encode(self, batch) -> Dict[str, torch.Tensor]: - batch_encoding = self.tokenizer.prepare_seq2seq_batch( - [x["src_texts"] for x in batch], - tgt_texts=[x["tgt_texts"] for x in batch], - max_length=self.data_args.max_source_length, - max_target_length=self.data_args.max_target_length, - padding="max_length" if self.tpu_num_cores is not None else "longest", # TPU hack - return_tensors="pt", - **self.dataset_kwargs, - ) - return batch_encoding.data - - -class SortishSampler(Sampler): - "Go through the text data by order of src length with a bit of randomness. From fastai repo." 
- - def __init__(self, data, batch_size, shuffle=True): - self.data, self.bs, self.shuffle = data, batch_size, shuffle - - def __len__(self) -> int: - return len(self.data) - - def __iter__(self): - return iter(sortish_sampler_indices(self.data, self.bs, shuffle=self.shuffle)) - - -def sortish_sampler_indices(data: List, bs: int, shuffle=True) -> np.array: - "Go through the text data by order of src length with a bit of randomness. From fastai repo." - if not shuffle: - return np.argsort(np.array(data) * -1) - - def key_fn(i): - return data[i] - - idxs = np.random.permutation(len(data)) - sz = bs * 50 - ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)] - sort_idx = np.concatenate([sorted(s, key=key_fn, reverse=True) for s in ck_idx]) - sz = bs - ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)] - max_ck = np.argmax([key_fn(ck[0]) for ck in ck_idx]) # find the chunk with the largest key, - ck_idx[0], ck_idx[max_ck] = ck_idx[max_ck], ck_idx[0] # then make sure it goes first. - sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([], dtype=np.int) - sort_idx = np.concatenate((ck_idx[0], sort_idx)) - return sort_idx - - -class DistributedSortishSampler(Sampler): - """Copied from torch DistributedSampler""" - - def __init__(self, dataset, batch_size, num_replicas=None, rank=None, add_extra_examples=True, shuffle=True): - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - num_replicas = dist.get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - rank = dist.get_rank() - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - if add_extra_examples: - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - else: - self.total_size = len(dataset) - self.num_samples = len(self.available_indices) - self.batch_size = batch_size - self.add_extra_examples = add_extra_examples - self.shuffle = shuffle - - def __iter__(self) -> Iterable: - g = torch.Generator() - g.manual_seed(self.epoch) - - sortish_data = [self.dataset.src_lens[i] for i in self.available_indices] - sortish_indices = sortish_sampler_indices(sortish_data, self.batch_size, shuffle=self.shuffle) - indices = [self.available_indices[i] for i in sortish_indices] - assert len(indices) == self.num_samples - return iter(indices) - - @cached_property - def available_indices(self) -> np.array: - indices = list(range(len(self.dataset))) - # add extra samples to make it evenly divisible - indices += indices[: (self.total_size - len(indices))] - assert len(indices) == self.total_size - # subsample - available_indices = indices[self.rank : self.total_size : self.num_replicas] - return available_indices - - def __len__(self): - return self.num_samples - - def set_epoch(self, epoch): - self.epoch = epoch - - -logger = getLogger(__name__) - - -def use_task_specific_params(model, task): - """Update config with summarization specific params.""" - task_specific_params = model.config.task_specific_params - - if task_specific_params is not None: - pars = task_specific_params.get(task, {}) - logger.info(f"using task specific params for {task}: {pars}") - model.config.update(pars) - - -def pickle_load(path): - """pickle.load(path)""" - with open(path, "rb") as f: - return pickle.load(f) - - -def pickle_save(obj, path): 
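# Hedged usage sketch for the samplers above: SortishSampler is a plain torch Sampler,
# so it can be handed to a DataLoader together with the dataset's collate_fn. The
# dataset instance `ds` and the exact make_sortish_sampler keyword names are
# assumptions based on the surrounding code, not part of the patch.
from torch.utils.data import DataLoader

sampler = ds.make_sortish_sampler(batch_size=8, distributed=False)
loader = DataLoader(ds, batch_size=8, sampler=sampler, collate_fn=ds.collate_fn)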
- """pickle.dump(obj, path)""" - with open(path, "wb") as f: - return pickle.dump(obj, f) - - -def flatten_list(summary_ids: List[List]): - return [x for x in itertools.chain.from_iterable(summary_ids)] - - -def save_git_info(folder_path: str) -> None: - """Save git information to output_dir/git_log.json""" - repo_infos = get_git_info() - save_json(repo_infos, os.path.join(folder_path, "git_log.json")) - - -def save_json(content, path, indent=4, **json_dump_kwargs): - with open(path, "w") as f: - json.dump(content, f, indent=indent, **json_dump_kwargs) - - -def load_json(path): - with open(path) as f: - return json.load(f) - - -def get_git_info(): - try: - repo = git.Repo(search_parent_directories=True) - repo_infos = { - "repo_id": str(repo), - "repo_sha": str(repo.head.object.hexsha), - "repo_branch": str(repo.active_branch), - "hostname": str(socket.gethostname()), - } - return repo_infos - except TypeError: - return { - "repo_id": None, - "repo_sha": None, - "repo_branch": None, - "hostname": None, - } - - -ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"] - - -def extract_rouge_mid_statistics(dct): - new_dict = {} - for k1, v1 in dct.items(): - mid = v1.mid - new_dict[k1] = {stat: round(getattr(mid, stat), 4) for stat in ["precision", "recall", "fmeasure"]} - return new_dict - - -def calculate_rouge( - pred_lns: List[str], - tgt_lns: List[str], - use_stemmer=True, - rouge_keys=ROUGE_KEYS, - return_precision_and_recall=False, - bootstrap_aggregation=True, - newline_sep=True, -) -> Dict: - """Calculate rouge using rouge_scorer package. - - Args: - pred_lns: list of summaries generated by model - tgt_lns: list of groundtruth summaries (e.g. contents of val.target) - use_stemmer: Bool indicating whether Porter stemmer should be used to - strip word suffixes to improve matching. - rouge_keys: which metrics to compute, defaults to rouge1, rouge2, rougeL, rougeLsum - return_precision_and_recall: (False) whether to also return precision and recall. - bootstrap_aggregation: whether to do the typical bootstrap resampling of scores. Defaults to True, if False - this function returns a collections.defaultdict[metric: list of values for each observation for each subscore]`` - newline_sep:(default=True) whether to add newline between sentences. This is essential for calculation rougeL - on multi sentence summaries (CNN/DM dataset). 
- - Returns: - Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys - - """ - scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer) - aggregator = scoring.BootstrapAggregator() - for pred, tgt in zip(tgt_lns, pred_lns): - # rougeLsum expects "\n" separated sentences within a summary - if newline_sep: - pred = add_newline_to_end_of_each_sentence(pred) - tgt = add_newline_to_end_of_each_sentence(tgt) - scores = scorer.score(pred, tgt) - aggregator.add_scores(scores) - - if bootstrap_aggregation: - result = aggregator.aggregate() - if return_precision_and_recall: - return extract_rouge_mid_statistics(result) # here we return dict - else: - return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()} - - else: - return aggregator._scores # here we return defaultdict(list) - - -# Utilities for freezing parameters and checking whether they are frozen - - -def freeze_params(model: nn.Module): - """Set requires_grad=False for each of model.parameters()""" - for par in model.parameters(): - par.requires_grad = False - - -def freeze_embeds(model): - """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5.""" - model_type = model.config.model_type - - if model_type == "t5": - freeze_params(model.shared) - for d in [model.encoder, model.decoder]: - freeze_params(d.embed_tokens) - elif model_type == "fsmt": - for d in [model.model.encoder, model.model.decoder]: - freeze_params(d.embed_positions) - freeze_params(d.embed_tokens) - else: - freeze_params(model.model.shared) - for d in [model.model.encoder, model.model.decoder]: - freeze_params(d.embed_positions) - freeze_params(d.embed_tokens) - - -def grad_status(model: nn.Module) -> Iterable: - return (par.requires_grad for par in model.parameters()) - - -def any_requires_grad(model: nn.Module) -> bool: - return any(grad_status(model)) - - -def assert_all_frozen(model): - model_grads: List[bool] = list(grad_status(model)) - n_require_grad = sum(lmap(int, model_grads)) - npars = len(model_grads) - assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad" - - -def assert_not_all_frozen(model): - model_grads: List[bool] = list(grad_status(model)) - npars = len(model_grads) - assert any(model_grads), f"none of {npars} weights require grad" - - -def parse_numeric_n_bool_cl_kwargs(unparsed_args: List[str]) -> Dict[str, Union[int, float, bool]]: - """ - Parse an argv list of unspecified command line args to a dict. - Assumes all values are either numeric or boolean in the form of true/false. 
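# Hedged usage sketch for calculate_rouge above on toy strings; with the defaults it
# returns rounded f-measures (x100) keyed by ROUGE_KEYS.
preds = ["the cat sat on the mat", "dogs are great pets"]
refs = ["the cat was sitting on the mat", "dogs make great pets"]
scores = calculate_rouge(preds, refs)
# e.g. {"rouge1": ..., "rouge2": ..., "rougeL": ..., "rougeLsum": ...}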
- """ - result = {} - assert len(unparsed_args) % 2 == 0, f"got odd number of unparsed args: {unparsed_args}" - num_pairs = len(unparsed_args) // 2 - for pair_num in range(num_pairs): - i = 2 * pair_num - assert unparsed_args[i].startswith("--") - if unparsed_args[i + 1].lower() == "true": - value = True - elif unparsed_args[i + 1].lower() == "false": - value = False - else: - try: - value = int(unparsed_args[i + 1]) - except ValueError: - value = float(unparsed_args[i + 1]) # this can raise another informative ValueError - - result[unparsed_args[i][2:]] = value - return result - - -def write_txt_file(ordered_tgt, path): - f = Path(path).open("w") - for ln in ordered_tgt: - f.write(ln + "\n") - f.flush() - - -def chunks(lst, n): - """Yield successive n-sized chunks from lst.""" - for i in range(0, len(lst), n): - yield lst[i : i + n] - - -def check_output_dir(args, expected_items=0): - """ - Checks whether to bail out if output_dir already exists and has more than expected_items in it - - `args`: needs to have the following attributes of `args`: - - output_dir - - do_train - - overwrite_output_dir - - `expected_items`: normally 0 (default) - i.e. empty dir, but in some cases a few files are expected (e.g. recovery from OOM) - """ - if ( - os.path.exists(args.output_dir) - and len(os.listdir(args.output_dir)) > expected_items - and args.do_train - and not args.overwrite_output_dir - ): - raise ValueError( - f"Output directory ({args.output_dir}) already exists and " - f"has {len(os.listdir(args.output_dir))} items in it (expected {expected_items} items). " - "Use --overwrite_output_dir to overcome." - ) diff --git a/examples/research_projects/seq2seq-distillation/utils.py b/examples/research_projects/seq2seq-distillation/utils.py index b6994a1831da0a..c7f255e7e85267 100644 --- a/examples/research_projects/seq2seq-distillation/utils.py +++ b/examples/research_projects/seq2seq-distillation/utils.py @@ -21,7 +21,7 @@ from sentence_splitter import add_newline_to_end_of_each_sentence from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer from transformers.file_utils import cached_property -from transformers.models.bart.modeling_bart import shift_tokens_right +from transformers.models.mbart.modeling_mbart import shift_tokens_right try: diff --git a/examples/seq2seq/test_datasets.py b/examples/seq2seq/test_datasets.py index 7ef962b9c13af6..6792fcf6ddd6f5 100644 --- a/examples/seq2seq/test_datasets.py +++ b/examples/seq2seq/test_datasets.py @@ -23,7 +23,7 @@ from parameterized import parameterized from save_len_file import save_len_file from transformers import AutoTokenizer -from transformers.models.bart.modeling_bart import shift_tokens_right +from transformers.models.mbart.modeling_mbart import shift_tokens_right from transformers.testing_utils import TestCasePlus, require_torch_non_multi_gpu_but_fix_me, slow from utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset From a7442fb736bd60712c91bd500e748e98b5064432 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 00:09:17 +0100 Subject: [PATCH 26/51] finish docs for bart, mbart, and marian --- docs/source/model_doc/bart.rst | 1 - docs/source/model_doc/marian.rst | 9 +- src/transformers/models/bart/modeling_bart.py | 51 +- .../modeling_encoder_decoder.py | 11 + .../models/marian/configuration_marian.py | 2 +- .../models/marian/modeling_marian.py | 53 +- .../models/marian/tokenization_marian.py | 2 +- .../models/mbart/modeling_mbart.py | 56 +- 
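# Hedged usage sketch for parse_numeric_n_bool_cl_kwargs above: "--flag value" pairs
# become a dict, with "true"/"false" mapped to booleans and numbers cast to int/float.
extra = parse_numeric_n_bool_cl_kwargs(["--num_beams", "4", "--length_penalty", "0.8", "--early_stopping", "true"])
# extra == {"num_beams": 4, "length_penalty": 0.8, "early_stopping": True}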
.../models/prophetnet/modeling_prophetnet.py | 14 +- src/transformers/models/t5/modeling_t5.py | 14 +- ~ | 1667 +++++++++++++++++ 11 files changed, 1807 insertions(+), 73 deletions(-) create mode 100644 ~ diff --git a/docs/source/model_doc/bart.rst b/docs/source/model_doc/bart.rst index b167cf64fd8a0c..26c4e54d8019f0 100644 --- a/docs/source/model_doc/bart.rst +++ b/docs/source/model_doc/bart.rst @@ -64,7 +64,6 @@ Implementation Notes summarization, see the example in that docstrings. - Models that load the `facebook/bart-large-cnn` weights will not have a :obj:`mask_token_id`, or be able to perform mask-filling tasks. -- For training/forward passes that don't involve beam search, pass :obj:`use_cache=False`. Mask Filling ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/marian.rst b/docs/source/model_doc/marian.rst index d637f5ce41ba09..b7d97aae990e85 100644 --- a/docs/source/model_doc/marian.rst +++ b/docs/source/model_doc/marian.rst @@ -33,7 +33,6 @@ Implementation Notes - The modeling code is the same as :class:`~transformers.BartForConditionalGeneration` with a few minor modifications: - static (sinusoid) positional embeddings (:obj:`MarianConfig.static_position_embeddings=True`) - - a new final_logits_bias (:obj:`MarianConfig.add_bias_logits=True`) - no layernorm_embedding (:obj:`MarianConfig.normalize_embedding=False`) - the model starts generating with :obj:`pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses :obj:``), @@ -56,12 +55,10 @@ Examples - Since Marian models are smaller than many other translation models available in the library, they can be useful for fine-tuning experiments and integration tests. -- `Fine-tune on TPU - `__ - `Fine-tune on GPU - `__ + `__ - `Fine-tune on GPU with pytorch-lightning - `__ + `__ Multilingual Models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -186,12 +183,14 @@ MarianModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.MarianModel + :members: forward MarianMTModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.MarianMTModel + :members: forward TFMarianMTModel diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 3d085b7d01756f..d3acf41cbab39c 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -517,6 +517,22 @@ def __init_subclass__(self): >>> # Generate Summary >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + + Mask filling example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + >>> TXT = "My friends are but they eat too many carbs." 
+ + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() """ BART_INPUTS_DOCSTRING = r""" @@ -538,8 +554,21 @@ def __init_subclass__(self): `What are attention masks? <../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the :obj:`input_ids` to the right, following the paper. + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + + For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no + :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to + the right for denoising pre-training following the paper. decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. @@ -1139,22 +1168,6 @@ def forward( (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. Returns: - - Conditional generation example:: - - >>> from transformers import BartTokenizer, BartForConditionalGeneration - >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') - >>> TXT = "My friends are but they eat too many carbs." 
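# Rough sketch of the behaviour documented above for BART when no decoder_input_ids
# are passed: the input ids are shifted one position to the right, with the eos token
# as the first decoder token. The toy ids (eos id 2 for BART) are an assumption and
# this is a simplification for illustration, not the library's shift_tokens_right.
import torch

input_ids = torch.tensor([[0, 31414, 232, 2]])   # <s> ... </s>
eos_token_id = 2
decoder_input_ids = torch.cat([torch.tensor([[eos_token_id]]), input_ids[:, :-1]], dim=-1)
# decoder_input_ids == tensor([[2, 0, 31414, 232]])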
- - >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1446,8 +1459,6 @@ def forward( end_logits=end_logits, past_key_values=outputs.past_key_values, decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, encoder_last_hidden_state=outputs.encoder_last_hidden_state, encoder_hidden_states=outputs.encoder_hidden_states, encoder_attentions=outputs.encoder_attentions, diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 8d20a68c61abc1..90bae1d2c49397 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -77,6 +77,17 @@ `What are attention masks? <../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + Provide for sequence to sequence training to the decoder. Indices can be obtained using :class:`~transformers.PretrainedTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 54a12c1710f367..eb9272bfd4968a 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -81,7 +81,7 @@ class MarianConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) - Example:: + Examples:: >>> from transformers import MarianModel, MarianConfig diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 70182c1d01bc1c..b000e30aacdc38 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -27,7 +27,6 @@ from ...activations import ACT2FN from ...file_utils import ( - add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -523,8 +522,17 @@ def dummy_inputs(self): `What are attention masks? <../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. 
By default, the model will create this tensor by - shifting the :obj:`input_ids` to the right, following the paper. + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + Marian uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. @@ -964,12 +972,7 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(MARIAN_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="Helsinki-NLP/opus-mt-en-de", - output_type=Seq2SeqModelOutput, - config_class=_CONFIG_FOR_DOC, - ) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -985,6 +988,23 @@ def forward( output_hidden_states=None, return_dict=None, ): + r""" + Returns: + + Example:: + + >>> from transformers import MarianTokenizer, MarianModel + + >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') + >>> model = MarianModel.from_pretrained('Helsinki-NLP/opus-mt-en-de') + + >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer(" Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen", + ... return_tensors="pt", add_special_tokens=False).input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1117,21 +1137,6 @@ def forward( Returns: - Conditional generation example:: - - >>> from transformers import MarianTokenizer, MarianMTModel - >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') - >>> TXT = "My friends are but they eat too many carbs." 
- - >>> model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 3b4ede693f0baa..0fb03c53855391 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -84,7 +84,7 @@ class MarianTokenizer(PreTrainedTokenizer): >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de') >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."] >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional - >>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt") + >>> batch_enc = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt") >>> # keys [input_ids, attention_mask, labels]. >>> # model(**batch) should work """ diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 8a6aa90b4e303d..adc9cf456f7d35 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -507,12 +507,29 @@ def dummy_inputs(self): >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25') >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25') - >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> ARTICLE_TO_SUMMARIZE = "Meine Freunde sind cool, aber sie essen zu viel Kuchen." >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') >>> # Generate Summary >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + + Mask filling example:: + + >>> from transformers import MBartTokenizer, MBartForConditionalGeneration + >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25') + >>> # de_DE is the language symbol id for German + >>> TXT = " Meine Freunde sind nett aber sie essen zu viel Kuchen. de_DE" + + >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25') + >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() """ MBART_INPUTS_DOCSTRING = r""" @@ -536,6 +553,23 @@ def dummy_inputs(self): decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Provide for translation and summarization training. By default, the model will create this tensor by shifting the :obj:`input_ids` to the right, following the paper. 
+ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + MBart uses a specific language id token as the starting token for :obj:`decoder_input_ids` generation that + varies according to source and target language, *e.g.* 25004 for `en_XX`, and 25003 for `de_DE`. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + + For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no + :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to + the right for denoising pre-training following the paper. decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. @@ -1015,6 +1049,11 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # different to other models, MBart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id) + if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, @@ -1134,21 +1173,6 @@ def forward( Returns: - Conditional generation example:: - - >>> from transformers import MBartTokenizer, MBartForConditionalGeneration - >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25') - >>> TXT = "My friends are but they eat too many carbs." - - >>> model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 17db02b5b29d5e..682e039d4f3a78 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -87,8 +87,18 @@ `What are attention masks? <../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the :obj:`input_ids` to the right, following the paper. + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. 
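# Hedged usage sketch of the change above: MBartModel can now be called with only
# input_ids, and decoder_input_ids are then built internally via shift_tokens_right.
# The checkpoint name follows the surrounding examples; network access is assumed.
from transformers import MBartTokenizer, MBartModel

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
model = MBartModel.from_pretrained("facebook/mbart-large-cc25")
inputs = tokenizer(["Meine Freunde sind cool, aber sie essen zu viel Kuchen."], return_tensors="pt")
outputs = model(input_ids=inputs["input_ids"])   # no decoder_input_ids passed
last_hidden_state = outputs.last_hidden_state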
See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + ProphetNet uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 0ce2be3c62ac94..63b16f9e974bb4 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -1025,9 +1025,17 @@ def forward( `What are attention masks? <../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for sequence to sequence training. T5 uses the :obj:`pad_token_id` as the starting token for - :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last - :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + T5 uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training <./t5.html#training>`__. If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, diff --git a/~ b/~ new file mode 100644 index 00000000000000..71658d4cc16fca --- /dev/null +++ b/~ @@ -0,0 +1,1667 @@ +# coding=utf-8 +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch T5 model. 
""" + + +import copy +import math +import os + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ( + DUMMY_INPUTS, + DUMMY_MASK, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import logging +from ...utils.model_parallel_utils import assert_device_map, get_device_map +from .configuration_t5 import T5Config + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "T5Config" +_TOKENIZER_FOR_DOC = "T5Tokenizer" + +#################################################### +# This dict contains ids and associated url +# for the pretrained weights provided with the models +#################################################### +T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "t5-small", + "t5-base", + "t5-large", + "t5-3b", + "t5-11b", + # See all T5 models at https://huggingface.co/models?filter=t5 +] + + +#################################################### +# This is a conversion method from TF 1.0 to PyTorch +# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 +#################################################### +def load_tf_weights_in_t5(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + tf_weights = {} + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + tf_weights[name] = array + + for txt_name in names: + name = txt_name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info("Skipping {}".format("/".join(name))) + tf_weights.pop(txt_name, None) + continue + if "_slot_" in name[-1]: + logger.info("Skipping {}".format("/".join(name))) + tf_weights.pop(txt_name, None) + continue + pointer = model + array = tf_weights[txt_name] + + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + elif scope_names[0] == "self_attention": + pointer = getattr(pointer, "layer") + pointer = pointer[0] + elif scope_names[0] == "enc_dec_attention": + pointer = getattr(pointer, "layer") + pointer = pointer[1] + elif scope_names[0] == "dense_relu_dense": + pointer = getattr(pointer, "layer") + pointer = pointer[2] + elif scope_names[0] == "rms_norm": + if hasattr(pointer, "layer_norm"): + pointer = getattr(pointer, "layer_norm") + elif hasattr(pointer, "final_layer_norm"): + pointer = getattr(pointer, "final_layer_norm") + elif scope_names[0] == "scale": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + elif scope_names[0] == "decoder" and name[1] == "logits": + continue + elif scope_names[0] == "logits": + pointer = getattr(pointer, "lm_head") + elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit(): + pointer = getattr(pointer, f"wi_{scope_names[1]}") + continue + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if scope_names[0] not in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + if scope_names[0] != "embedding": + logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array.astype(np.float32)) + tf_weights.pop(txt_name, None) + + logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) + return model + + +#################################################### +# PyTorch Models are constructed by sub-classing +# - torch.nn.Module for the layers and +# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) 
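# Hedged usage sketch for load_tf_weights_in_t5 above (the checkpoint path is a
# placeholder): converting an original TF 1.x T5 checkpoint into the PyTorch model.
from transformers import T5Config, T5ForConditionalGeneration

config = T5Config.from_pretrained("t5-small")
model = T5ForConditionalGeneration(config)
model = load_tf_weights_in_t5(model, config, "/path/to/tf_checkpoint")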
+#################################################### +PARALLELIZE_DOCSTRING = r""" + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (:obj:`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the t5 models have the + following number of attention modules: + + - t5-small: 6 + - t5-base: 12 + - t5-large: 24 + - t5-3b: 24 + - t5-11b: 24 + + Example:: + + # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules: + model = T5ForConditionalGeneration.from_pretrained('t5-3b') + device_map = {0: [0, 1, 2], + + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23]} + model.parallelize(device_map) +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. + + Example:: + + # On a 4 GPU machine with t5-3b: + model = T5ForConditionalGeneration.from_pretrained('t5-3b') + device_map = {0: [0, 1, 2], + + 1: [3, 4, 5, 6, 7, 8, 9], + 2: [10, 11, 12, 13, 14, 15, 16], + 3: [17, 18, 19, 20, 21, 22, 23]} + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() +""" + + +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Construct a layernorm module in the T5 style No bias and no subtraction of mean. 
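# Hedged sketch building on the device_map example in PARALLELIZE_DOCSTRING above:
# t5-small has 6 attention blocks, so a two-GPU split could look as follows (the GPU
# count is an assumption of the example).
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
device_map = {0: [0, 1, 2], 1: [3, 4, 5]}
model.parallelize(device_map)    # blocks 0-2 on cuda:0, blocks 3-5 on cuda:1
# ... forward passes / generation ...
model.deparallelize()            # back to CPU, frees GPU memory via torch.cuda.empty_cache()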
+ """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # layer norm should always be calculated in float32 + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into float16 if necessary + if self.weight.dtype == torch.float16: + hidden_states = hidden_states.to(torch.float16) + return self.weight * hidden_states + + +class T5DenseReluDense(nn.Module): + def __init__(self, config): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = F.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5DenseGatedGeluDense(nn.Module): + def __init__(self, config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5LayerFF(nn.Module): + def __init__(self, config): + super().__init__() + if config.feed_forward_proj == "relu": + self.DenseReluDense = T5DenseReluDense(config) + elif config.feed_forward_proj == "gated-gelu": + self.DenseReluDense = T5DenseGatedGeluDense(config) + else: + raise ValueError( + f"{self.config.feed_forward_proj} is not supported. 
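# The layer norm above is a "no mean, no bias" (RMS) variant; written out on a toy
# float32 tensor it is equivalent to the following:
import torch

x = torch.randn(2, 4, 8)                            # (batch, seq, hidden)
weight, eps = torch.ones(8), 1e-6
variance = x.pow(2).mean(-1, keepdim=True)
y = weight * (x * torch.rsqrt(variance + eps))      # matches T5LayerNorm(8)(x) for float32 inputs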
Choose between `relu` and `gated-gelu`" + ) + + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class T5Attention(nn.Module): + def __init__(self, config: T5Config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads + ) + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.inner_dim = self.key_value_proj_dim * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).to(torch.long) * num_buckets + relative_position = torch.abs(relative_position) + else: + relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_postion_if_large = max_exact + ( + torch.log(relative_position.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_postion_if_large = torch.min( + relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_position, relative_postion_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length): + """ Compute binned relative position bias """ + context_position = torch.arange(query_length, dtype=torch.long)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long)[None, :] + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + ) + relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + assert ( + len(past_key_value) == 2 + ), "past_key_value should have 2 past states: keys and values. 
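# Hedged sketch of the bucketing above: _relative_position_bucket is a staticmethod,
# so it can be probed directly on a toy tensor of relative positions
# (memory_position - query_position) with the default settings.
import torch

rel_pos = torch.tensor([[-4, -1, 0, 1, 4, 100]])
buckets = T5Attention._relative_position_bucket(rel_pos, bidirectional=True)
# nearby offsets get their own buckets; distant offsets share log-spaced buckets, and
# everything at or beyond max_distance=128 maps to the last bucket on its side.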
Got {} past states".format( + len(past_key_value) + ) + real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """ projection """ + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """ reshape """ + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """ projects hidden states correctly to key/query states """ + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None + ) + value_states = project( + hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None + ) + + # compute scores + scores = torch.matmul( + query_states, key_states.transpose(3, 2) + ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + ) + else: + position_bias = self.compute_bias(real_seq_length, key_length) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -seq_length:, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + scores += position_bias + attn_weights = F.softmax(scores.float(), dim=-1).type_as( + scores + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = F.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class T5LayerSelfAttention(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + 
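# The shape()/unshape() helpers above just split the hidden size into
# (n_heads, dim_per_head) and merge it back; a toy round trip with made-up sizes:
import torch

batch_size, seq_length, n_heads, d_head = 1, 3, 2, 4
states = torch.randn(batch_size, seq_length, n_heads * d_head)
split = states.view(batch_size, -1, n_heads, d_head).transpose(1, 2)                # (bs, heads, seq, d_head)
merged = split.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_head)  # back to (bs, seq, inner_dim)
assert torch.equal(states, merged)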
hidden_states, + attention_mask=None, + position_bias=None, + head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + head_mask=head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5LayerCrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + head_mask=None, + past_key_value=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + head_mask=head_mask, + past_key_value=past_key_value, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5Block(nn.Module): + def __init__(self, config, has_relative_attention_bias=False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.ModuleList() + self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(T5LayerCrossAttention(config)) + + self.layer.append(T5LayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + head_mask=None, + past_key_value=None, + use_cache=False, + output_attentions=False, + return_dict=True, + ): + + if past_key_value is not None: + assert self.is_decoder, "Only decoder can use `past_key_values`" + expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 + + error_message = "There should be {} past states. 
2 (past / key) for self attention.{} Got {} past key / value states".format( + expected_num_past_key_values, + "2 (past / key) for cross attention" if expected_num_past_key_values == 4 else "", + len(past_key_value), + ) + assert len(past_key_value) == expected_num_past_key_values, error_message + + self_attn_past_key_value = past_key_value[:2] + cross_attn_past_key_value = past_key_value[2:] + else: + self_attn_past_key_value, cross_attn_past_key_value = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + head_mask=head_mask, + past_key_value=self_attn_past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask, + past_key_value=cross_attn_past_key_value, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + outputs = (hidden_states,) + + outputs = outputs + (present_key_value_state,) + attention_outputs + return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) + + +class T5PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = T5Config + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = "transformer" + + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = { + "decoder_input_ids": input_ids, + "input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return dummy_inputs + + def _init_weights(self, module): + """ Initialize the weights """ + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, T5LayerNorm): + module.weight.data.fill_(factor * 1.0) + elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel)): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) + elif isinstance(module, T5DenseReluDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5DenseGatedGeluDense): + module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: + module.wi_0.bias.data.zero_() + module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: + module.wi_1.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + key_value_proj_dim = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert ( + decoder_start_token_id is not None + ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. 
See T5 docs for more information" + + # shift inputs to the right + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values" + + return shifted_input_ids + + +class T5Stack(T5PreTrainedModel): + def __init__(self, config, embed_tokens=None): + super().__init__(config) + + self.embed_tokens = embed_tokens + self.is_decoder = config.is_decoder + + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + self.init_weights() + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map + ) + assert_device_map(self.device_map, len(self.block)) + self.model_parallel = True + self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + # Load onto devices + for k, v in self.device_map.items(): + for layer in v: + cuda_device = "cuda:" + str(k) + self.block[layer] = self.block[layer].to(cuda_device) + + # Set embed_tokens to first layer + self.embed_tokens = self.embed_tokens.to(self.first_device) + # Set final layer norm to last device + self.final_layer_norm = self.final_layer_norm.to(self.last_device) + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def deparallelize(self): + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + for i in range(len(self.block)): + self.block[i] = self.block[i].to("cpu") + self.embed_tokens = self.embed_tokens.to("cpu") + self.final_layer_norm = self.final_layer_norm.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(self.first_device) + self.embed_tokens = self.embed_tokens.to(self.first_device) + use_cache = use_cache if use_cache is not None else self.config.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + 
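+            # `input_ids` and `inputs_embeds` are mutually exclusive ways of providing the inputs; the prefix only
+            # makes the error message clearer when this stack is used as a decoder.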
raise ValueError( + f"You cannot specify both {err_msg_prefix}inputs and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds") + + if inputs_embeds is None: + assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length + + if use_cache is True: + assert self.is_decoder, ":obj:`use_cache` can only be set to `True` if {} is used as a decoder".format( + self + ) + + if attention_mask is None: + attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device) + if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = torch.ones( + batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long + ) + + # initialize past_key_values with `None` if past does not exist + if past_key_values is None: + past_key_values = [None] * len(self.block) + + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device) + + if self.is_decoder and encoder_attention_mask is not None: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if position_bias is not None: + position_bias = position_bias.to(hidden_states.device) + if encoder_hidden_states is not None: + encoder_hidden_states = encoder_hidden_states.to(hidden_states.device) + if encoder_extended_attention_mask is not None: + encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device) + if encoder_decoder_position_bias is not None: + encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + 
encoder_attention_mask=encoder_extended_attention_mask,
+                encoder_decoder_position_bias=encoder_decoder_position_bias,
+                head_mask=head_mask[i],
+                past_key_value=past_key_value,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+            )
+            # layer_outputs is a tuple with:
+            # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+            hidden_states, present_key_value_state = layer_outputs[:2]
+
+            # We share the position biases between the layers - the first layer stores them
+            # layer_outputs = hidden-states, key-value-states (self-attention weights),
+            # (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+            position_bias = layer_outputs[2]
+            if self.is_decoder and encoder_hidden_states is not None:
+                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
+            # append next layer key value states
+            if use_cache:
+                present_key_value_states = present_key_value_states + (present_key_value_state,)
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[3],)
+                if self.is_decoder:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    present_key_value_states,
+                    all_hidden_states,
+                    all_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_value_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+T5_START_DOCSTRING = r"""
+
+    The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
+    <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
+    Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder-decoder transformer pre-trained in a text-to-text
+    denoising generative setting.
+
+    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
+    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html>`__
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
+    general usage and behavior.
+
+    Parameters:
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using :class:`~transformers.T5Tokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + detail. + + To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training + <./t5.html#training>`__. + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + T5 uses the :obj:`pad_token_id` as the starting token + for :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last + :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). + + To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training + <./t5.html#training>`__. If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, + :obj:`decoder_input_ids` takes the value of :obj:`input_ids`. + decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, :obj:`optional`: `hidden_states`, :obj:`optional`: + `attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a + sequence of hidden states at the output of the last layer of the encoder. Used in the cross-attention of + the decoder. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. 
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + +T5_ENCODER_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you + should be able to pad the inputs on both the right and the left. + + Indices can be obtained using :class:`~transformers.T5Tokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + detail. + + To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training + <./t5.html#training>`__. + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. 
See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", + T5_START_DOCSTRING, +) +class T5Model(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r"encoder\.embed_tokens\.weight", + r"decoder\.embed_tokens\.weight", + ] + _keys_to_ignore_on_load_unexpected = [ + r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight", + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to("cpu") + self.decoder = self.decoder.to("cpu") + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + head_mask=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Example:: + + >>> from transformers import T5Tokenizer, T5Model + + >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') + >>> model = T5Model.from_pretrained('t5-small') + + >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + """ + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + hidden_states = encoder_outputs[0] + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + 
encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) +class T5ForConditionalGeneration(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r"encoder\.embed_tokens\.weight", + r"decoder\.embed_tokens\.weight", + r"lm_head\.weight", + ] + _keys_to_ignore_on_load_unexpected = [ + r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight", + ] + + def __init__(self, config): + super().__init__(config) + self.model_dim = config.d_model + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = T5Stack(decoder_config, self.shared) + + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.decoder.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.decoder.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.decoder.deparallelize() + self.encoder = self.encoder.to("cpu") + self.decoder = self.decoder.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + return self.lm_head + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + head_mask=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ..., + config.vocab_size - 1]`. 
All labels set to ``-100`` are ignored (masked), the loss is only computed for
+            labels in ``[0, ..., config.vocab_size - 1]``
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+
+            >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+            >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+            >>> outputs = model(input_ids=input_ids, labels=labels)
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
+
+            >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you ", return_tensors="pt").input_ids  # Batch size 1
+            >>> outputs = model.generate(input_ids)
+        """
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            # Convert encoder inputs in embeddings if needed
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.model_parallel:
+            torch.cuda.set_device(self.decoder.first_device)
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        # If decoding with past key value states, only the last tokens
+        # should be given as an input
+        if past_key_values is not None:
+            assert labels is None, "Decoder should not use cached key value states when training."
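+            # With cached key/value states, all previous decoder positions have already been processed,
+            # so only the most recent token is fed through the decoder below
+            # (e.g. `decoder_input_ids` of shape (batch_size, seq_len) is cut down to (batch_size, 1)).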
+ if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + if decoder_inputs_embeds is not None: + decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.decoder.first_device) + hidden_states = hidden_states.to(self.decoder.first_device) + if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) + if attention_mask is not None: + attention_mask = attention_mask.to(self.decoder.first_device) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.encoder.first_device) + self.lm_head = self.lm_head.to(self.encoder.first_device) + sequence_output = sequence_output.to(self.lm_head.weight.device) + + if self.config.tie_word_embeddings: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim ** -0.5) + + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) + # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "past_key_values": past, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "use_cache": use_cache, + } + + def _reorder_cache(self, past, beam_idx): + # if decoder past is not included in output + # speedy decoding is disabled and no need to reorder + if past is None: + logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") + return past + + reordered_decoder_past = () + for layer_past_states in past: + # get the correct batch idx from layer past batch dim + # batch dim of `past` is at 2nd position + reordered_layer_past_states = () + for layer_past_state in 
layer_past_states: + # need to set correct `past` for each of the four key / value states + reordered_layer_past_states = reordered_layer_past_states + ( + layer_past_state.index_select(0, beam_idx), + ) + + assert reordered_layer_past_states[0].shape == layer_past_states[0].shape + assert len(reordered_layer_past_states) == len(layer_past_states) + + reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) + return reordered_decoder_past + + +@add_start_docstrings( + "The bare T5 Model transformer outputting encoder's raw hidden-states" "without any specific head on top.", + T5_START_DOCSTRING, +) +class T5EncoderModel(T5PreTrainedModel): + authorized_missing_keys = [ + r"encoder\.embed_tokens\.weight", + ] + + def __init__(self, config: T5Config): + super().__init__(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + self.init_weights() + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.encoder.deparallelize() + self.encoder = self.encoder.to("cpu") + self.model_parallel = False + self.device_map = None + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Example:: + + >>> from transformers import T5Tokenizer, T5EncoderModel + >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') + >>> model = T5EncoderModel.from_pretrained('t5-small') + >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids) + >>> last_hidden_states = outputs.last_hidden_state + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return encoder_outputs From 9332a0629a264511934257ed37bb280bbd96595a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 00:09:34 +0100 Subject: [PATCH 27/51] delete unnecessary file --- ~ | 1667 ------------------------------------------------------------- 1 file changed, 1667 deletions(-) delete mode 100644 ~ diff --git a/~ b/~ deleted file mode 100644 index 71658d4cc16fca..00000000000000 --- a/~ +++ /dev/null @@ -1,1667 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch T5 model. 
""" - - -import copy -import math -import os - -import torch -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss - -from ...activations import ACT2FN -from ...file_utils import ( - DUMMY_INPUTS, - DUMMY_MASK, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) -from ...modeling_outputs import ( - BaseModelOutput, - BaseModelOutputWithPastAndCrossAttentions, - Seq2SeqLMOutput, - Seq2SeqModelOutput, -) -from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import logging -from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_t5 import T5Config - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "T5Config" -_TOKENIZER_FOR_DOC = "T5Tokenizer" - -#################################################### -# This dict contains ids and associated url -# for the pretrained weights provided with the models -#################################################### -T5_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "t5-small", - "t5-base", - "t5-large", - "t5-3b", - "t5-11b", - # See all T5 models at https://huggingface.co/models?filter=t5 -] - - -#################################################### -# This is a conversion method from TF 1.0 to PyTorch -# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 -#################################################### -def load_tf_weights_in_t5(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." 
- ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - tf_weights = {} - for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - tf_weights[name] = array - - for txt_name in names: - name = txt_name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info("Skipping {}".format("/".join(name))) - tf_weights.pop(txt_name, None) - continue - if "_slot_" in name[-1]: - logger.info("Skipping {}".format("/".join(name))) - tf_weights.pop(txt_name, None) - continue - pointer = model - array = tf_weights[txt_name] - - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] in ["kernel", "scale", "embedding"]: - pointer = getattr(pointer, "weight") - elif scope_names[0] == "self_attention": - pointer = getattr(pointer, "layer") - pointer = pointer[0] - elif scope_names[0] == "enc_dec_attention": - pointer = getattr(pointer, "layer") - pointer = pointer[1] - elif scope_names[0] == "dense_relu_dense": - pointer = getattr(pointer, "layer") - pointer = pointer[2] - elif scope_names[0] == "rms_norm": - if hasattr(pointer, "layer_norm"): - pointer = getattr(pointer, "layer_norm") - elif hasattr(pointer, "final_layer_norm"): - pointer = getattr(pointer, "final_layer_norm") - elif scope_names[0] == "scale": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - elif scope_names[0] == "decoder" and name[1] == "logits": - continue - elif scope_names[0] == "logits": - pointer = getattr(pointer, "lm_head") - elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit(): - pointer = getattr(pointer, f"wi_{scope_names[1]}") - continue - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info("Skipping {}".format("/".join(name))) - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if scope_names[0] not in ["kernel", "scale", "embedding"]: - pointer = getattr(pointer, "weight") - if scope_names[0] != "embedding": - logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) - array = np.transpose(array) - try: - assert ( - pointer.shape == array.shape - ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array.astype(np.float32)) - tf_weights.pop(txt_name, None) - - logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) - return model - - -#################################################### -# PyTorch Models are constructed by sub-classing -# - torch.nn.Module for the layers and -# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) 
-#################################################### -PARALLELIZE_DOCSTRING = r""" - Uses a device map to distribute attention modules of the model across several devices. If no device map is given, - it will evenly distribute blocks across all devices. - - Args: - device_map (:obj:`Dict[int, list]`, optional, defaults to None): - A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always - automatically mapped to the first device (for esoteric reasons). That means that the first device should - have fewer attention modules mapped to it than other devices. For reference, the t5 models have the - following number of attention modules: - - - t5-small: 6 - - t5-base: 12 - - t5-large: 24 - - t5-3b: 24 - - t5-11b: 24 - - Example:: - - # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules: - model = T5ForConditionalGeneration.from_pretrained('t5-3b') - device_map = {0: [0, 1, 2], - - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23]} - model.parallelize(device_map) -""" -DEPARALLELIZE_DOCSTRING = r""" - Moves the model to cpu from a model parallel state. - - Example:: - - # On a 4 GPU machine with t5-3b: - model = T5ForConditionalGeneration.from_pretrained('t5-3b') - device_map = {0: [0, 1, 2], - - 1: [3, 4, 5, 6, 7, 8, 9], - 2: [10, 11, 12, 13, 14, 15, 16], - 3: [17, 18, 19, 20, 21, 22, 23]} - model.parallelize(device_map) # Splits the model across several devices - model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() -""" - - -class T5LayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Construct a layernorm module in the T5 style No bias and no subtraction of mean. 
- """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - # layer norm should always be calculated in float32 - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into float16 if necessary - if self.weight.dtype == torch.float16: - hidden_states = hidden_states.to(torch.float16) - return self.weight * hidden_states - - -class T5DenseReluDense(nn.Module): - def __init__(self, config): - super().__init__() - self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward(self, hidden_states): - hidden_states = self.wi(hidden_states) - hidden_states = F.relu(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states - - -class T5DenseGatedGeluDense(nn.Module): - def __init__(self, config): - super().__init__() - self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) - self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout_rate) - self.gelu_act = ACT2FN["gelu_new"] - - def forward(self, hidden_states): - hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) - hidden_linear = self.wi_1(hidden_states) - hidden_states = hidden_gelu * hidden_linear - hidden_states = self.dropout(hidden_states) - hidden_states = self.wo(hidden_states) - return hidden_states - - -class T5LayerFF(nn.Module): - def __init__(self, config): - super().__init__() - if config.feed_forward_proj == "relu": - self.DenseReluDense = T5DenseReluDense(config) - elif config.feed_forward_proj == "gated-gelu": - self.DenseReluDense = T5DenseGatedGeluDense(config) - else: - raise ValueError( - f"{self.config.feed_forward_proj} is not supported. 
Choose between `relu` and `gated-gelu`" - ) - - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward(self, hidden_states): - forwarded_states = self.layer_norm(hidden_states) - forwarded_states = self.DenseReluDense(forwarded_states) - hidden_states = hidden_states + self.dropout(forwarded_states) - return hidden_states - - -class T5Attention(nn.Module): - def __init__(self, config: T5Config, has_relative_attention_bias=False): - super().__init__() - self.is_decoder = config.is_decoder - self.has_relative_attention_bias = has_relative_attention_bias - - self.relative_attention_num_buckets = config.relative_attention_num_buckets - self.d_model = config.d_model - self.key_value_proj_dim = config.d_kv - self.n_heads = config.num_heads - self.dropout = config.dropout_rate - self.inner_dim = self.n_heads * self.key_value_proj_dim - - # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) - self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) - - if self.has_relative_attention_bias: - self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads - ) - # Prune linear layers - self.q = prune_linear_layer(self.q, index) - self.k = prune_linear_layer(self.k, index) - self.v = prune_linear_layer(self.v, index) - self.o = prune_linear_layer(self.o, index, dim=1) - # Update hyper params - self.n_heads = self.n_heads - len(heads) - self.inner_dim = self.key_value_proj_dim * self.n_heads - self.pruned_heads = self.pruned_heads.union(heads) - - @staticmethod - def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): - """ - Adapted from Mesh Tensorflow: - https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 - - Translate relative position to a bucket number for relative attention. The relative position is defined as - memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to - position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for - small absolute relative_position and larger buckets for larger absolute relative_positions. All relative - positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
- This should allow for more graceful generalization to longer sequences than the model has been trained on - - Args: - relative_position: an int32 Tensor - bidirectional: a boolean - whether the attention is bidirectional - num_buckets: an integer - max_distance: an integer - - Returns: - a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) - """ - relative_buckets = 0 - if bidirectional: - num_buckets //= 2 - relative_buckets += (relative_position > 0).to(torch.long) * num_buckets - relative_position = torch.abs(relative_position) - else: - relative_position = -torch.min(relative_position, torch.zeros_like(relative_position)) - # now relative_position is in the range [0, inf) - - # half of the buckets are for exact increments in positions - max_exact = num_buckets // 2 - is_small = relative_position < max_exact - - # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance - relative_postion_if_large = max_exact + ( - torch.log(relative_position.float() / max_exact) - / math.log(max_distance / max_exact) - * (num_buckets - max_exact) - ).to(torch.long) - relative_postion_if_large = torch.min( - relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1) - ) - - relative_buckets += torch.where(is_small, relative_position, relative_postion_if_large) - return relative_buckets - - def compute_bias(self, query_length, key_length): - """ Compute binned relative position bias """ - context_position = torch.arange(query_length, dtype=torch.long)[:, None] - memory_position = torch.arange(key_length, dtype=torch.long)[None, :] - relative_position = memory_position - context_position # shape (query_length, key_length) - relative_position_bucket = self._relative_position_bucket( - relative_position, # shape (query_length, key_length) - bidirectional=(not self.is_decoder), - num_buckets=self.relative_attention_num_buckets, - ) - relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device) - values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) - values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) - return values - - def forward( - self, - hidden_states, - mask=None, - key_value_states=None, - position_bias=None, - past_key_value=None, - head_mask=None, - query_length=None, - use_cache=False, - output_attentions=False, - ): - """ - Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). - """ - # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) - # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head) - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key_value is not None: - assert ( - len(past_key_value) == 2 - ), "past_key_value should have 2 past states: keys and values. 
Got {} past states".format( - len(past_key_value) - ) - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """ projection """ - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """ reshape """ - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """ projects hidden states correctly to key/query states """ - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) - - if past_key_value is not None: - if key_value_states is None: - # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - - # get key/value states - key_states = project( - hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None - ) - value_states = project( - hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None - ) - - # compute scores - scores = torch.matmul( - query_states, key_states.transpose(3, 2) - ) # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 - - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype - ) - else: - position_bias = self.compute_bias(real_seq_length, key_length) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -seq_length:, :] - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - scores += position_bias - attn_weights = F.softmax(scores.float(), dim=-1).type_as( - scores - ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = F.dropout( - attn_weights, p=self.dropout, training=self.training - ) # (batch_size, n_heads, seq_length, key_length) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = unshape(torch.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) - - present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) - - if output_attentions: - outputs = outputs + (attn_weights,) - return outputs - - -class T5LayerSelfAttention(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - 
hidden_states, - attention_mask=None, - position_bias=None, - head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.SelfAttention( - normed_hidden_states, - mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = hidden_states + self.dropout(attention_output[0]) - outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them - return outputs - - -class T5LayerCrossAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False) - self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - def forward( - self, - hidden_states, - key_value_states, - attention_mask=None, - position_bias=None, - head_mask=None, - past_key_value=None, - use_cache=False, - query_length=None, - output_attentions=False, - ): - normed_hidden_states = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention( - normed_hidden_states, - mask=attention_mask, - key_value_states=key_value_states, - position_bias=position_bias, - head_mask=head_mask, - past_key_value=past_key_value, - use_cache=use_cache, - query_length=query_length, - output_attentions=output_attentions, - ) - layer_output = hidden_states + self.dropout(attention_output[0]) - outputs = (layer_output,) + attention_output[1:] # add attentions if we output them - return outputs - - -class T5Block(nn.Module): - def __init__(self, config, has_relative_attention_bias=False): - super().__init__() - self.is_decoder = config.is_decoder - self.layer = nn.ModuleList() - self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) - if self.is_decoder: - self.layer.append(T5LayerCrossAttention(config)) - - self.layer.append(T5LayerFF(config)) - - def forward( - self, - hidden_states, - attention_mask=None, - position_bias=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, - head_mask=None, - past_key_value=None, - use_cache=False, - output_attentions=False, - return_dict=True, - ): - - if past_key_value is not None: - assert self.is_decoder, "Only decoder can use `past_key_values`" - expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 - - error_message = "There should be {} past states. 
2 (past / key) for self attention.{} Got {} past key / value states".format( - expected_num_past_key_values, - "2 (past / key) for cross attention" if expected_num_past_key_values == 4 else "", - len(past_key_value), - ) - assert len(past_key_value) == expected_num_past_key_values, error_message - - self_attn_past_key_value = past_key_value[:2] - cross_attn_past_key_value = past_key_value[2:] - else: - self_attn_past_key_value, cross_attn_past_key_value = None, None - - self_attention_outputs = self.layer[0]( - hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask, - past_key_value=self_attn_past_key_value, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states, present_key_value_state = self_attention_outputs[:2] - attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights - - do_cross_attention = self.is_decoder and encoder_hidden_states is not None - if do_cross_attention: - # the actual query length is unknown for cross attention - # if using past key value states. Need to inject it here - if present_key_value_state is not None: - query_length = present_key_value_state[0].shape[2] - else: - query_length = None - - cross_attention_outputs = self.layer[1]( - hidden_states, - key_value_states=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - head_mask=head_mask, - past_key_value=cross_attn_past_key_value, - query_length=query_length, - use_cache=use_cache, - output_attentions=output_attentions, - ) - hidden_states = cross_attention_outputs[0] - # Combine self attn and cross attn key value states - if present_key_value_state is not None: - present_key_value_state = present_key_value_state + cross_attention_outputs[1] - - # Keep cross-attention outputs and relative position weights - attention_outputs = attention_outputs + cross_attention_outputs[2:] - - # Apply Feed Forward layer - hidden_states = self.layer[-1](hidden_states) - outputs = (hidden_states,) - - outputs = outputs + (present_key_value_state,) + attention_outputs - return outputs # hidden-states, present_key_value_states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) - - -class T5PreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = T5Config - load_tf_weights = load_tf_weights_in_t5 - base_model_prefix = "transformer" - - @property - def dummy_inputs(self): - input_ids = torch.tensor(DUMMY_INPUTS) - input_mask = torch.tensor(DUMMY_MASK) - dummy_inputs = { - "decoder_input_ids": input_ids, - "input_ids": input_ids, - "decoder_attention_mask": input_mask, - } - return dummy_inputs - - def _init_weights(self, module): - """ Initialize the weights """ - factor = self.config.initializer_factor # Used for testing weights initialization - if isinstance(module, T5LayerNorm): - module.weight.data.fill_(factor * 1.0) - elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel)): - # Mesh TensorFlow embeddings initialization - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 - module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) - elif isinstance(module, T5DenseReluDense): - # Mesh TensorFlow FF initialization - # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 - # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 - module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi, "bias") and module.wi.bias is not None: - module.wi.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, "bias") and module.wo.bias is not None: - module.wo.bias.data.zero_() - elif isinstance(module, T5DenseGatedGeluDense): - module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None: - module.wi_0.bias.data.zero_() - module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) - if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None: - module.wi_1.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, "bias") and module.wo.bias is not None: - module.wo.bias.data.zero_() - elif isinstance(module, T5Attention): - # Mesh TensorFlow attention initialization to avoid scaling before softmax - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 - d_model = self.config.d_model - key_value_proj_dim = self.config.d_kv - n_heads = self.config.num_heads - module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) - module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) - module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) - module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) - if module.has_relative_attention_bias: - module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) - - def _shift_right(self, input_ids): - decoder_start_token_id = self.config.decoder_start_token_id - pad_token_id = self.config.pad_token_id - - assert ( - decoder_start_token_id is not None - ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. 
See T5 docs for more information" - - # shift inputs to the right - shifted_input_ids = input_ids.new_zeros(input_ids.shape) - shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() - shifted_input_ids[..., 0] = decoder_start_token_id - - assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." - # replace possible -100 values in labels by `pad_token_id` - shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - - assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values" - - return shifted_input_ids - - -class T5Stack(T5PreTrainedModel): - def __init__(self, config, embed_tokens=None): - super().__init__(config) - - self.embed_tokens = embed_tokens - self.is_decoder = config.is_decoder - - self.block = nn.ModuleList( - [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] - ) - self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout_rate) - - self.init_weights() - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - # Check validity of device_map - self.device_map = ( - get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map - ) - assert_device_map(self.device_map, len(self.block)) - self.model_parallel = True - self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) - self.last_device = "cuda:" + str(max(self.device_map.keys())) - # Load onto devices - for k, v in self.device_map.items(): - for layer in v: - cuda_device = "cuda:" + str(k) - self.block[layer] = self.block[layer].to(cuda_device) - - # Set embed_tokens to first layer - self.embed_tokens = self.embed_tokens.to(self.first_device) - # Set final layer norm to last device - self.final_layer_norm = self.final_layer_norm.to(self.last_device) - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def deparallelize(self): - self.model_parallel = False - self.device_map = None - self.first_device = "cpu" - self.last_device = "cpu" - for i in range(len(self.block)): - self.block[i] = self.block[i].to("cpu") - self.embed_tokens = self.embed_tokens.to("cpu") - self.final_layer_norm = self.final_layer_norm.to("cpu") - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, new_embeddings): - self.embed_tokens = new_embeddings - - def forward( - self, - input_ids=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - inputs_embeds=None, - head_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(self.first_device) - self.embed_tokens = self.embed_tokens.to(self.first_device) - use_cache = use_cache if use_cache is not None else self.config.use_cache - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - err_msg_prefix = "decoder_" if self.is_decoder else "" - 
raise ValueError( - f"You cannot specify both {err_msg_prefix}inputs and {err_msg_prefix}inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - err_msg_prefix = "decoder_" if self.is_decoder else "" - raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds") - - if inputs_embeds is None: - assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" - inputs_embeds = self.embed_tokens(input_ids) - - batch_size, seq_length = input_shape - - # required mask seq length can be calculated via length of past - mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length - - if use_cache is True: - assert self.is_decoder, ":obj:`use_cache` can only be set to `True` if {} is used as a decoder".format( - self - ) - - if attention_mask is None: - attention_mask = torch.ones(batch_size, mask_seq_length).to(inputs_embeds.device) - if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: - encoder_seq_length = encoder_hidden_states.shape[1] - encoder_attention_mask = torch.ones( - batch_size, encoder_seq_length, device=inputs_embeds.device, dtype=torch.long - ) - - # initialize past_key_values with `None` if past does not exist - if past_key_values is None: - past_key_values = [None] * len(self.block) - - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, inputs_embeds.device) - - if self.is_decoder and encoder_attention_mask is not None: - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - head_mask = self.get_head_mask(head_mask, self.config.num_layers) - present_key_value_states = () if use_cache else None - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - all_cross_attentions = () if (output_attentions and self.is_decoder) else None - position_bias = None - encoder_decoder_position_bias = None - - hidden_states = self.dropout(inputs_embeds) - - for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)): - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if position_bias is not None: - position_bias = position_bias.to(hidden_states.device) - if encoder_hidden_states is not None: - encoder_hidden_states = encoder_hidden_states.to(hidden_states.device) - if encoder_extended_attention_mask is not None: - encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device) - if encoder_decoder_position_bias is not None: - encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module( - hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - 
encoder_attention_mask=encoder_extended_attention_mask,
-                encoder_decoder_position_bias=encoder_decoder_position_bias,
-                head_mask=head_mask[i],
-                past_key_value=past_key_value,
-                use_cache=use_cache,
-                output_attentions=output_attentions,
-            )
-            # layer_outputs is a tuple with:
-            # hidden-states, key-value-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-            hidden_states, present_key_value_state = layer_outputs[:2]
-
-            # We share the position biases between the layers - the first layer stores them
-            # layer_outputs = hidden-states, key-value-states (self-attention weights),
-            # (self-attention position bias), (cross-attention weights), (cross-attention position bias)
-            position_bias = layer_outputs[2]
-            if self.is_decoder and encoder_hidden_states is not None:
-                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
-            # append next layer key value states
-            if use_cache:
-                present_key_value_states = present_key_value_states + (present_key_value_state,)
-
-            if output_attentions:
-                all_attentions = all_attentions + (layer_outputs[3],)
-                if self.is_decoder:
-                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
-
-            # Model Parallel: If it's the last layer for that device, put things on the next device
-            if self.model_parallel:
-                for k, v in self.device_map.items():
-                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
-                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
-
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-
-        # Add last layer
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(
-                v
-                for v in [
-                    hidden_states,
-                    present_key_value_states,
-                    all_hidden_states,
-                    all_attentions,
-                    all_cross_attentions,
-                ]
-                if v is not None
-            )
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=present_key_value_states,
-            hidden_states=all_hidden_states,
-            attentions=all_attentions,
-            cross_attentions=all_cross_attentions,
-        )
-
-
-T5_START_DOCSTRING = r"""
-
-    The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
-    <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
-    Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. It's an encoder decoder transformer pre-trained in a text-to-text
-    denoising generative setting.
-
-    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
-    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
-    pruning heads etc.)
-
-    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
-    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
-    general usage and behavior.
-
-    Parameters:
-        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
-            weights.
-"""
-
-T5_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary.
T5 is a model with relative position embeddings so you - should be able to pad the inputs on both the right and the left. - - Indices can be obtained using :class:`~transformers.T5Tokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - detail. - - To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training - <./t5.html#training>`__. - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Indices of decoder input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~transformers.BartTokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - details. - - `What are input IDs? <../glossary.html#input-ids>`__ - - T5 uses the :obj:`pad_token_id` as the starting token - for :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last - :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). - - To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training - <./t5.html#training>`__. If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, - :obj:`decoder_input_ids` takes the value of :obj:`input_ids`. - decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): - Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will - also be used by default. - encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): - Tuple consists of (:obj:`last_hidden_state`, :obj:`optional`: `hidden_states`, :obj:`optional`: - `attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a - sequence of hidden states at the output of the last layer of the encoder. Used in the cross-attention of - the decoder. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. 
- This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded - representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` - have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert - :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - - If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` - takes the value of :obj:`inputs_embeds`. - - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). - - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. -""" - -T5_ENCODER_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you - should be able to pad the inputs on both the right and the left. - - Indices can be obtained using :class:`~transformers.T5Tokenizer`. See - :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for - detail. - - To know more on how to prepare :obj:`input_ids` for pretraining take a look a `T5 Training - <./t5.html#training>`__. - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. 
See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", - T5_START_DOCSTRING, -) -class T5Model(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder\.embed_tokens\.weight", - r"decoder\.embed_tokens\.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight", - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - self.init_weights() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None - else device_map - ) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to("cpu") - self.decoder = self.decoder.to("cpu") - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - past_key_values=None, - head_mask=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Returns: - - Example:: - - >>> from transformers import T5Tokenizer, T5Model - - >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') - >>> model = T5Model.from_pretrained('t5-small') - - >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 - >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) - - >>> last_hidden_states = outputs.last_hidden_state - """ - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): - encoder_outputs = BaseModelOutput( - last_hidden_state=encoder_outputs[0], - hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, - attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, - ) - - hidden_states = encoder_outputs[0] - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - if not return_dict: - return decoder_outputs + encoder_outputs - - return Seq2SeqModelOutput( - last_hidden_state=decoder_outputs.last_hidden_state, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - 
encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - -@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING) -class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder\.embed_tokens\.weight", - r"decoder\.embed_tokens\.weight", - r"lm_head\.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight", - ] - - def __init__(self, config): - super().__init__(config) - self.model_dim = config.d_model - - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - decoder_config = copy.deepcopy(config) - decoder_config.is_decoder = True - decoder_config.is_encoder_decoder = False - decoder_config.num_layers = config.num_decoder_layers - self.decoder = T5Stack(decoder_config, self.shared) - - self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) - - self.init_weights() - - # Model parallel - self.model_parallel = False - self.device_map = None - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None - else device_map - ) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.decoder.parallelize(self.device_map) - self.lm_head = self.lm_head.to(self.decoder.first_device) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.decoder.deparallelize() - self.encoder = self.encoder.to("cpu") - self.decoder = self.decoder.to("cpu") - self.lm_head = self.lm_head.to("cpu") - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - self.decoder.set_input_embeddings(new_embeddings) - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def get_output_embeddings(self): - return self.lm_head - - def get_encoder(self): - return self.encoder - - def get_decoder(self): - return self.decoder - - @add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - past_key_values=None, - head_mask=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ..., - config.vocab_size - 1]`. 
All labels set to ``-100`` are ignored (masked), the loss is only computed for
-            labels in ``[0, ..., config.vocab_size]``
-
-        Returns:
-
-        Examples::
-
-            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
-
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
-
-            >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-            >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-            >>> outputs = model(input_ids=input_ids, labels=labels)
-            >>> loss = outputs.loss
-            >>> logits = outputs.logits
-
-            >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you ", return_tensors="pt").input_ids  # Batch size 1
-            >>> outputs = model.generate(input_ids)
-        """
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # Encode if needed (training, first prediction pass)
-        if encoder_outputs is None:
-            # Convert encoder inputs in embeddings if needed
-            encoder_outputs = self.encoder(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                inputs_embeds=inputs_embeds,
-                head_mask=head_mask,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
-            encoder_outputs = BaseModelOutput(
-                last_hidden_state=encoder_outputs[0],
-                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
-                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
-            )
-
-        hidden_states = encoder_outputs[0]
-
-        if self.model_parallel:
-            torch.cuda.set_device(self.decoder.first_device)
-
-        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
-            # get decoder inputs from shifting lm labels to the right
-            decoder_input_ids = self._shift_right(labels)
-
-        # If decoding with past key value states, only the last tokens
-        # should be given as an input
-        if past_key_values is not None:
-            assert labels is None, "Decoder should not use cached key value states when training."
- if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids[:, -1:] - if decoder_inputs_embeds is not None: - decoder_inputs_embeds = decoder_inputs_embeds[:, -1:] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.decoder.first_device) - hidden_states = hidden_states.to(self.decoder.first_device) - if decoder_input_ids is not None: - decoder_input_ids = decoder_input_ids.to(self.decoder.first_device) - if attention_mask is not None: - attention_mask = attention_mask.to(self.decoder.first_device) - if decoder_attention_mask is not None: - decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device) - - # Decode - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_values=past_key_values, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = decoder_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.encoder.first_device) - self.lm_head = self.lm_head.to(self.encoder.first_device) - sequence_output = sequence_output.to(self.lm_head.weight.device) - - if self.config.tie_word_embeddings: - # Rescale output before projecting on vocab - # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim ** -0.5) - - lm_logits = self.lm_head(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) - # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 - - if not return_dict: - output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs - return ((loss,) + output) if loss is not None else output - - return Seq2SeqLMOutput( - loss=loss, - logits=lm_logits, - past_key_values=decoder_outputs.past_key_values, - decoder_hidden_states=decoder_outputs.hidden_states, - decoder_attentions=decoder_outputs.attentions, - cross_attentions=decoder_outputs.cross_attentions, - encoder_last_hidden_state=encoder_outputs.last_hidden_state, - encoder_hidden_states=encoder_outputs.hidden_states, - encoder_attentions=encoder_outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs - ): - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return { - "decoder_input_ids": input_ids, - "past_key_values": past, - "encoder_outputs": encoder_outputs, - "attention_mask": attention_mask, - "use_cache": use_cache, - } - - def _reorder_cache(self, past, beam_idx): - # if decoder past is not included in output - # speedy decoding is disabled and no need to reorder - if past is None: - logger.warning("You might want to consider setting `use_cache=True` to speed up decoding") - return past - - reordered_decoder_past = () - for layer_past_states in past: - # get the correct batch idx from layer past batch dim - # batch dim of `past` is at 2nd position - reordered_layer_past_states = () - for layer_past_state in 
layer_past_states: - # need to set correct `past` for each of the four key / value states - reordered_layer_past_states = reordered_layer_past_states + ( - layer_past_state.index_select(0, beam_idx), - ) - - assert reordered_layer_past_states[0].shape == layer_past_states[0].shape - assert len(reordered_layer_past_states) == len(layer_past_states) - - reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,) - return reordered_decoder_past - - -@add_start_docstrings( - "The bare T5 Model transformer outputting encoder's raw hidden-states" "without any specific head on top.", - T5_START_DOCSTRING, -) -class T5EncoderModel(T5PreTrainedModel): - authorized_missing_keys = [ - r"encoder\.embed_tokens\.weight", - ] - - def __init__(self, config: T5Config): - super().__init__(config) - self.shared = nn.Embedding(config.vocab_size, config.d_model) - - encoder_config = copy.deepcopy(config) - encoder_config.use_cache = False - encoder_config.is_encoder_decoder = False - self.encoder = T5Stack(encoder_config, self.shared) - - self.init_weights() - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) - if device_map is None - else device_map - ) - assert_device_map(self.device_map, len(self.encoder.block)) - self.encoder.parallelize(self.device_map) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.encoder.deparallelize() - self.encoder = self.encoder.to("cpu") - self.model_parallel = False - self.device_map = None - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.shared - - def set_input_embeddings(self, new_embeddings): - self.shared = new_embeddings - self.encoder.set_input_embeddings(new_embeddings) - - def get_encoder(self): - return self.encoder - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Returns: - - Example:: - - >>> from transformers import T5Tokenizer, T5EncoderModel - >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') - >>> model = T5EncoderModel.from_pretrained('t5-small') - >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids) - >>> last_hidden_states = outputs.last_hidden_state - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_outputs = self.encoder( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - return encoder_outputs From 3cbfb7dd507ba0a55bbe0708bb21c8f3636446a9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 11:24:17 +0100 Subject: [PATCH 28/51] correct attn typo --- src/transformers/models/bart/modeling_bart.py | 7 ++++++- src/transformers/models/blenderbot/modeling_blenderbot.py | 5 ++++- .../models/blenderbot_small/modeling_blenderbot_small.py | 5 ++++- src/transformers/models/marian/modeling_marian.py | 5 ++++- src/transformers/models/mbart/modeling_mbart.py | 5 ++++- src/transformers/models/pegasus/modeling_pegasus.py | 5 ++++- 6 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index d3acf41cbab39c..510c13a3599416 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -1246,7 +1246,10 @@ def _force_token_id_to_be_generated(scores, token_id) -> None: def _reorder_cache(past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) return reordered_past @@ -1459,6 +1462,8 @@ def forward( end_logits=end_logits, past_key_values=outputs.past_key_values, decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, encoder_last_hidden_state=outputs.encoder_last_hidden_state, encoder_hidden_states=outputs.encoder_hidden_states, encoder_attentions=outputs.encoder_attentions, diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index a7bfc3a8714803..e2072b15b45ec9 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1191,5 +1191,8 @@ def _force_token_id_to_be_generated(scores, token_id) -> None: def _reorder_cache(past, 
beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) return reordered_past diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 8984f9c5649876..787c95d11298a5 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1189,5 +1189,8 @@ def _force_token_id_to_be_generated(scores, token_id) -> None: def _reorder_cache(past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) return reordered_past diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index b000e30aacdc38..e4bb38f9062c22 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1214,5 +1214,8 @@ def _force_token_id_to_be_generated(scores, token_id) -> None: def _reorder_cache(past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) return reordered_past diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index adc9cf456f7d35..9c5cc8a4564a74 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1247,7 +1247,10 @@ def _force_token_id_to_be_generated(scores, token_id) -> None: def _reorder_cache(past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) return reordered_past diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index bd72de47c15ad2..8e15e8386f39c9 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1234,7 +1234,10 @@ def _force_token_id_to_be_generated(scores, token_id) -> None: def _reorder_cache(past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) return reordered_past From 
b08d165c7b1b70e05dbcd2dd3fea4dc3b3a211ac Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 12:25:34 +0100 Subject: [PATCH 29/51] correct configs --- .../blenderbot/configuration_blenderbot.py | 30 +++++++++---------- .../configuration_blenderbot_small.py | 6 ++-- .../models/marian/configuration_marian.py | 6 ++-- .../models/pegasus/configuration_pegasus.py | 6 ++-- .../models/pegasus/modeling_pegasus.py | 13 ++++++-- 5 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 88753bf8e83def..acfb51b45c64e3 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -21,7 +21,7 @@ logger = logging.get_logger(__name__) BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/blenderbot-90M": "https://huggingface.co/facebook/blenderbot-90M/resolve/main/config.json", + "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/config.json", # See all Blenderbot models at https://huggingface.co/models?filter=blenderbot } @@ -31,7 +31,7 @@ class BlenderbotConfig(PretrainedConfig): This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotModel`. It is used to instantiate an Blenderbot model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Blenderbot - `facebook/blenderbot-90M `__ architecture. + `facebook/blenderbot-3B `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 
@@ -85,10 +85,10 @@ class BlenderbotConfig(PretrainedConfig): >>> from transformers import BlenderbotModel, BlenderbotConfig - >>> # Initializing a Blenderbot facebook/blenderbot-90M style configuration + >>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration >>> configuration = BlenderbotConfig() - >>> # Initializing a model from the facebook/blenderbot-90M style configuration + >>> # Initializing a model from the facebook/blenderbot-3B style configuration >>> model = BlenderbotModel(configuration) >>> # Accessing the model configuration @@ -98,20 +98,20 @@ class BlenderbotConfig(PretrainedConfig): def __init__( self, - vocab_size=50265, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, - encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, - decoder_attention_heads=16, + vocab_size=8008, + max_position_embeddings=128, + encoder_layers=2, + encoder_ffn_dim=10240, + encoder_attention_heads=32, + decoder_layers=24, + decoder_ffn_dim=10240, + decoder_attention_heads=32, encoder_layerdrop=0.0, decoder_layerdrop=0.0, use_cache=True, is_encoder_decoder=True, activation_function="gelu", - d_model=1024, + d_model=2560, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, @@ -120,8 +120,8 @@ def __init__( classifier_dropout=0.0, scale_embedding=False, gradient_checkpointing=False, - pad_token_id=1, - bos_token_id=0, + pad_token_id=0, + bos_token_id=1, eos_token_id=2, **kwargs ): diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index 8c10b6cf33a529..a496b23422fe66 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -116,12 +116,12 @@ def __init__( attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - decoder_start_token_id=2, + decoder_start_token_id=1, classifier_dropout=0.0, scale_embedding=False, gradient_checkpointing=False, - pad_token_id=1, - bos_token_id=0, + pad_token_id=0, + bos_token_id=1, eos_token_id=2, **kwargs ): diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index eb9272bfd4968a..8e8eab3a7af595 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -120,14 +120,12 @@ def __init__( classifier_dropout=0.0, scale_embedding=False, gradient_checkpointing=False, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, + pad_token_id=58100, + eos_token_id=0, **kwargs ): super().__init__( pad_token_id=pad_token_id, - bos_token_id=bos_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, decoder_start_token_id=decoder_start_token_id, diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 17f902b4aad39d..5b48a4be0bbbd2 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -120,14 +120,12 @@ def __init__( classifier_dropout=0.0, scale_embedding=False, gradient_checkpointing=False, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, + pad_token_id=0, + eos_token_id=1, **kwargs ): super().__init__( pad_token_id=pad_token_id, - bos_token_id=bos_token_id, eos_token_id=eos_token_id, is_encoder_decoder=is_encoder_decoder, 
decoder_start_token_id=decoder_start_token_id, diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 8e15e8386f39c9..8b31c5caa5ac84 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -545,8 +545,17 @@ def dummy_inputs(self): `What are attention masks? <../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the :obj:`input_ids` to the right, following the paper. + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + Pegasus uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. From 93b9944ace870579c581d13f00d6b5f834e26f44 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 11:51:58 +0000 Subject: [PATCH 30/51] remove pegasus for seq class --- docs/source/model_doc/pegasus.rst | 12 - src/transformers/__init__.py | 7 +- src/transformers/models/auto/modeling_auto.py | 9 +- src/transformers/models/pegasus/__init__.py | 2 - .../models/pegasus/modeling_pegasus.py | 217 ------------------ src/transformers/utils/dummy_pt_objects.py | 18 -- tests/test_modeling_pegasus.py | 46 +--- 7 files changed, 4 insertions(+), 307 deletions(-) diff --git a/docs/source/model_doc/pegasus.rst b/docs/source/model_doc/pegasus.rst index 47886bd97b0116..3fab320ebcbc5d 100644 --- a/docs/source/model_doc/pegasus.rst +++ b/docs/source/model_doc/pegasus.rst @@ -131,18 +131,6 @@ PegasusForConditionalGeneration .. autoclass:: transformers.PegasusForConditionalGeneration -PegasusForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.PegasusForQuestionAnswering - - -PegasusForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.PegasusForSequenceClassification - - TFPegasusForConditionalGeneration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index eedda621c6861c..97966cbeeefa3c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -575,12 +575,7 @@ OpenAIGPTPreTrainedModel, load_tf_weights_in_openai_gpt, ) - from .models.pegasus import ( - PegasusForConditionalGeneration, - PegasusForQuestionAnswering, - PegasusForSequenceClassification, - PegasusModel, - ) + from .models.pegasus import PegasusForConditionalGeneration, PegasusModel from .models.prophetnet import ( PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST, ProphetNetDecoder, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4396b0796f8619..2fad314fa45826 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -138,12 +138,7 @@ ) from ..mt5.modeling_mt5 import MT5ForConditionalGeneration, MT5Model from ..openai.modeling_openai import OpenAIGPTForSequenceClassification, OpenAIGPTLMHeadModel, OpenAIGPTModel -from ..pegasus.modeling_pegasus import ( - PegasusForConditionalGeneration, - PegasusForQuestionAnswering, - PegasusForSequenceClassification, - PegasusModel, -) +from ..pegasus.modeling_pegasus import PegasusForConditionalGeneration, PegasusModel from ..prophetnet.modeling_prophetnet import ProphetNetForCausalLM, ProphetNetForConditionalGeneration, ProphetNetModel from ..rag.modeling_rag import ( # noqa: F401 - need to import all RagModels to be in globals() function RagModel, @@ -464,7 +459,6 @@ (TransfoXLConfig, TransfoXLForSequenceClassification), (MPNetConfig, MPNetForSequenceClassification), (TapasConfig, TapasForSequenceClassification), - (PegasusConfig, PegasusForSequenceClassification), ] ) @@ -490,7 +484,6 @@ (FunnelConfig, FunnelForQuestionAnswering), (LxmertConfig, LxmertForQuestionAnswering), (MPNetConfig, MPNetForQuestionAnswering), - (PegasusConfig, PegasusForQuestionAnswering), ] ) diff --git a/src/transformers/models/pegasus/__init__.py b/src/transformers/models/pegasus/__init__.py index c8efb9d64b1322..d687203666879e 100644 --- a/src/transformers/models/pegasus/__init__.py +++ b/src/transformers/models/pegasus/__init__.py @@ -29,8 +29,6 @@ from .modeling_pegasus import ( PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST, PegasusForConditionalGeneration, - PegasusForQuestionAnswering, - PegasusForSequenceClassification, PegasusModel, PegasusPreTrainedModel, ) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 8b31c5caa5ac84..4e3bccbd8fa3c9 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1248,220 +1248,3 @@ def _reorder_cache(past, beam_idx): tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], ) return reordered_past - - -@add_start_docstrings( - """ - Pegasus model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE - tasks. 
- """, - PEGASUS_START_DOCSTRING, -) -class PegasusForSequenceClassification(PegasusPreTrainedModel): - def __init__(self, config: PegasusConfig, **kwargs): - super().__init__(config, **kwargs) - self.model = PegasusModel(config) - self.classification_head = PegasusClassificationHead( - config.d_model, - config.d_model, - config.num_labels, - config.classifier_dropout, - ) - self.model._init_weights(self.classification_head.dense) - self.model._init_weights(self.classification_head.out_proj) - - @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="google/pegasus-large", - output_type=Seq2SeqSequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., - config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if labels is not None: - use_cache = False - - if input_ids is None and inputs_embeds is not None: - raise NotImplementedError( - f"Passing input embeddings is currently not supported for {self.__class__.__name__}" - ) - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = outputs[0] # last hidden state - - eos_mask = input_ids.eq(self.config.eos_token_id) - - if len(torch.unique(eos_mask.sum(1))) > 1: - raise ValueError("All examples must have the same number of tokens.") - sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ - :, -1, : - ] - logits = self.classification_head(sentence_representation) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - - return Seq2SeqSequenceClassifierOutput( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) - - -@add_start_docstrings( - """ - PEGASUS Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
- """, - PEGASUS_START_DOCSTRING, -) -class PegasusForQuestionAnswering(PegasusPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - config.num_labels = 2 - self.num_labels = config.num_labels - - self.model = PegasusModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.model._init_weights(self.qa_outputs) - - @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="google/pegasus-large", - output_type=Seq2SeqQuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - decoder_input_ids=None, - decoder_attention_mask=None, - encoder_outputs=None, - start_positions=None, - end_positions=None, - inputs_embeds=None, - decoder_inputs_embeds=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if start_positions is not None and end_positions is not None: - use_cache = False - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - encoder_outputs=encoder_outputs, - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = ( - start_logits, - end_logits, - ) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - - return Seq2SeqQuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - past_key_values=outputs.past_key_values, - 
decoder_hidden_states=outputs.decoder_hidden_states, - decoder_attentions=outputs.decoder_attentions, - cross_attentions=outputs.cross_attentions, - encoder_last_hidden_state=outputs.encoder_last_hidden_state, - encoder_hidden_states=outputs.encoder_hidden_states, - encoder_attentions=outputs.encoder_attentions, - ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 0f1bc511080711..9f4df67707cae6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1626,24 +1626,6 @@ def from_pretrained(self, *args, **kwargs): requires_pytorch(self) -class PegasusForQuestionAnswering: - def __init__(self, *args, **kwargs): - requires_pytorch(self) - - @classmethod - def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) - - -class PegasusForSequenceClassification: - def __init__(self, *args, **kwargs): - requires_pytorch(self) - - @classmethod - def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) - - class PegasusModel: def __init__(self, *args, **kwargs): requires_pytorch(self) diff --git a/tests/test_modeling_pegasus.py b/tests/test_modeling_pegasus.py index 7f152fa1e84404..74047f7750aa13 100644 --- a/tests/test_modeling_pegasus.py +++ b/tests/test_modeling_pegasus.py @@ -15,7 +15,6 @@ """ Testing suite for the PyTorch PEGASUS model. """ -import copy import tempfile import unittest @@ -34,14 +33,7 @@ if is_torch_available(): import torch - from transformers import ( - AutoModelForSeq2SeqLM, - PegasusConfig, - PegasusForConditionalGeneration, - PegasusForQuestionAnswering, - PegasusForSequenceClassification, - PegasusModel, - ) + from transformers import AutoModelForSeq2SeqLM, PegasusConfig, PegasusForConditionalGeneration, PegasusModel from transformers.models.pegasus.modeling_pegasus import PegasusDecoder, PegasusEncoder @@ -204,11 +196,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict): @require_torch class PegasusModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( - (PegasusModel, PegasusForConditionalGeneration, PegasusForSequenceClassification, PegasusForQuestionAnswering) - if is_torch_available() - else () - ) + all_model_classes = (PegasusModel, PegasusForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (PegasusForConditionalGeneration,) if is_torch_available() else () is_encoder_decoder = True test_pruning = False @@ -240,36 +228,6 @@ def test_encoder_decoder_model_standalone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - # PegasusForSequenceClassification does not support inputs_embeds - def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in (PegasusModel, PegasusForConditionalGeneration, PegasusForQuestionAnswering): - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - del inputs["input_ids"] - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - inputs["inputs_embeds"] = wte(input_ids) - 
else: - inputs["inputs_embeds"] = wte(encoder_input_ids) - inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) - - with torch.no_grad(): - model(**inputs)[0] - def test_generate_fp16(self): config, input_dict = self.model_tester.prepare_config_and_inputs() input_ids = input_dict["input_ids"] From 04172c15dae1148b7b9080d5f6401f52a1f8d98b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 12:52:33 +0100 Subject: [PATCH 31/51] correct peg docs --- docs/source/model_doc/pegasus.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/model_doc/pegasus.rst b/docs/source/model_doc/pegasus.rst index 47886bd97b0116..b1ab0cc2836ca1 100644 --- a/docs/source/model_doc/pegasus.rst +++ b/docs/source/model_doc/pegasus.rst @@ -66,7 +66,6 @@ Implementation Notes - Some key configuration differences: - static, sinusoidal position embeddings - - no :obj:`layernorm_embedding` (:obj:`PegasusConfig.normalize_embedding=False`) - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. - more beams are used (:obj:`num_beams=8`) - All pretrained pegasus checkpoints are the same besides three attributes: :obj:`tokenizer.model_max_length` (maximum @@ -123,24 +122,28 @@ PegasusModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.PegasusModel + :members: forward PegasusForConditionalGeneration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.PegasusForConditionalGeneration + :members: forward PegasusForQuestionAnswering ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.PegasusForQuestionAnswering + :members: forward PegasusForSequenceClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autoclass:: transformers.PegasusForSequenceClassification + :members: forward TFPegasusForConditionalGeneration From ada1cd2dd344f96909e649dc5504aae782f1c202 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 12:52:43 +0100 Subject: [PATCH 32/51] correct peg docs --- .../models/pegasus/configuration_pegasus.py | 2 +- .../models/pegasus/modeling_pegasus.py | 53 ++++++++++--------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 5b48a4be0bbbd2..fda6907a37361e 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -81,7 +81,7 @@ class PegasusConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) - Example:: + Example:: >>> from transformers import PegasusModel, PegasusConfig diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 8b31c5caa5ac84..2666bbfa0f444f 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -513,16 +513,20 @@ def dummy_inputs(self): PEGASUS_GENERATION_EXAMPLE = r""" Summarization example:: - >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration, PegasusConfig + >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration - >>> model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large') - >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large') + >>> model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum') + >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum') - >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> ARTICLE_TO_SUMMARIZE = ( + ... "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + ... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + ... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + ... 
) >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') >>> # Generate Summary - >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> summary_ids = model.generate(inputs['input_ids']) >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) """ @@ -1004,12 +1008,7 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="google/pegasus-large", - output_type=Seq2SeqModelOutput, - config_class=_CONFIG_FOR_DOC, - ) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -1025,6 +1024,23 @@ def forward( output_hidden_states=None, return_dict=None, ): + r""" + Returns: + + Example:: + + >>> from transformers import PegasusTokenizer, PegasusModel + + >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large") + >>> model = PegasusModel.from_pretrained("google/pegasus-large") + + >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1152,21 +1168,6 @@ def forward( Returns: - Conditional generation example:: - - >>> from transformers import PegasusTokenizer, PegasusForConditionalGeneration - >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large') - >>> TXT = "My friends are but they eat too many carbs." 
- - >>> model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict From 0171fcd70ffe4a319472ce6bf22285faee3ae747 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:04:10 +0100 Subject: [PATCH 33/51] finish configs --- docs/source/model_doc/blenderbot.rst | 33 ++----- docs/source/model_doc/blenderbot_small.rst | 2 +- .../models/bart/configuration_bart.py | 2 +- .../blenderbot/configuration_blenderbot.py | 2 +- .../models/blenderbot/modeling_blenderbot.py | 90 ++++++++++-------- .../configuration_blenderbot_small.py | 2 +- .../modeling_blenderbot_small.py | 91 +++++++++++-------- .../models/mbart/configuration_mbart.py | 2 +- 8 files changed, 118 insertions(+), 106 deletions(-) diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst index c14c004e1dbe87..c8c22bc6c3d938 100644 --- a/docs/source/model_doc/blenderbot.rst +++ b/docs/source/model_doc/blenderbot.rst @@ -43,13 +43,9 @@ Implementation Notes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Blenderbot uses a standard `seq2seq model transformer `__ based architecture. -- It inherits completely from :class:`~transformers.BartForConditionalGeneration` -- Even though blenderbot is one model, it uses two tokenizers :class:`~transformers.BlenderbotSmallTokenizer` for 90M - checkpoint and :class:`~transformers.BlenderbotTokenizer` for all other checkpoints. -- :class:`~transformers.BlenderbotSmallTokenizer` will always return :class:`~transformers.BlenderbotSmallTokenizer`, - regardless of checkpoint. To use the 3B parameter checkpoint, you must call - :class:`~transformers.BlenderbotTokenizer` directly. - Available checkpoints can be found in the `model hub `__. +- Note that Blenderbot has a different architecture than BlenderbotSmall. The only BlenderbotSmall checkpoint is + ``facebook/blenderbot-90M``. Usage @@ -59,26 +55,15 @@ Here is an example of model usage: .. code-block:: - >>> from transformers import BlenderbotSmallTokenizer, BlenderbotForConditionalGeneration - >>> mname = 'facebook/blenderbot-90M' + >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration + >>> mname = 'facebook/blenderbot-400M-distill' >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname) - >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname) + >>> tokenizer = BlenderbotTokenizer.from_pretrained(mname) >>> UTTERANCE = "My friends are cool but they eat too many carbs." >>> inputs = tokenizer([UTTERANCE], return_tensors='pt') >>> reply_ids = model.generate(**inputs) - >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in reply_ids]) - - -Here is how you can check out config values: - -.. code-block:: - - - >>> from transformers import BlenderbotConfig - >>> config_90 = BlenderbotConfig.from_pretrained("facebook/blenderbot-90M") - >>> config_90.to_diff_dict() # show interesting Values. 
- >>> configuration_3B = BlenderbotConfig("facebook/blenderbot-3B") - >>> configuration_3B.to_diff_dict() + >>> print(tokenizer.batch_decode(reply_ids)) + >>> # should return [" That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?"] BlenderbotConfig @@ -100,7 +85,7 @@ BlenderbotModel See :obj:`transformers.BartModel` for arguments to `forward` and `generate` .. autoclass:: transformers.BlenderbotModel - :members: + :members: forward BlenderbotForConditionalGeneration @@ -109,7 +94,7 @@ BlenderbotForConditionalGeneration See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward` and `generate` .. autoclass:: transformers.BlenderbotForConditionalGeneration - :members: + :members: forward TFBlenderbotForConditionalGeneration diff --git a/docs/source/model_doc/blenderbot_small.rst b/docs/source/model_doc/blenderbot_small.rst index 2802af544fbc17..f3a21c04bded60 100644 --- a/docs/source/model_doc/blenderbot_small.rst +++ b/docs/source/model_doc/blenderbot_small.rst @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -BLENDERBOT_SMALL +Blenderbot Small ----------------------------------------------------------------------------------------------------------------------- Note that :class:`~transformers.BlenderbotSmallModel` and diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 0112ea92db0698..19ca8923731771 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -83,7 +83,7 @@ class BartConfig(PretrainedConfig): num_labels: (:obj:`int`, `optional`, defaults to 3): The number of labels to use in :class:`~transformers.BartForSequenceClassification`. 
- Example:: + Example:: >>> from transformers import BartModel, BartConfig diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index acfb51b45c64e3..eacde6d666b8bd 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -81,7 +81,7 @@ class BlenderbotConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) - Example:: + Example:: >>> from transformers import BlenderbotModel, BlenderbotConfig diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index e2072b15b45ec9..ed93296f77c4d7 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -26,7 +26,6 @@ from ...activations import ACT2FN from ...file_utils import ( - add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -50,7 +49,7 @@ BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/blenderbot-90M", + "facebook/blenderbot-3B", # See all Blenderbot models at https://huggingface.co/models?filter=blenderbot ] @@ -467,19 +466,28 @@ def dummy_inputs(self): """ BLENDERBOT_GENERATION_EXAMPLE = r""" - Summarization example:: - - >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration, BlenderbotConfig - - >>> model = BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') - >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-90M') - - >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." - >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') - - >>> # Generate Summary - >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) - >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + Conversation example:: + + >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration + >>> mname = 'facebook/blenderbot-400M-distill' + >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname) + >>> tokenizer = BlenderbotTokenizer.from_pretrained(mname) + >>> UTTERANCE = "My friends are cool but they eat too many carbs." + >>> print("Human: ", UTTERANCE) + >>> inputs = tokenizer([UTTERANCE], return_tensors='pt') + >>> reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]) + + >>> REPLY = "I'm not sure" + >>> print("Human: ", REPLY) + >>> NEXT_UTTERANCE = ( + ... "My friends are cool but they eat too many carbs. That's unfortunate. " + ... "Are they trying to lose weight or are they just trying to be healthier? " + ... " I'm not sure." + ... ) + >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors='pt') + >>> next_reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0]) """ BLENDERBOT_INPUTS_DOCSTRING = r""" @@ -501,8 +509,17 @@ def dummy_inputs(self): `What are attention masks? 
<../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the :obj:`input_ids` to the right, following the paper. + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + Blenderbot uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. @@ -953,12 +970,7 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/blenderbot-90M", - output_type=Seq2SeqModelOutput, - config_class=_CONFIG_FOR_DOC, - ) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -974,6 +986,22 @@ def forward( output_hidden_states=None, return_dict=None, ): + r""" + Returns: + + Example:: + + >>> from transformers import BlenderbotTokenizer, BlenderbotModel + + >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill") + >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + + >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1099,22 +1127,6 @@ def forward( (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. Returns: - - Conditional generation example:: - - >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration - >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-90M') - >>> TXT = "My friends are but they eat too many carbs." 
- - >>> model = BlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index a496b23422fe66..a7d6db8190eb15 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -81,7 +81,7 @@ class BlenderbotSmallConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) - Example:: + Example:: >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 787c95d11298a5..d702defb5685b6 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -26,7 +26,6 @@ from ...activations import ACT2FN from ...file_utils import ( - add_code_sample_docstrings, add_end_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -467,19 +466,31 @@ def dummy_inputs(self): """ BLENDERBOT_SMALL_GENERATION_EXAMPLE = r""" - Summarization example:: - - >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration, BlenderbotSmallConfig - - >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') - >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot-90M') - - >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." - >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') - - >>> # Generate Summary - >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) - >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + Conversation example:: + + >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration + >>> mname = 'facebook/blenderbot-90M' + >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname) + >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname) + >>> UTTERANCE = "My friends are cool but they eat too many carbs." + >>> print("Human: ", UTTERANCE) + >>> inputs = tokenizer([UTTERANCE], return_tensors='pt') + >>> inputs.pop("token_type_ids") + >>> reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]) + what kind of carbs do they eat? i don't know much about carbs. + + >>> REPLY = "I'm not sure" + >>> print("Human: ", REPLY) + >>> NEXT_UTTERANCE = ( + ... "My friends are cool but they eat too many carbs. " + ... "what kind of carbs do they eat? i don't know much about carbs. " + ... "I'm not sure." + ... 
) + >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors='pt') + >>> inputs.pop("token_type_ids") + >>> next_reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0]) """ BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" @@ -501,8 +512,17 @@ def dummy_inputs(self): `What are attention masks? <../glossary.html#attention-mask>`__ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): - Provide for translation and summarization training. By default, the model will create this tensor by - shifting the :obj:`input_ids` to the right, following the paper. + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + + BlenderbotSmall uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation. + If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. @@ -950,12 +970,7 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="facebook/blenderbot-90M", - output_type=Seq2SeqModelOutput, - config_class=_CONFIG_FOR_DOC, - ) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -971,6 +986,22 @@ def forward( output_hidden_states=None, return_dict=None, ): + r""" + Returns: + + Example:: + + >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel + + >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot-90M") + >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") + + >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1097,22 +1128,6 @@ def forward( (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. Returns: - - Conditional generation example:: - - >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration - >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot-90M') - >>> TXT = "My friends are but they eat too many carbs." 
- - >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot-90M') - >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - >>> logits = model(input_ids).logits - - >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() - >>> probs = logits[0, masked_index].softmax(dim=0) - >>> values, predictions = probs.topk(5) - - >>> tokenizer.decode(predictions).split() """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index adf47f31f825a3..e3f56546d55e89 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -81,7 +81,7 @@ class MBartConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) - Example:: + Example:: >>> from transformers import MBartModel, MBartConfig From b6058cef3b34b5921939dd7553702f1a5b392e8b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 15:01:23 +0100 Subject: [PATCH 34/51] further improve docs --- .../models/bart/configuration_bart.py | 15 +++++++++++---- .../models/blenderbot/configuration_blenderbot.py | 4 ++++ .../configuration_blenderbot_small.py | 4 ++++ .../models/marian/configuration_marian.py | 4 ++++ .../models/mbart/configuration_mbart.py | 4 ++++ .../models/pegasus/configuration_pegasus.py | 4 ++++ 6 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 19ca8923731771..483ca5afca1033 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -72,12 +72,19 @@ class BartConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only + :obj:`True` for `bart-large-cnn`. encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). 
num_labels: (:obj:`int`, `optional`, defaults to 3): @@ -110,23 +117,23 @@ def __init__( decoder_attention_heads=16, encoder_layerdrop=0.0, decoder_layerdrop=0.0, - use_cache=True, - is_encoder_decoder=True, activation_function="gelu", d_model=1024, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - decoder_start_token_id=2, classifier_dropout=0.0, scale_embedding=False, gradient_checkpointing=False, force_bos_token_to_be_generated=False, + use_cache=True, + num_labels=3, pad_token_id=1, bos_token_id=0, eos_token_id=2, - num_labels=3, + is_encoder_decoder=True, + decoder_start_token_id=2, **kwargs ): super().__init__( diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index eacde6d666b8bd..1325419c3266a5 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -78,6 +78,10 @@ class BlenderbotConfig(PretrainedConfig): decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index a7d6db8190eb15..ccac6d22183150 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -78,6 +78,10 @@ class BlenderbotSmallConfig(PretrainedConfig): decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 8e8eab3a7af595..e50bfb0ab11729 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -78,6 +78,10 @@ class MarianConfig(PretrainedConfig): decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). 
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index e3f56546d55e89..69cf8f909e580e 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -78,6 +78,10 @@ class MBartConfig(PretrainedConfig): decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index fda6907a37361e..c72a7af1eaea8e 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -78,6 +78,10 @@ class PegasusConfig(PretrainedConfig): decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models) From fcc944c2c7b2ae19cdd13fc16e62da64c291f4e8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:11:15 +0000 Subject: [PATCH 35/51] add copied from statements to mbart --- src/transformers/models/mbart/modeling_mbart.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 9c5cc8a4564a74..374119680521f9 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -76,6 +76,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int): return prev_output_tokens +# Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. @@ -91,6 +92,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) +# Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
@@ -105,6 +107,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) +# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->MBart def MBartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): if torch.cuda.is_available(): try: @@ -116,6 +119,7 @@ def MBartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_ return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) +# Copied from transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding with Bart->MBart class MBartLearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. @@ -138,6 +142,7 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): return super().forward(positions + self.offset) +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->MBart class MBartAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -433,6 +438,7 @@ def forward( return outputs +# Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->MBart class MBartClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" @@ -457,6 +463,7 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states +# Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel with Bart->MBart class MBartPreTrainedModel(PreTrainedModel): config_class = MBartConfig base_model_prefix = "model" @@ -1281,6 +1288,7 @@ def __init__(self, config: MBartConfig, **kwargs): output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) + # Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward def forward( self, input_ids=None, @@ -1382,6 +1390,7 @@ def __init__(self, config): output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) + # Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward def forward( self, input_ids=None, From 6f54cc0a53cd0e95e53ae7880058fea2183ee052 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:12:54 +0000 Subject: [PATCH 36/51] fix copied from in mbart --- src/transformers/models/mbart/modeling_mbart.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 374119680521f9..092a4dd814ce44 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -109,13 +109,12 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] # Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->MBart def MBartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - if torch.cuda.is_available(): - try: - from apex.normalization import FusedLayerNorm + try: + from apex.normalization import FusedLayerNorm - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) @@ -128,7 +127,7 @@ class MBartLearnedPositionalEmbedding(nn.Embedding): def __init__(self, num_embeddings: int, 
embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" num_embeddings - # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2 # and adjust num_embeddings appropriately. Other models dont have this hack self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim, padding_idx=padding_idx) @@ -463,7 +462,6 @@ def forward(self, hidden_states: torch.Tensor): return hidden_states -# Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel with Bart->MBart class MBartPreTrainedModel(PreTrainedModel): config_class = MBartConfig base_model_prefix = "model" From 7b11e3314af3b4d2ff10a93c34e622764306f281 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:22:09 +0000 Subject: [PATCH 37/51] add copy statements to marian --- src/transformers/models/marian/modeling_marian.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index e4bb38f9062c22..88348c8216b1fd 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -55,6 +55,7 @@ ] +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): """ Shift input ids one token to the right. @@ -70,6 +71,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start return shifted_input_ids +# Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. @@ -85,6 +87,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) +# Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
@@ -99,6 +102,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) +# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Marian def MarianLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): if torch.cuda.is_available(): try: @@ -144,6 +148,7 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): return super().forward(positions) +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Marian class MarianAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -278,6 +283,7 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->Marian class MarianEncoderLayer(nn.Module): def __init__(self, config: MarianConfig): super().__init__() @@ -332,6 +338,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out return outputs +# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->Marian class MarianDecoderLayer(nn.Module): def __init__(self, config: MarianConfig): super().__init__() @@ -463,6 +470,7 @@ def dummy_inputs(self): dummy_inputs = { "attention_mask": input_ids.ne(pad_token), "input_ids": input_ids, + "decoder_input_ids": input_ids, } return dummy_inputs @@ -611,6 +619,7 @@ def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = self.layers = nn.ModuleList([MarianEncoderLayer(config) for _ in range(config.encoder_layers)]) self.init_weights() + # Copied from transformers.models.bart.modeling_bart.BartEncoder.forward def forward( self, input_ids=None, @@ -752,6 +761,7 @@ def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = self.layers = nn.ModuleList([MarianDecoderLayer(config) for _ in range(config.decoder_layers)]) self.init_weights() + # Copied from transformers.models.bart.modeling_bart.BartDecoder.forward def forward( self, input_ids=None, From 36a5c2698badde35e7c10448e603654f0e3ad5a1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:29:53 +0000 Subject: [PATCH 38/51] add copied from to marian --- src/transformers/models/marian/modeling_marian.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 88348c8216b1fd..5fbdf2e9708ad3 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -104,13 +104,12 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] # Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Marian def MarianLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - if torch.cuda.is_available(): - try: - from apex.normalization import FusedLayerNorm + try: + from apex.normalization import FusedLayerNorm - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) @@ -619,7 +618,6 @@ def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = self.layers = 
nn.ModuleList([MarianEncoderLayer(config) for _ in range(config.encoder_layers)]) self.init_weights() - # Copied from transformers.models.bart.modeling_bart.BartEncoder.forward def forward( self, input_ids=None, @@ -761,7 +759,6 @@ def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = self.layers = nn.ModuleList([MarianDecoderLayer(config) for _ in range(config.decoder_layers)]) self.init_weights() - # Copied from transformers.models.bart.modeling_bart.BartDecoder.forward def forward( self, input_ids=None, From b0762ca177f8b7416da39ac2c4059595f01bd88f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:35:26 +0000 Subject: [PATCH 39/51] add pegasus copied from --- .../models/pegasus/modeling_pegasus.py | 33 +++++-------------- 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 23e13fde6cce05..40596241d50e6d 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -55,6 +55,7 @@ ] +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): """ Shift input ids one token to the right. @@ -70,6 +71,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start return shifted_input_ids +# Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. @@ -85,6 +87,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) +# Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
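# Illustrative sketch of `_make_causal_mask`, whose body is mostly elided in the hunks above: it
# builds an additive mask whose upper triangle holds a very large negative value so each position
# can only attend to itself and earlier positions, while cached (past) positions stay fully
# visible. Only the final `return` line appears in the diff; the rest is a hedged reconstruction.
import torch


def make_causal_mask_sketch(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, dtype=dtype)
    mask_cond = torch.arange(tgt_len)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(tgt_len, 1), 0.0)  # keep current and past positions
    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


print(make_causal_mask_sketch(torch.Size([1, 3]), torch.float32)[0, 0])
# row i is 0 up to column i and a very large negative value afterwards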
@@ -99,6 +102,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) +# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Pegasus def PegasusLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): if torch.cuda.is_available(): try: @@ -110,6 +114,7 @@ def PegasusLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwis return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) +# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->Pegasus class PegasusSinusoidalPositionalEmbedding(nn.Embedding): """This module produces sinusoidal positional embeddings of any length.""" @@ -144,6 +149,7 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): return super().forward(positions) +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Pegasus class PegasusAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -278,6 +284,7 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Pegasus class PegasusEncoderLayer(nn.Module): def __init__(self, config: PegasusConfig): super().__init__() @@ -332,6 +339,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out return outputs +# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Pegasus class PegasusDecoderLayer(nn.Module): def __init__(self, config: PegasusConfig): super().__init__() @@ -439,30 +447,6 @@ def forward( return outputs -class PegasusClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim: int, - inner_dim: int, - num_classes: int, - pooler_dropout: float, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - def forward(self, hidden_states: torch.Tensor): - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = torch.tanh(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.out_proj(hidden_states) - return hidden_states - - class PegasusPreTrainedModel(PreTrainedModel): config_class = PegasusConfig base_model_prefix = "model" @@ -487,6 +471,7 @@ def dummy_inputs(self): dummy_inputs = { "attention_mask": input_ids.ne(pad_token), "input_ids": input_ids, + "decoder_input_ids": input_ids, } return dummy_inputs From 7b307b66f44f81477863c4fcd2e0f03e7e97a39f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:36:18 +0000 Subject: [PATCH 40/51] finish pegasus --- src/transformers/models/pegasus/modeling_pegasus.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 40596241d50e6d..ffe32d1391c3e7 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -104,13 +104,12 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] # Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Pegasus def 
PegasusLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - if torch.cuda.is_available(): - try: - from apex.normalization import FusedLayerNorm + try: + from apex.normalization import FusedLayerNorm - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) From d47b5cd14e107d5c72ee85bbf7ee188c7a61c53f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 14:45:04 +0000 Subject: [PATCH 41/51] finish copied from --- .../models/blenderbot/modeling_blenderbot.py | 19 ++++++++++++------ .../modeling_blenderbot_small.py | 20 +++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index ed93296f77c4d7..1b5ae35cd372e0 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -54,6 +54,7 @@ ] +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): """ Shift input ids one token to the right. @@ -69,6 +70,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start return shifted_input_ids +# Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. @@ -84,6 +86,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) +# Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
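# Illustrative sketch of `_expand_mask`: it turns a `[bsz, seq_len]` padding mask of 0s/1s into the
# additive `[bsz, 1, tgt_len, src_len]` mask the attention layers consume. The body is mostly elided
# in the hunks above, so this reconstruction relies only on the docstring and the visible
# `return inverted_mask.masked_fill(...)` line.
from typing import Optional

import torch


def expand_mask_sketch(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len
    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted_mask = 1.0 - expanded_mask  # keep-positions become 0, padded positions become 1
    return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)


attention_mask = torch.tensor([[1, 1, 0]])
print(expand_mask_sketch(attention_mask, torch.float32)[0, 0])
# the last column holds a very large negative value, i.e. the padded token is never attended to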
@@ -98,14 +101,14 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) +# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Blenderbot def BlenderbotLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - if torch.cuda.is_available(): - try: - from apex.normalization import FusedLayerNorm + try: + from apex.normalization import FusedLayerNorm - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) @@ -128,6 +131,7 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): return super().forward(positions) +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Blenderbot class BlenderbotAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -262,6 +266,7 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Blenderbot class BlenderbotEncoderLayer(nn.Module): def __init__(self, config: BlenderbotConfig): super().__init__() @@ -316,6 +321,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out return outputs +# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Blenderbot class BlenderbotDecoderLayer(nn.Module): def __init__(self, config: BlenderbotConfig): super().__init__() @@ -445,6 +451,7 @@ def dummy_inputs(self): dummy_inputs = { "attention_mask": input_ids.ne(pad_token), "input_ids": input_ids, + "decoder_input_ids": input_ids, } return dummy_inputs diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index d702defb5685b6..fe493a3a85f2d7 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -54,6 +54,7 @@ ] +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): """ Shift input ids one token to the right. @@ -69,6 +70,7 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start return shifted_input_ids +# Copied from transformers.models.bart.modeling_bart._make_causal_mask def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): """ Make causal mask used for bi-directional self-attention. @@ -84,6 +86,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) +# Copied from transformers.models.bart.modeling_bart._expand_mask def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
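# The `# Copied from transformers.models...` markers added throughout these hunks are meant to be
# machine-checkable: the expected local source can be re-derived by applying the `Bart->Blenderbot`
# style rename to the referenced definition and comparing it with the copy. The helper below is a
# hypothetical, simplified illustration of that idea only; it is not the repository's actual
# consistency checker, and the naive case-by-case rename is an assumption of this sketch.
import importlib
import inspect


def check_copied_from(local_obj, reference_path: str, rename: str = "") -> bool:
    module_name, obj_name = reference_path.rsplit(".", 1)
    reference = getattr(importlib.import_module(module_name), obj_name)
    expected = inspect.getsource(reference)
    if rename:  # e.g. "Bart->Blenderbot"
        old, new = rename.split("->")
        for old_cased, new_cased in ((old, new), (old.upper(), new.upper()), (old.lower(), new.lower())):
            expected = expected.replace(old_cased, new_cased)
    return inspect.getsource(local_obj).strip() == expected.strip()


# e.g. check_copied_from(BlenderbotAttention, "transformers.models.bart.modeling_bart.BartAttention", "Bart->Blenderbot")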
@@ -98,17 +101,18 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) +# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->BlenderbotSmall def BlenderbotSmallLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - if torch.cuda.is_available(): - try: - from apex.normalization import FusedLayerNorm + try: + from apex.normalization import FusedLayerNorm - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) +# Copied from transformers.models.blenderbot.modeling_blenderbot.BlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. @@ -128,6 +132,7 @@ def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): return super().forward(positions) +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->BlenderbotSmall class BlenderbotSmallAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -262,6 +267,7 @@ def forward( return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->BlenderbotSmall class BlenderbotSmallEncoderLayer(nn.Module): def __init__(self, config: BlenderbotSmallConfig): super().__init__() @@ -316,6 +322,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out return outputs +# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->BlenderbotSmall class BlenderbotSmallDecoderLayer(nn.Module): def __init__(self, config: BlenderbotSmallConfig): super().__init__() @@ -445,6 +452,7 @@ def dummy_inputs(self): dummy_inputs = { "attention_mask": input_ids.ne(pad_token), "input_ids": input_ids, + "decoder_input_ids": input_ids, } return dummy_inputs From 9e9b66f6d1022b79769bd72ae5145efe43eb1c1e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 16:59:47 +0100 Subject: [PATCH 42/51] Apply suggestions from code review --- docs/source/model_doc/blenderbot.rst | 2 +- src/transformers/models/auto/configuration_auto.py | 4 ++-- .../models/blenderbot_small/configuration_blenderbot_small.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst index c8c22bc6c3d938..c43f8fb8f2816f 100644 --- a/docs/source/model_doc/blenderbot.rst +++ b/docs/source/model_doc/blenderbot.rst @@ -44,7 +44,7 @@ Implementation Notes - Blenderbot uses a standard `seq2seq model transformer `__ based architecture. - Available checkpoints can be found in the `model hub `__. -- Note that Blenderbot has a different architecture than BlenderbotSmall. The only BlenderbotSmall checkpoint is +- Note that :class:`~transformers.BlenderbotModel` has a different architecture than :class:`~transformers.BlenderbotModelSmall`. :class:`~transformers.BlenderbotSmall` should be used for the checkpoint ``facebook/blenderbot-90M``. 
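# Usage sketch for the documentation note above: the `facebook/blenderbot-90M` checkpoint uses the
# BlenderbotSmall architecture, so it should be loaded through the BlenderbotSmall classes. The
# checkpoint name is taken from the doc text; downloading it requires network access.
from transformers import BlenderbotSmallForConditionalGeneration, BlenderbotSmallTokenizer

model = BlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot-90M")
tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")

inputs = tokenizer(["My friends are cool but they eat too many carbs."], return_tensors="pt")
reply_ids = model.generate(**inputs)
print(tokenizer.batch_decode(reply_ids, skip_special_tokens=True))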
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 442a113a2e3149..46931c014274c9 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -110,7 +110,7 @@ CONFIG_MAPPING = OrderedDict( [ # Add configs here - ("blenderbot_small", BlenderbotSmallConfig), + ("blenderbot-small", BlenderbotSmallConfig), ("retribert", RetriBertConfig), ("mt5", MT5Config), ("t5", T5Config), @@ -156,7 +156,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here - ("blenderbot_small", "BlenderbotSmall"), + ("blenderbot-small", "BlenderbotSmall"), ("retribert", "RetriBERT"), ("t5", "T5"), ("mobilebert", "MobileBERT"), diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index ccac6d22183150..f8b30553f079d2 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -98,7 +98,7 @@ class BlenderbotSmallConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config """ - model_type = "blenderbot_small" + model_type = "blenderbot-small" def __init__( self, From 65fe5746abb022012f1d4b69688f69012cf42c29 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 4 Jan 2021 16:00:28 +0000 Subject: [PATCH 43/51] make style --- docs/source/model_doc/blenderbot.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst index c43f8fb8f2816f..29531b080eb43a 100644 --- a/docs/source/model_doc/blenderbot.rst +++ b/docs/source/model_doc/blenderbot.rst @@ -44,7 +44,8 @@ Implementation Notes - Blenderbot uses a standard `seq2seq model transformer `__ based architecture. - Available checkpoints can be found in the `model hub `__. -- Note that :class:`~transformers.BlenderbotModel` has a different architecture than :class:`~transformers.BlenderbotModelSmall`. :class:`~transformers.BlenderbotSmall` should be used for the checkpoint +- Note that :class:`~transformers.BlenderbotModel` has a different architecture than + :class:`~transformers.BlenderbotModelSmall`. :class:`~transformers.BlenderbotSmall` should be used for the checkpoint ``facebook/blenderbot-90M``. 
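# Small sketch of the invariant the "blenderbot_small" -> "blenderbot-small" rename above preserves:
# the config's `model_type` string and the key registered in the auto mappings have to agree,
# because AutoConfig resolves a checkpoint's `model_type` field against that key.
from transformers import BlenderbotSmallConfig

config = BlenderbotSmallConfig()
print(config.model_type)  # "blenderbot-small" — the same key now used in CONFIG_MAPPING and MODEL_NAMES_MAPPING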
From d3cbc5530824eb1cb8009dceabe2320e4f90b12d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 15:38:19 +0100 Subject: [PATCH 44/51] backward comp blenderbot --- .../models/blenderbot/modeling_blenderbot.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index ed93296f77c4d7..c4433417af5ca0 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -16,8 +16,10 @@ import math +import os import random -from typing import Optional, Tuple +import warnings +from typing import Optional, Tuple, Union import torch import torch.nn.functional as F @@ -39,6 +41,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import logging +from ..blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel from .configuration_blenderbot import BlenderbotConfig @@ -955,6 +958,17 @@ def __init__(self, config: BlenderbotConfig): self.init_weights() + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + if pretrained_model_name_or_path == "facebook/blenderbot-90M": + warnings.warn( + "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical checkpoint `facebook/small_blenderbot-90M` with `BlenderbotSmallModel.from_pretrained('facebook/small_blenderbot-90M')` instead.", + FutureWarning, + ) + return BlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path) + + return super(BlenderbotModel, cls).from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + def get_input_embeddings(self): return self.shared @@ -1075,6 +1089,19 @@ def __init__(self, config: BlenderbotConfig): self.init_weights() + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + if pretrained_model_name_or_path == "facebook/blenderbot-90M": + warnings.warn( + "The checkpoint `facebook/blenderbot-90M` is deprecated. 
In the future, please use the identical checkpoint `facebook/small_blenderbot-90M` with `BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')` instead.", + FutureWarning, + ) + return BlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name_or_path) + + return super(BlenderbotForConditionalGeneration, cls).from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + def get_encoder(self): return self.model.get_encoder() From e69ec9454175083f4965ecd001f52c4513a79810 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 19:32:15 +0000 Subject: [PATCH 45/51] apply lysandres and sylvains suggestions --- docs/source/model_doc/blenderbot.rst | 8 +-- docs/source/model_doc/blenderbot_small.rst | 4 -- .../seq2seq-distillation/distillation.py | 2 +- .../seq2seq-distillation/finetune.py | 2 +- .../seq2seq-distillation/utils.py | 2 +- .../models/bart/configuration_bart.py | 5 +- src/transformers/models/bart/modeling_bart.py | 43 +++++++--------- .../models/bart/modeling_tf_bart.py | 2 +- .../blenderbot/configuration_blenderbot.py | 3 +- .../models/blenderbot/modeling_blenderbot.py | 46 +++++++---------- .../blenderbot/tokenization_blenderbot.py | 9 ++-- .../configuration_blenderbot_small.py | 27 +++++----- .../modeling_blenderbot_small.py | 46 +++++++---------- .../tokenization_blenderbot_small.py | 9 ++-- .../tokenization_blenderbot_small_fast.py | 6 +-- .../modeling_encoder_decoder.py | 4 +- src/transformers/models/fsmt/modeling_fsmt.py | 2 +- .../models/marian/configuration_marian.py | 3 +- .../models/marian/modeling_marian.py | 39 ++++++--------- .../models/mbart/configuration_mbart.py | 3 +- .../models/mbart/modeling_mbart.py | 50 ++++++++----------- .../models/pegasus/configuration_pegasus.py | 3 +- .../models/pegasus/modeling_pegasus.py | 45 +++++++---------- .../models/prophetnet/modeling_prophetnet.py | 2 +- src/transformers/models/t5/modeling_t5.py | 2 +- src/transformers/models/t5/modeling_tf_t5.py | 2 +- ...on_{{cookiecutter.lowercase_modelname}}.py | 4 ++ ...tf_{{cookiecutter.lowercase_modelname}}.py | 2 +- ...ng_{{cookiecutter.lowercase_modelname}}.py | 42 ++++++---------- ...ng_{{cookiecutter.lowercase_modelname}}.py | 2 - tests/test_modeling_bart.py | 2 +- tests/test_modeling_blenderbot.py | 4 +- tests/test_modeling_blenderbot_small.py | 4 +- tests/test_modeling_marian.py | 4 +- tests/test_modeling_mbart.py | 4 +- tests/test_modeling_pegasus.py | 4 +- tests/test_tokenization_small_blenderbot.py | 2 +- 37 files changed, 184 insertions(+), 259 deletions(-) diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst index 29531b080eb43a..1f570088955e83 100644 --- a/docs/source/model_doc/blenderbot.rst +++ b/docs/source/model_doc/blenderbot.rst @@ -44,9 +44,9 @@ Implementation Notes - Blenderbot uses a standard `seq2seq model transformer `__ based architecture. - Available checkpoints can be found in the `model hub `__. -- Note that :class:`~transformers.BlenderbotModel` has a different architecture than - :class:`~transformers.BlenderbotModelSmall`. :class:`~transformers.BlenderbotSmall` should be used for the checkpoint - ``facebook/blenderbot-90M``. +- This is the `default` Blenderbot model class. However, some smaller checkpoints, such as + ``facebook/blenderbot_small_90M``, have a different architecture and consequently should be used with + `BlenderbotSmall <./blenderbot_small.rst>`. 
Usage @@ -64,7 +64,7 @@ Here is an example of model usage: >>> inputs = tokenizer([UTTERANCE], return_tensors='pt') >>> reply_ids = model.generate(**inputs) >>> print(tokenizer.batch_decode(reply_ids)) - >>> # should return [" That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?"] + [" That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?"] BlenderbotConfig diff --git a/docs/source/model_doc/blenderbot_small.rst b/docs/source/model_doc/blenderbot_small.rst index f3a21c04bded60..f44b0b73f2c91f 100644 --- a/docs/source/model_doc/blenderbot_small.rst +++ b/docs/source/model_doc/blenderbot_small.rst @@ -41,10 +41,6 @@ failure cases of our models.* The authors' code can be found `here `__ . - -Implementation Notes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BlenderbotSmallConfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/examples/research_projects/seq2seq-distillation/distillation.py b/examples/research_projects/seq2seq-distillation/distillation.py index e2b2ad22d60d11..3b3bd805894151 100755 --- a/examples/research_projects/seq2seq-distillation/distillation.py +++ b/examples/research_projects/seq2seq-distillation/distillation.py @@ -16,7 +16,7 @@ from finetune import main as ft_main from make_student import create_student_by_copying_alternating_layers, get_layers_to_supervise from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5ForConditionalGeneration -from transformers.models.mbart.modeling_mbart import shift_tokens_right +from transformers.models.bart.modeling_bart import shift_tokens_right from utils import calculate_bleu, check_output_dir, freeze_params, label_smoothed_nll_loss, use_task_specific_params diff --git a/examples/research_projects/seq2seq-distillation/finetune.py b/examples/research_projects/seq2seq-distillation/finetune.py index 0ca4e6f9bccdaa..156b4695a67e72 100755 --- a/examples/research_projects/seq2seq-distillation/finetune.py +++ b/examples/research_projects/seq2seq-distillation/finetune.py @@ -17,7 +17,7 @@ from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback from transformers import MBartTokenizer, T5ForConditionalGeneration -from transformers.models.mbart.modeling_mbart import shift_tokens_right +from transformers.models.bart.modeling_bart import shift_tokens_right from utils import ( ROUGE_KEYS, LegacySeq2SeqDataset, diff --git a/examples/research_projects/seq2seq-distillation/utils.py b/examples/research_projects/seq2seq-distillation/utils.py index c7f255e7e85267..b6994a1831da0a 100644 --- a/examples/research_projects/seq2seq-distillation/utils.py +++ b/examples/research_projects/seq2seq-distillation/utils.py @@ -21,7 +21,7 @@ from sentence_splitter import add_newline_to_end_of_each_sentence from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer from transformers.file_utils import cached_property -from transformers.models.mbart.modeling_mbart import shift_tokens_right +from transformers.models.bart.modeling_bart import shift_tokens_right try: diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 483ca5afca1033..1b128df0dcca67 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -1,5 +1,5 @@ # 
coding=utf-8 -# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ class BartConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to - instantiate an BART model according to the specified arguments, defining the model architecture. Instantiating a + instantiate a BART model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large `__ architecture. @@ -104,6 +104,7 @@ class BartConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "bart" + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 510c13a3599416..3c8b56e69e8379 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -102,16 +102,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -def BartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - class BartLearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. @@ -119,7 +109,6 @@ class BartLearnedPositionalEmbedding(nn.Embedding): def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 # and adjust num_embeddings appropriately. 
Other models dont have this hack self.offset = 2 @@ -277,13 +266,13 @@ def __init__(self, config: BartConfig): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BartLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): """ @@ -291,8 +280,9 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states hidden_states, attn_weights, _ = self.self_attn( @@ -337,17 +327,17 @@ def __init__(self, config: BartConfig): self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.self_attn_layer_norm = BartLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = BartAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) - self.encoder_attn_layer_norm = BartLayerNorm(self.embed_dim) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BartLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, @@ -368,8 +358,9 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states @@ -569,7 +560,7 @@ def __init_subclass__(self): For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to the right for denoising pre-training following the paper. 
- decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. @@ -632,7 +623,7 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -645,7 +636,7 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No self.padding_idx, ) self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layernorm_embedding = BartLayerNorm(embed_dim) + self.layernorm_embedding = nn.LayerNorm(embed_dim) self.init_weights() @@ -789,7 +780,7 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No self.padding_idx, ) self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layernorm_embedding = BartLayerNorm(config.d_model) + self.layernorm_embedding = nn.LayerNorm(config.d_model) self.init_weights() @@ -1059,7 +1050,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 03c24c209c5a1c..aece700d5c1529 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -545,7 +545,7 @@ def __init_subclass__(self): decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper. - decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. encoder_outputs (:obj:`tf.FloatTensor`, `optional`): hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 1325419c3266a5..1b48f8952b0596 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -99,6 +99,7 @@ class BlenderbotConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "blenderbot" + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index a99248742d37ba..4a79aa86a4baf6 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -104,17 +104,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Blenderbot -def BlenderbotLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - class BlenderbotLearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. @@ -122,7 +111,6 @@ class BlenderbotLearnedPositionalEmbedding(nn.Embedding): def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): @@ -279,13 +267,13 @@ def __init__(self, config: BlenderbotConfig): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.self_attn_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): """ @@ -293,8 +281,9 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. 
See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -340,17 +329,17 @@ def __init__(self, config: BlenderbotConfig): self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.self_attn_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = BlenderbotAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) - self.encoder_attn_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BlenderbotLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, @@ -371,8 +360,9 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -521,7 +511,7 @@ def dummy_inputs(self): decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Indices of decoder input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + Indices can be obtained using :class:`~transformers.BlenderbotTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. @@ -530,7 +520,7 @@ def dummy_inputs(self): Blenderbot uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). - decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. 
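# Hedged sketch of the backward-compatibility path added by the `from_pretrained` overrides in the
# commit above: loading the deprecated `facebook/blenderbot-90M` checkpoint through the Blenderbot
# classes emits a FutureWarning and returns the matching BlenderbotSmall object instead. Assumes the
# checkpoint is still published under that name and that network access is available.
import warnings

from transformers import BlenderbotModel, BlenderbotSmallModel

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    model = BlenderbotModel.from_pretrained("facebook/blenderbot-90M")

print(isinstance(model, BlenderbotSmallModel))  # True
print(any(issubclass(w.category, FutureWarning) for w in caught))  # True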
@@ -593,7 +583,7 @@ def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -606,7 +596,7 @@ def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding self.padding_idx, ) self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layer_norm = BlenderbotLayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -752,7 +742,7 @@ def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding self.padding_idx, ) self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layer_norm = BlenderbotLayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -1039,7 +1029,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index bf96a63d04a4da..2aa785d00a251c 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -1,8 +1,7 @@ -#!/usr/bin/env python3 # coding=utf-8 -# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved. # -# This source code is licensed under the MIT license found in the; +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -13,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# LICENSE file in the root directory of this source tree. -""""BlenderbotTokenizer and BlenderbotSmallTokenizer""" +"""Tokenization class for Blenderbot.""" + import json import os from typing import Dict, List, Optional, Tuple diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index f8b30553f079d2..b3c06258747772 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -42,19 +42,19 @@ class BlenderbotSmallConfig(PretrainedConfig): Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotSmallModel` or :class:`~transformers.TFBlenderbotSmallModel`. - d_model (:obj:`int`, `optional`, defaults to 1024): + d_model (:obj:`int`, `optional`, defaults to 512): Dimensionality of the layers and the pooler layer. - encoder_layers (:obj:`int`, `optional`, defaults to 12): + encoder_layers (:obj:`int`, `optional`, defaults to 8): Number of encoder layers. - decoder_layers (:obj:`int`, `optional`, defaults to 12): + decoder_layers (:obj:`int`, `optional`, defaults to 8): Number of decoder layers. encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, @@ -67,7 +67,7 @@ class BlenderbotSmallConfig(PretrainedConfig): The dropout ratio for activations inside the fully connected layer. classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): The dropout ratio for classifier. - max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): @@ -99,23 +99,24 @@ class BlenderbotSmallConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "blenderbot-small" + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, vocab_size=50265, - max_position_embeddings=1024, - encoder_layers=12, - encoder_ffn_dim=4096, + max_position_embeddings=512, + encoder_layers=8, + encoder_ffn_dim=2048, encoder_attention_heads=16, - decoder_layers=12, - decoder_ffn_dim=4096, + decoder_layers=8, + decoder_ffn_dim=2048, decoder_attention_heads=16, encoder_layerdrop=0.0, decoder_layerdrop=0.0, use_cache=True, is_encoder_decoder=True, activation_function="gelu", - d_model=1024, + d_model=512, dropout=0.1, attention_dropout=0.0, activation_dropout=0.0, diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index fe493a3a85f2d7..dbb6b6f8e05e93 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -101,17 +101,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->BlenderbotSmall -def BlenderbotSmallLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - # Copied from transformers.models.blenderbot.modeling_blenderbot.BlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding): """ @@ -120,7 +109,6 @@ class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding): def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): @@ -277,13 +265,13 @@ def __init__(self, config: BlenderbotSmallConfig): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.self_attn_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): """ @@ -291,8 +279,9 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. 
""" residual = hidden_states hidden_states, attn_weights, _ = self.self_attn( @@ -338,17 +327,17 @@ def __init__(self, config: BlenderbotSmallConfig): self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.self_attn_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = BlenderbotSmallAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) - self.encoder_attn_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = BlenderbotSmallLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, @@ -369,8 +358,9 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states @@ -522,7 +512,7 @@ def dummy_inputs(self): decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Indices of decoder input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + Indices can be obtained using :class:`~transformers.BlenderbotSmallTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. @@ -531,7 +521,7 @@ def dummy_inputs(self): BlenderbotSmall uses the :obj:`bos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). - decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. 
@@ -594,7 +584,7 @@ def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embe embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -607,7 +597,7 @@ def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embe self.padding_idx, ) self.layers = nn.ModuleList([BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layernorm_embedding = BlenderbotSmallLayerNorm(embed_dim) + self.layernorm_embedding = nn.LayerNorm(embed_dim) self.init_weights() @@ -752,7 +742,7 @@ def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embe self.padding_idx, ) self.layers = nn.ModuleList([BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layernorm_embedding = BlenderbotSmallLayerNorm(config.d_model) + self.layernorm_embedding = nn.LayerNorm(config.d_model) self.init_weights() @@ -1026,7 +1016,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 72aae43d8672ba..10dee6cbf9e3c4 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -1,8 +1,7 @@ -#!/usr/bin/env python3 # coding=utf-8 -# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved. # -# This source code is licensed under the MIT license found in the; +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -13,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# LICENSE file in the root directory of this source tree. -""""BlenderbotTokenizer and BlenderbotSmallTokenizer""" +"""Tokenization class for BlenderbotSmall.""" + import json import os from typing import Dict, List, Optional, Tuple diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index 86d3437ce2f186..7e436953c98975 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for BlenderbotSmall.""" +"""Fast tokenization class for BlenderbotSmall.""" from typing import List, Optional from tokenizers import ByteLevelBPETokenizer @@ -29,7 +29,7 @@ PRETRAINED_VOCAB_FILES_MAP = {} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/blenderbot-90M": 1024, + "facebook/blenderbot-90M": 512, } diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 90bae1d2c49397..e6b19842ba6769 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -79,7 +79,7 @@ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Indices of decoder input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.BartTokenizer`. See + Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. @@ -91,7 +91,7 @@ Provide for sequence to sequence training to the decoder. Indices can be obtained using :class:`~transformers.PretrainedTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. - decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. encoder_outputs (:obj:`tuple(torch.FloatTensor)`, `optional`): diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 0cd07ed6e6cd77..19e91a49d4c1f1 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -235,7 +235,7 @@ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper. - decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. If you want to change padding behavior, you should read :func:`modeling_fstm._prepare_fstm_decoder_inputs` and modify. 
See diagram 1 in the paper for more info on diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index e50bfb0ab11729..cc54540abaf142 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -99,6 +99,7 @@ class MarianConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "marian" + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 5fbdf2e9708ad3..6db24068f88b4d 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -102,17 +102,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Marian -def MarianLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - class MarianSinusoidalPositionalEmbedding(nn.Embedding): """This module produces sinusoidal positional embeddings of any length.""" @@ -292,13 +281,13 @@ def __init__(self, config: MarianConfig): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.self_attn_layer_norm = MarianLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = MarianLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): """ @@ -306,8 +295,9 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. 
+ output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states hidden_states, attn_weights, _ = self.self_attn( @@ -353,17 +343,17 @@ def __init__(self, config: MarianConfig): self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.self_attn_layer_norm = MarianLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = MarianAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) - self.encoder_attn_layer_norm = MarianLayerNorm(self.embed_dim) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = MarianLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, @@ -384,8 +374,9 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states @@ -540,7 +531,7 @@ def dummy_inputs(self): Marian uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). - decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. 
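The MarianLayerNorm deletion above, like the matching BlenderbotSmall, MBart, Pegasus, and template deletions elsewhere in this patch, removes an optional apex fast path in favor of constructing torch.nn.LayerNorm directly. The deleted helpers all reduced to the shape below; both branches return a module with the same (normalized_shape, eps, elementwise_affine) signature, which is why every call site could be swapped one-for-one::

    import torch

    def legacy_layer_norm(normalized_shape, eps=1e-5, elementwise_affine=True):
        # Reconstruction of the deleted helper: prefer apex's fused kernel if it is
        # installed, otherwise silently fall back to the stock PyTorch module.
        try:
            from apex.normalization import FusedLayerNorm

            return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)

    # What the refactored layers now construct directly:
    self_attn_layer_norm = torch.nn.LayerNorm(512)
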
@@ -603,7 +594,7 @@ def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -1028,7 +1019,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 69cf8f909e580e..4fbacd4c746088 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -99,6 +99,7 @@ class MBartConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "mbart" + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 092a4dd814ce44..147fd5d7c077c6 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -107,17 +107,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->MBart -def MBartLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - # Copied from transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding with Bart->MBart class MBartLearnedPositionalEmbedding(nn.Embedding): """ @@ -126,7 +115,6 @@ class MBartLearnedPositionalEmbedding(nn.Embedding): def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2 # and adjust num_embeddings appropriately. 
Other models dont have this hack self.offset = 2 @@ -285,13 +273,13 @@ def __init__(self, config: MBartConfig): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.self_attn_layer_norm = MBartLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = MBartLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): """ @@ -299,8 +287,9 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -345,17 +334,17 @@ def __init__(self, config: MBartConfig): self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.self_attn_layer_norm = MBartLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = MBartAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) - self.encoder_attn_layer_norm = MBartLayerNorm(self.embed_dim) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = MBartLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, @@ -376,8 +365,9 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -561,7 +551,7 @@ def dummy_inputs(self): decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Indices of decoder input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.BartTokenizer`. See + Indices can be obtained using :class:`~transformers.MBartTokenizer`. 
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. @@ -575,7 +565,7 @@ def dummy_inputs(self): For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to the right for denoising pre-training following the paper. - decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. @@ -638,7 +628,7 @@ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = N embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -651,8 +641,8 @@ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = N self.padding_idx, ) self.layers = nn.ModuleList([MBartEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layernorm_embedding = MBartLayerNorm(embed_dim) - self.layer_norm = MBartLayerNorm(config.d_model) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + self.layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -798,8 +788,8 @@ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = N self.padding_idx, ) self.layers = nn.ModuleList([MBartDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layernorm_embedding = MBartLayerNorm(config.d_model) - self.layer_norm = MBartLayerNorm(config.d_model) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -1068,7 +1058,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index c72a7af1eaea8e..0dee48e7bb39b2 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Google and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
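The embed_scale edits in this patch are cosmetic -- embed_dim is assigned from config.d_model a few lines earlier in each encoder, so the computed value is identical -- but the scaling itself is easy to miss: when scale_embedding is set, token embeddings are multiplied by sqrt(d_model) before positional embeddings are added. A standalone sketch with made-up sizes::

    import math

    import torch
    from torch import nn

    d_model, vocab_size, pad_token_id = 512, 32000, 1  # illustrative sizes, not a real checkpoint
    scale_embedding = True

    embed_tokens = nn.Embedding(vocab_size, d_model, padding_idx=pad_token_id)
    embed_scale = math.sqrt(d_model) if scale_embedding else 1.0  # same value either way the patch writes it

    input_ids = torch.tensor([[0, 860, 21, 2]])
    inputs_embeds = embed_tokens(input_ids) * embed_scale  # scaled before positional embeddings are added
    print(inputs_embeds.shape)  # torch.Size([1, 4, 512])
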
@@ -99,6 +99,7 @@ class PegasusConfig(PretrainedConfig): >>> configuration = model.config """ model_type = "pegasus" + keys_to_ignore_at_inference = ["past_key_values"] def __init__( self, diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index ffe32d1391c3e7..583cec288e489b 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Google and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -102,17 +102,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -# Copied from transformers.models.bart.modeling_bart.BartLayerNorm with Bart->Pegasus -def PegasusLayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - # Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->Pegasus class PegasusSinusoidalPositionalEmbedding(nn.Embedding): """This module produces sinusoidal positional embeddings of any length.""" @@ -293,13 +282,13 @@ def __init__(self, config: PegasusConfig): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.self_attn_layer_norm = PegasusLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = PegasusLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): """ @@ -307,8 +296,9 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. 
""" residual = hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -354,17 +344,17 @@ def __init__(self, config: PegasusConfig): self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.self_attn_layer_norm = PegasusLayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = PegasusAttention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) - self.encoder_attn_layer_norm = PegasusLayerNorm(self.embed_dim) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = PegasusLayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, @@ -385,8 +375,9 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -532,7 +523,7 @@ def dummy_inputs(self): decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Indices of decoder input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.MarianTokenizer`. See + Indices can be obtained using :class:`~transformers.PegasusTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. @@ -541,7 +532,7 @@ def dummy_inputs(self): Pegasus uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). - decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. 
@@ -604,7 +595,7 @@ def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -617,7 +608,7 @@ def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = self.padding_idx, ) self.layers = nn.ModuleList([PegasusEncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layer_norm = PegasusLayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -763,7 +754,7 @@ def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = self.padding_idx, ) self.layers = nn.ModuleList([PegasusDecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layer_norm = PegasusLayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -1038,7 +1029,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 682e039d4f3a78..3049b2f8cbb8bb 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -99,7 +99,7 @@ :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`). - decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 63b16f9e974bb4..b8d72e277456b5 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -1040,7 +1040,7 @@ def forward( To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training <./t5.html#training>`__. If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_input_ids` takes the value of :obj:`input_ids`. - decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. 
encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 437517ee497ec0..1391ebed1d981b 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -922,7 +922,7 @@ def _shift_right(self, input_ids): - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ - decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor)`, `optional`): diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py index 52218bc1ca7a9c..13311e3cf2a6de 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py @@ -135,6 +135,10 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): >>> configuration = model.config """ model_type = "{{cookiecutter.lowercase_modelname}}" + {% if cookiecutter.is_encoder_decoder_model == "False" -%} + {% else -%} + keys_to_ignore_at_inference = ["past_key_values"] + {% endif -%} def __init__( self, diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 33317c288a7518..029086d41f68d2 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -1849,7 +1849,7 @@ def dummy_inputs(self): decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper. - decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. encoder_outputs (:obj:`tf.FloatTensor`, `optional`): hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. 
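The config hunks in this patch (Marian, MBart, Pegasus, and the encoder-decoder branch of the cookiecutter template) all add keys_to_ignore_at_inference = ["past_key_values"]. Elsewhere in the library this attribute is read to skip the listed output keys at inference time -- for example so cached key/value states are not gathered as if they were predictions -- and declaring it is just a class attribute on the config. A minimal sketch with a made-up config class::

    from transformers import PretrainedConfig

    class ToySeq2SeqConfig(PretrainedConfig):
        # Hypothetical config, only to show where the class attribute lives.
        model_type = "toy-seq2seq"
        keys_to_ignore_at_inference = ["past_key_values"]

    config = ToySeq2SeqConfig(d_model=512)
    print(config.keys_to_ignore_at_inference)  # ['past_key_values']
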
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index 099f778f16adad..77c8847a39e025 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -1601,17 +1601,6 @@ def _expand_mask( return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) -def {{cookiecutter.camelcase_modelname}}LayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): - if torch.cuda.is_available(): - try: - from apex.normalization import FusedLayerNorm - - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - except ImportError: - pass - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - class {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(nn.Embedding): """ This module learns positional embeddings up to a fixed maximum size. @@ -1619,7 +1608,6 @@ class {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(nn.Embeddin def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): assert padding_idx is not None, "`padding_idx` should not be None, but of type int" - num_embeddings super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx) def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): @@ -1774,13 +1762,13 @@ def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config): num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, ) - self.self_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) - self.final_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False): """ @@ -1788,8 +1776,9 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, out hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` attention_mask (:obj:`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. 
""" residual = hidden_states hidden_states, attn_weights, _ = self.self_attn( @@ -1834,17 +1823,17 @@ def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config): self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout - self.self_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.encoder_attn = {{cookiecutter.camelcase_modelname}}Attention( self.embed_dim, config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, ) - self.encoder_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) - self.final_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) def forward( self, @@ -1865,8 +1854,9 @@ def forward( encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states - output_attentions (:obj:`bool`): Whether the base model outputs attentions. - This requires the attentions tensor to be reshaped in this function. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. """ residual = hidden_states @@ -2031,7 +2021,7 @@ def dummy_inputs(self): decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Provide for translation and summarization training. By default, the model will create this tensor by shifting the :obj:`input_ids` to the right, following the paper. - decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will also be used by default. 
@@ -2094,7 +2084,7 @@ def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tok embed_dim = config.d_model self.padding_idx = config.pad_token_id self.max_source_positions = config.max_position_embeddings - self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 if embed_tokens is not None: self.embed_tokens = embed_tokens @@ -2107,7 +2097,7 @@ def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tok self.padding_idx, ) self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}EncoderLayer(config) for _ in range(config.encoder_layers)]) - self.layernorm_embedding = {{cookiecutter.camelcase_modelname}}LayerNorm(embed_dim) + self.layernorm_embedding = nn.LayerNorm(embed_dim) self.init_weights() @@ -2251,7 +2241,7 @@ def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tok self.padding_idx, ) self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}DecoderLayer(config) for _ in range(config.decoder_layers)]) - self.layernorm_embedding = {{cookiecutter.camelcase_modelname}}LayerNorm(config.d_model) + self.layernorm_embedding = nn.LayerNorm(config.d_model) self.init_weights() @@ -2513,7 +2503,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py index 7f1734c85a46a8..d5ee422b126f81 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py @@ -480,8 +480,6 @@ def test_inference_masked_lm(self): import tempfile import unittest -import timeout_decorator # noqa - from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 9d242df95108fc..1100c893ae277e 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_modeling_blenderbot.py b/tests/test_modeling_blenderbot.py index c9302c29ccb421..b72cacf711961c 100644 --- a/tests/test_modeling_blenderbot.py +++ b/tests/test_modeling_blenderbot.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,8 +18,6 @@ import tempfile import unittest -import timeout_decorator # noqa - from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device diff --git a/tests/test_modeling_blenderbot_small.py b/tests/test_modeling_blenderbot_small.py index 3f180f6be319cc..f1a107a3dc344c 100644 --- a/tests/test_modeling_blenderbot_small.py +++ b/tests/test_modeling_blenderbot_small.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,8 +18,6 @@ import tempfile import unittest -import timeout_decorator # noqa - from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_torch, slow, torch_device diff --git a/tests/test_modeling_marian.py b/tests/test_modeling_marian.py index 3bed7f67e986e1..eee61080779504 100644 --- a/tests/test_modeling_marian.py +++ b/tests/test_modeling_marian.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Marian Team Authors and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,8 +18,6 @@ import tempfile import unittest -import timeout_decorator # noqa - from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.hf_api import HfApi diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index 6dc3a3bac19a32..d70362edd80528 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,8 +19,6 @@ import tempfile import unittest -import timeout_decorator # noqa - from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device diff --git a/tests/test_modeling_pegasus.py b/tests/test_modeling_pegasus.py index 74047f7750aa13..512b50ff7543b8 100644 --- a/tests/test_modeling_pegasus.py +++ b/tests/test_modeling_pegasus.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Google and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,8 +18,6 @@ import tempfile import unittest -import timeout_decorator # noqa - from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device diff --git a/tests/test_tokenization_small_blenderbot.py b/tests/test_tokenization_small_blenderbot.py index 40a23daa62858f..4259ee09414b1f 100644 --- a/tests/test_tokenization_small_blenderbot.py +++ b/tests/test_tokenization_small_blenderbot.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer.""" +"""Tests for the Blenderbot small tokenizer.""" import json import os import unittest From 0077a6ee886d75895cb1415bfa58387530d6ae76 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 19:47:19 +0000 Subject: [PATCH 46/51] apply suggestions --- .../models/auto/tokenization_auto.py | 2 +- .../models/blenderbot/__init__.py | 2 +- .../blenderbot/tokenization_blenderbot.py | 174 ------------------ .../configuration_blenderbot_small.py | 8 +- .../modeling_blenderbot_small.py | 8 +- .../tokenization_blenderbot_small.py | 10 +- .../tokenization_blenderbot_small_fast.py | 2 +- 7 files changed, 18 insertions(+), 188 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ecbc2d9ee784f4..313fc1832bab38 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -24,7 +24,7 @@ from ..bert.tokenization_bert import BertTokenizer from ..bert_japanese.tokenization_bert_japanese import BertJapaneseTokenizer from ..bertweet.tokenization_bertweet import BertweetTokenizer -from ..blenderbot.tokenization_blenderbot import BlenderbotSmallTokenizer +from ..blenderbot_small.tokenization_blenderbot_small import BlenderbotSmallTokenizer from ..ctrl.tokenization_ctrl import CTRLTokenizer from ..deberta.tokenization_deberta import DebertaTokenizer from ..distilbert.tokenization_distilbert import DistilBertTokenizer diff --git a/src/transformers/models/blenderbot/__init__.py b/src/transformers/models/blenderbot/__init__.py index 40c5c34f350d85..dad947a4992e5d 100644 --- a/src/transformers/models/blenderbot/__init__.py +++ b/src/transformers/models/blenderbot/__init__.py @@ -18,7 +18,7 @@ from ...file_utils import is_tf_available, is_torch_available from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig -from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer +from .tokenization_blenderbot import BlenderbotTokenizer if is_torch_available(): diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 2aa785d00a251c..3a750346a70853 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -92,177 +92,3 @@ def get_pairs(word): pairs = set(pairs) return pairs - - -class BlenderbotSmallTokenizer(PreTrainedTokenizer): - """ - Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding) - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. 
- Users should refer to the superclass for more information regarding methods. - - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - merges_file (:obj:`str`): - Path to the merges file. - bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`): - The beginning of sentence token. - eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`): - The end of sentence token. - unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`): - The token used for padding, for example when batching sequences of different lengths. - **kwargs - Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` - """ - - vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} - pretrained_vocab_files_map = { - "vocab_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/vocab.json"}, - "merges_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/merges.txt"}, - } - max_model_input_sizes = {"facebook/blenderbot-90M": 512} - - def __init__( - self, - vocab_file, - merges_file, - bos_token="__start__", - eos_token="__end__", - unk_token="__unk__", - pad_token="__null__", - **kwargs - ): - super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) - - with open(vocab_file, encoding="utf-8") as vocab_handle: - self.encoder = json.load(vocab_handle) - self.decoder = {v: k for k, v in self.encoder.items()} - with open(merges_file, encoding="utf-8") as merges_handle: - merges = merges_handle.read().split("\n")[1:-1] - merges = [tuple(merge.split()) for merge in merges] - self.bpe_ranks = dict(zip(merges, range(len(merges)))) - self.cache = {} - - @property - def vocab_size(self) -> int: - return len(self.encoder) - - def get_vocab(self) -> Dict: - return dict(self.encoder, **self.added_tokens_encoder) - - def bpe(self, token: str) -> str: - if token in self.cache: - return self.cache[token] - token = re.sub("([.,!?()])", r" \1", token) - token = re.sub("(')", r" \1 ", token) - token = re.sub(r"\s{2,}", " ", token) - if "\n" in token: - token = token.replace("\n", " __newln__") - - tokens = token.split(" ") - words = [] - for token in tokens: - if not len(token): - continue - - token = token.lower() - word = tuple(token) - word = tuple(list(word[:-1]) + [word[-1] + ""]) - pairs = get_pairs(word) - - if not pairs: - words.append(token) - continue - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except ValueError: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = "@@ ".join(word) - word = word[:-4] - - self.cache[token] = word - words.append(word) - return " ".join(words) - - def _tokenize(self, text: str) -> List[str]: - """ Split a string into tokens using BPE.""" - split_tokens = [] - - words = re.findall(r"\S+\n?", text) - - for 
token in words: - split_tokens.extend([t for t in self.bpe(token).split(" ")]) - return split_tokens - - def _convert_token_to_id(self, token: str) -> int: - """ Converts a token to an id using the vocab. """ - token = token.lower() - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _convert_id_to_token(self, index: int) -> str: - """Converts an index (integer) in a token (str) using the vocab.""" - return self.decoder.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens: List[str]) -> str: - """ Converts a sequence of tokens in a single string. """ - out_string = " ".join(tokens).replace("@@ ", "").strip() - return out_string - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - merge_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] - ) - - with open(vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(self.encoder, ensure_ascii=False)) - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write("#version: 0.2\n") - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - "Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file) - ) - index = token_index - writer.write(" ".join(bpe_tokens) + "\n") - index += 1 - - return vocab_file, merge_file diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index b3c06258747772..b7bde44ddacc20 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -21,7 +21,7 @@ logger = logging.get_logger(__name__) BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/blenderbot-90M": "https://huggingface.co/facebook/blenderbot-90M/resolve/main/config.json", + "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/config.json", # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small } @@ -31,7 +31,7 @@ class BlenderbotSmallConfig(PretrainedConfig): This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotSmallModel`. It is used to instantiate an BlenderbotSmall model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall - `facebook/blenderbot-90M `__ architecture. + `facebook/blenderbot_small-90M `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 
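The BlenderbotSmallTokenizer removed from tokenization_blenderbot.py earlier in this patch (it moves to blenderbot_small/tokenization_blenderbot_small.py rather than disappearing) implements a standard greedy BPE loop: repeatedly pick the adjacent symbol pair with the lowest merge rank and fuse it until no ranked pair remains. A stripped-down sketch of just that loop, with a toy merge table and without the lowercasing, regex pre-tokenization, and "@@" continuation markers the real tokenizer applies::

    def get_pairs(word):
        return {(word[i], word[i + 1]) for i in range(len(word) - 1)}

    def bpe(token, bpe_ranks):
        word = tuple(token)
        pairs = get_pairs(word)
        while pairs:
            # Pick the adjacent pair that was learned earliest (lowest rank).
            bigram = min(pairs, key=lambda pair: bpe_ranks.get(pair, float("inf")))
            if bigram not in bpe_ranks:
                break
            first, second = bigram
            new_word, i = [], 0
            while i < len(word):
                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            pairs = get_pairs(word)
        return " ".join(word)

    # Toy merge table: lower rank = merged earlier.
    ranks = {("a", "p"): 0, ("t", "e"): 1, ("ap", "t"): 2}
    print(bpe("apte", ranks))  # "ap te"
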
@@ -89,10 +89,10 @@ class BlenderbotSmallConfig(PretrainedConfig): >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig - >>> # Initializing a BlenderbotSmall facebook/blenderbot-90M style configuration + >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration >>> configuration = BlenderbotSmallConfig() - >>> # Initializing a model from the facebook/blenderbot-90M style configuration + >>> # Initializing a model from the facebook/blenderbot_small-90M style configuration >>> model = BlenderbotSmallModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index dbb6b6f8e05e93..c4af450bbeb530 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -49,7 +49,7 @@ BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/blenderbot-90M", + "facebook/blenderbot_small-90M", # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small ] @@ -467,7 +467,7 @@ def dummy_inputs(self): Conversation example:: >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration - >>> mname = 'facebook/blenderbot-90M' + >>> mname = 'facebook/blenderbot_small-90M' >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname) >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname) >>> UTTERANCE = "My friends are cool but they eat too many carbs." @@ -991,8 +991,8 @@ def forward( >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel - >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot-90M") - >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") + >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M") + >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M") >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 10dee6cbf9e3c4..0c8b821ff36325 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -77,10 +77,14 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer): vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} pretrained_vocab_files_map = { - "vocab_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/vocab.json"}, - "merges_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/merges.txt"}, + "vocab_file": { + "facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/vocab.json" + }, + "merges_file": { + "facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/merges.txt" + }, } - max_model_input_sizes = {"facebook/blenderbot-90M": 512} + max_model_input_sizes = {"facebook/blenderbot_small-90M": 512} def __init__( self, diff --git 
a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index 7e436953c98975..07d9242a90d516 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -29,7 +29,7 @@ PRETRAINED_VOCAB_FILES_MAP = {} PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "facebook/blenderbot-90M": 512, + "facebook/blenderbot_small-90M": 512, } From 945ad5febe58bf46a633a3764c1a857ceb38768b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 19:55:12 +0000 Subject: [PATCH 47/51] push last fixes --- src/transformers/models/auto/tokenization_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 313fc1832bab38..a2982efb53591a 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -194,12 +194,12 @@ (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)), (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)), (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)), + (MBartConfig, (BarthezTokenizer, BarthezTokenizerFast)), (MBartConfig, (MBartTokenizer, MBartTokenizerFast)), (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)), (MarianConfig, (MarianTokenizer, None)), (BlenderbotConfig, (BlenderbotSmallTokenizer, None)), (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), - (BartConfig, (BarthezTokenizer, BarthezTokenizerFast)), (BartConfig, (BartTokenizer, BartTokenizerFast)), (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), From 7e1b7fe1f9527c314d939af6fcc97eabbd007212 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 21:10:08 +0100 Subject: [PATCH 48/51] fix docs --- docs/source/model_doc/blenderbot.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst index 1f570088955e83..2f1ee4b0217a99 100644 --- a/docs/source/model_doc/blenderbot.rst +++ b/docs/source/model_doc/blenderbot.rst @@ -46,7 +46,7 @@ Implementation Notes - Available checkpoints can be found in the `model hub `__. - This is the `default` Blenderbot model class. However, some smaller checkpoints, such as ``facebook/blenderbot_small_90M``, have a different architecture and consequently should be used with - `BlenderbotSmall <./blenderbot_small.rst>`. + `BlenderbotSmall `__. Usage From c5e1ff344c977cfd28a843bf7dcf38b16824d0c7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 20:30:07 +0000 Subject: [PATCH 49/51] fix tok tests --- tests/test_tokenization_blenderbot.py | 70 +-------------------- tests/test_tokenization_small_blenderbot.py | 5 +- 2 files changed, 5 insertions(+), 70 deletions(-) diff --git a/tests/test_tokenization_blenderbot.py b/tests/test_tokenization_blenderbot.py index aa90655556960d..6cb4eacfb4b8bf 100644 --- a/tests/test_tokenization_blenderbot.py +++ b/tests/test_tokenization_blenderbot.py @@ -14,78 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
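The ``tokenization_auto.py`` reordering above matters because the config-to-tokenizer table is consulted in order: an entry keyed on a more specific config class has to precede any base class it inherits from, or the earlier, more general entry would shadow it. A toy sketch of that first-match lookup — the ``Toy*`` names are hypothetical, and this is not the library's actual ``AutoTokenizer`` code, which performs additional checks::

    # Toy illustration of order-sensitive, isinstance-based lookup in a
    # config -> tokenizer table. All Toy* classes are made up for this sketch.
    class ToyBartConfig: ...
    class ToyMBartConfig(ToyBartConfig): ...  # a more specific subclass

    class ToyBartTokenizer: ...
    class ToyMBartTokenizer: ...

    # The subclass entry has to come first; otherwise the base-class entry
    # would also capture ToyMBartConfig instances.
    TOY_TOKENIZER_MAPPING = [
        (ToyMBartConfig, ToyMBartTokenizer),
        (ToyBartConfig, ToyBartTokenizer),
    ]

    def resolve_tokenizer(config):
        # Return the tokenizer class of the first entry whose config class matches.
        for config_cls, tokenizer_cls in TOY_TOKENIZER_MAPPING:
            if isinstance(config, config_cls):
                return tokenizer_cls
        raise ValueError(f"no tokenizer registered for {type(config).__name__}")

    assert resolve_tokenizer(ToyMBartConfig()) is ToyMBartTokenizer
    assert resolve_tokenizer(ToyBartConfig()) is ToyBartTokenizer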
"""Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer.""" -import json -import os import unittest from transformers.file_utils import cached_property -from transformers.models.blenderbot.tokenization_blenderbot import ( - VOCAB_FILES_NAMES, - BlenderbotSmallTokenizer, - BlenderbotTokenizer, -) - -from .test_tokenization_common import TokenizerTesterMixin - - -class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - - tokenizer_class = BlenderbotSmallTokenizer - - def setUp(self): - super().setUp() - - vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"] - vocab_tokens = dict(zip(vocab, range(len(vocab)))) - - merges = ["#version: 0.2", "a p", "t e", "ap t", "a d", "ad apt", "a c", "ac t", ""] - self.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"} - - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: - fp.write("\n".join(merges)) - - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_input_output_texts(self, tokenizer): - input_text = "adapt act apte" - output_text = "adapt act apte" - return input_text, output_text - - def test_full_blenderbot_small_tokenizer(self): - tokenizer = BlenderbotSmallTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) - text = "adapt act apte" - bpe_tokens = ["adapt", "act", "ap@@", "te"] - tokens = tokenizer.tokenize(text) - self.assertListEqual(tokens, bpe_tokens) - - input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token] - - input_bpe_tokens = [0, 1, 2, 3, 4, 5] - self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - def test_special_tokens_small_tok(self): - tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") - assert tok("sam").input_ids == [1384] - src_text = "I am a small frog." - encoded = tok([src_text], padding=False, truncation=False)["input_ids"] - decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - assert src_text != decoded # I wish it did! - assert decoded == "i am a small frog ." - - def test_empty_word_small_tok(self): - tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") - src_text = "I am a small frog ." - src_text_dot = "." 
- encoded = tok(src_text)["input_ids"] - encoded_dot = tok(src_text_dot)["input_ids"] - - assert encoded[-1] == encoded_dot[0] +from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer class Blenderbot3BTokenizerTests(unittest.TestCase): diff --git a/tests/test_tokenization_small_blenderbot.py b/tests/test_tokenization_small_blenderbot.py index 4259ee09414b1f..e4ee8254e1bebc 100644 --- a/tests/test_tokenization_small_blenderbot.py +++ b/tests/test_tokenization_small_blenderbot.py @@ -18,7 +18,10 @@ import os import unittest -from transformers.models.blenderbot.tokenization_blenderbot import VOCAB_FILES_NAMES, BlenderbotSmallTokenizer +from transformers.models.blenderbot_small.tokenization_blenderbot_small import ( + VOCAB_FILES_NAMES, + BlenderbotSmallTokenizer, +) from .test_tokenization_common import TokenizerTesterMixin From dff4c1a268b0bbe388342f263ad898f39de53dff Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 20:36:19 +0000 Subject: [PATCH 50/51] fix imports code style --- .../models/blenderbot/tokenization_blenderbot.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 3a750346a70853..93537f1f4d9594 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -14,13 +14,8 @@ # limitations under the License. """Tokenization class for Blenderbot.""" -import json -import os -from typing import Dict, List, Optional, Tuple +from typing import List -import regex as re - -from ...tokenization_utils import PreTrainedTokenizer from ...utils import logging from ..roberta.tokenization_roberta import RobertaTokenizer From dc3cdefde45d3ce3d43946de69b42dcf01d42843 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 5 Jan 2021 21:43:51 +0100 Subject: [PATCH 51/51] fix doc --- docs/source/model_doc/marian.rst | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/source/model_doc/marian.rst b/docs/source/model_doc/marian.rst index 83e3efe52fd8d3..b7d97aae990e85 100644 --- a/docs/source/model_doc/marian.rst +++ b/docs/source/model_doc/marian.rst @@ -55,14 +55,10 @@ Examples - Since Marian models are smaller than many other translation models available in the library, they can be useful for fine-tuning experiments and integration tests. -<<<<<<< HEAD - `Fine-tune on GPU -`__ +- `Fine-tune on GPU + `__ - `Fine-tune on GPU with pytorch-lightning -`__ -======= - :prefix_link:`Fine-tune on TPU ` - -:prefix_link:`Fine-tune on GPU ` - -:prefix_link:`Fine-tune on GPU with pytorch-lightning ` >>>>>>> -main/master + `__ Multilingual Models ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
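As a companion to the fine-tuning links restored in ``marian.rst`` above, a short inference sketch for a Marian translation model; the checkpoint name ``Helsinki-NLP/opus-mt-en-de`` is assumed here purely for illustration and is downloaded on first use::

    from transformers import MarianMTModel, MarianTokenizer

    # Any public Helsinki-NLP Marian checkpoint works; English -> German is assumed here.
    model_name = "Helsinki-NLP/opus-mt-en-de"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Tokenize a single sentence and generate the translation.
    batch = tokenizer(["Marian models are small enough for quick fine-tuning experiments."], return_tensors="pt")
    generated_ids = model.generate(**batch)
    print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))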