From 9da38535321a1629c499e8a4f6b17b4a3cea5d54 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 25 Mar 2021 02:41:59 -0700 Subject: [PATCH 01/42] Add support for NVIDIA Megatron models --- docs/source/model_doc/megatron_bert.rst | 129 ++ docs/source/model_doc/megatron_gpt2.rst | 81 + examples/megatron-models/README.md | 108 + examples/megatron-models/run_bert.py | 99 + examples/megatron-models/run_gpt2.py | 86 + src/transformers/__init__.py | 24 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 4 + src/transformers/models/auto/modeling_auto.py | 20 + .../models/megatron_bert/__init__.py | 95 + .../configuration_megatron_bert.py | 141 ++ .../convert_megatron_bert_checkpoint.py | 286 +++ .../megatron_bert/modeling_megatron_bert.py | 1812 +++++++++++++++++ .../convert_megatron_gpt2_checkpoint.py | 232 +++ utils/check_repo.py | 6 + 15 files changed, 3124 insertions(+) create mode 100644 docs/source/model_doc/megatron_bert.rst create mode 100644 docs/source/model_doc/megatron_gpt2.rst create mode 100644 examples/megatron-models/README.md create mode 100644 examples/megatron-models/run_bert.py create mode 100644 examples/megatron-models/run_gpt2.py create mode 100644 src/transformers/models/megatron_bert/__init__.py create mode 100644 src/transformers/models/megatron_bert/configuration_megatron_bert.py create mode 100644 src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py create mode 100755 src/transformers/models/megatron_bert/modeling_megatron_bert.py create mode 100644 src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst new file mode 100644 index 00000000000000..600c357dd8d580 --- /dev/null +++ b/docs/source/model_doc/megatron_bert.rst @@ -0,0 +1,129 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Copyright 2021 NVIDIA Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +MegatronBERT +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion +Parameter Language Models Using Model Parallelism +`__ by Mohammad Shoeybi, Mostofa Patwary, +Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. + +The abstract from the paper is the following: + +*Recent work in language modeling demonstrates that training large transformer +models advances the state of the art in Natural Language Processing +applications. However, very large models can be quite difficult to train due to +memory constraints. In this work, we present our techniques for training very +large transformer models and implement a simple, efficient intra-layer model +parallel approach that enables training transformer models with billions of +parameters. 
Our approach does not require a new compiler or library changes, is +orthogonal and complimentary to pipeline model parallelism, and can be fully +implemented with the insertion of a few communication operations in native +PyTorch. We illustrate this approach by converging transformer based models up +to 8.3 billion parameters using 512 GPUs. We sustain 15.1 PetaFLOPs across the +entire application with 76% scaling efficiency when compared to a strong single +GPU baseline that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To +demonstrate that large language models can further advance the state of the art +(SOTA), we train an 8.3 billion parameter transformer language model similar to +GPT-2 and a 3.9 billion parameter model similar to BERT. We show that careful +attention to the placement of layer normalization in BERT-like models is +critical to achieving increased performance as the model size grows. Using the +GPT-2 model we achieve SOTA results on the WikiText103 (10.8 compared to SOTA +perplexity of 15.8) and LAMBADA (66.5% compared to SOTA accuracy of 63.2%) +datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% +compared to SOTA accuracy of 89.4%).* + +Tips: + +We have provided pretrained `BERT-345M +`__ checkpoints +for use to evaluate or finetuning downstream tasks. + +To access these checkpoints, first `sign up `__ +for and setup the NVIDIA GPU Cloud (NGC) Registry CLI. Further documentation +for downloading models can be found in the `NGC documentation +`__. + +Alternatively, you can directly download the checkpoints using: + +BERT-345M-uncased:: + + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O megatron_bert_345m_v0_1_uncased.zip + +BERT-345M-cased:: + + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O megatron_bert_345m_v0_1_cased.zip + +Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to +convert them to a format that will easily be loaded by Hugging Face +Transformers and our port of the BERT code. + +The following commands allow you to do the conversion. We assume that the +folder ``models/megatron_bert`` contains ``megatron_bert_345m_v0_1_{cased, +uncased}.zip`` and that the commands are run from inside that folder:: + + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip + +The original code can be found `here +`__. That repository +contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, +it contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. + +MegatronBertConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertConfig + :members: + + +MegatronBertModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertModel + :members: forward + + +MegatronBertForConditionalGeneration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.MegatronBertForConditionalGeneration + :members: forward + + +MegatronBertForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForSequenceClassification + :members: forward + + +MegatronBertForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForQuestionAnswering + :members: forward + + +MegatronBertForCausalLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForCausalLM + :members: forward + + diff --git a/docs/source/model_doc/megatron_gpt2.rst b/docs/source/model_doc/megatron_gpt2.rst new file mode 100644 index 00000000000000..9f41fb9c73d2ea --- /dev/null +++ b/docs/source/model_doc/megatron_gpt2.rst @@ -0,0 +1,81 @@ +.. + Copyright 2020 The HuggingFace Team. All rights reserved. + + Copyright 2021 NVIDIA Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +MegatronGPT2 +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion +Parameter Language Models Using Model Parallelism +`__ by Mohammad Shoeybi, Mostofa Patwary, +Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. + +The abstract from the paper is the following: + +*Recent work in language modeling demonstrates that training large transformer +models advances the state of the art in Natural Language Processing +applications. However, very large models can be quite difficult to train due to +memory constraints. In this work, we present our techniques for training very +large transformer models and implement a simple, efficient intra-layer model +parallel approach that enables training transformer models with billions of +parameters. Our approach does not require a new compiler or library changes, is +orthogonal and complimentary to pipeline model parallelism, and can be fully +implemented with the insertion of a few communication operations in native +PyTorch. We illustrate this approach by converging transformer based models up +to 8.3 billion parameters using 512 GPUs. We sustain 15.1 PetaFLOPs across the +entire application with 76% scaling efficiency when compared to a strong single +GPU baseline that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To +demonstrate that large language models can further advance the state of the art +(SOTA), we train an 8.3 billion parameter transformer language model similar to +GPT-2 and a 3.9 billion parameter model similar to BERT. 
We show that careful +attention to the placement of layer normalization in BERT-like models is +critical to achieving increased performance as the model size grows. Using the +GPT-2 model we achieve SOTA results on the WikiText103 (10.8 compared to SOTA +perplexity of 15.8) and LAMBADA (66.5% compared to SOTA accuracy of 63.2%) +datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% +compared to SOTA accuracy of 89.4%).* + +Tips: + +We have provided pretrained `GPT2-345M +`__ checkpoints +for use to evaluate or finetuning downstream tasks. + +To access these checkpoints, first `sign up `__ +for and setup the NVIDIA GPU Cloud (NGC) Registry CLI. Further documentation +for downloading models can be found in the `NGC documentation +`__. + +Alternatively, you can directly download the checkpoints using:: + + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_gpt2_345m_v0_0.zip + +Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to +convert it to a format that will easily be loaded by Hugging Face Transformers +GPT2 implementation. + +The following command allows you to do the conversion. We assume that the +folder ``models/megatron_gpt2`` contains ``megatron_gpt2_345m_v0_0.zip`` and +that the command is run from that folder:: + + python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip + +The original code can be found `here +`__. That repository +contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, +it contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. + + diff --git a/examples/megatron-models/README.md b/examples/megatron-models/README.md new file mode 100644 index 00000000000000..ad16d34fa94eb3 --- /dev/null +++ b/examples/megatron-models/README.md @@ -0,0 +1,108 @@ + + +# How to run Megatron BERT and GPT2 using Transformers + +## Get the checkpoints from the NVIDIA GPU Cloud + +The first step is to create a directory in the current folder (`examples/megatron-lm`) to store the +checkpoints. + +``` +mkdir -p models/{bert, gpt2} +``` + +Then, you can download the checkpoints from the NVIDIA GPU Cloud (NGC). For that you have to +[sign up](https://ngc.nvidia.com/signup) for and setup the NVIDIA GPU Cloud (NGC) Registry CLI. +Further documentation for downloading models can be found in the +[NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1). + + +Alternatively, you can directly download the checkpoints using: + +### BERT 345M cased + +``` +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O models/bert/megatron_bert_345m_v0_1_cased.zip +``` + +### BERT 345M uncased + +``` +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O models/bert/megatron_bert_345m_v0_1_uncased.zip +``` + +### GPT2 345M + +``` +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O models/gpt2/megatron_gpt2_345m_v0_0.zip +``` + +## Converting the checkpoints + +In order to be loaded into `Transformers`, the checkpoints have to be converted. You should run the following +commands for that purpose. 
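+
+The commands below use a `$PATH_TO_TRANSFORMERS` variable, which is assumed to point to the
+`src/transformers` folder of a local clone of this repository (the path shown here is only an
+example, adjust it to your setup):
+
+```
+export PATH_TO_TRANSFORMERS=/path/to/transformers/src/transformers
+```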
+ +### BERT 345M cased + +``` +python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/bert/megatron_bert_345m_v0_1_cased.zip +``` + +### BERT 345M uncased + +``` +python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/bert/megatron_bert_345m_v0_1_uncased.zip +``` + +### GPT2 345M + +``` +python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py models/gpt2/megatron_gpt2_345m_v0_0.zip +``` + +## Running the samples + +For BERT, we created a simple example that runs two tasks using the Megatron BERT checkpoints using +the Transformers API. The first task is `MegatronBERTForMaskedLM` and the second one is +`MegatronBERTForNextSentencePrediction`. + +### Masked LM + +``` +python3 ./run_bert.py --masked-lm ./models/bert/megatron_bert_345m_v0_1_cased +python3 ./run_bert.py --masked-lm ./models/bert/megatron_bert_345m_v0_1_uncased +``` + +### Next sentence prediction + +``` +python3 ./run_bert.py ./models/bert/megatron_bert_345m_v0_1_cased +python3 ./run_bert.py ./models/bert/megatron_bert_345m_v0_1_uncased +``` + +### Text generation + +For GPT2, we created a simple for text generation. + +``` +python3 ./run_gpt2.py models/gpt2/megatron_gpt2_345m_v0_0 +``` + diff --git a/examples/megatron-models/run_bert.py b/examples/megatron-models/run_bert.py new file mode 100644 index 00000000000000..6d04aa6bbe732b --- /dev/null +++ b/examples/megatron-models/run_bert.py @@ -0,0 +1,99 @@ +#################################################################################################### + +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#################################################################################################### + +import argparse +import os +import torch +from transformers import BertTokenizer +from transformers import MegatronBertConfig +from transformers import MegatronBertForMaskedLM, MegatronBertForNextSentencePrediction + +#################################################################################################### + +def main(): + + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument('--masked-lm', action='store_true') + parser.add_argument('checkpoint', type=str, + help="""Path to the folder containing the config.json and checkpoint.pt extracted from the + NGC checkpoint using the convert_megatron_bert_checkpoint.py script.""") + args = parser.parse_args() + + # Do we use the cased/uncased model. + is_uncased = 'uncased' in args.checkpoint + + # The base model. + bert = 'bert-base-' + ('uncased' if is_uncased else 'cased') + # The tokenizer. Megatron was trained with standard tokenizer(s). + tokenizer = BertTokenizer.from_pretrained(bert) + + # The config file. + config_file = os.path.join(args.checkpoint, 'config.json') + # Load the config. + config = MegatronBertConfig.from_pretrained(config_file) + # Make sure we do not try to tie embeddings. 
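+    # (The converted checkpoint already contains an explicit `cls.predictions.decoder.weight`, copied from
+    # the word embeddings by the conversion script, so the two matrices do not need to be tied again here.)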
+ config.tie_word_embeddings = False + + # The model class. + model_cls = MegatronBertForMaskedLM if args.masked_lm else MegatronBertForNextSentencePrediction + # The checkpoint file. + checkpoint_file = os.path.join(args.checkpoint, 'checkpoint.pt') + # Load the model from transformers. + model = model_cls.from_pretrained(checkpoint_file, config=config) + + # Do not run backward. + model.eval() + + # Copy to the device and use FP16. + assert torch.cuda.is_available() + device = torch.device('cuda') + model.to(device) + model.half() + + # The input sentence. + + # Create a dummy sentence (from the BERT example page). + if args.masked_lm: + input = tokenizer('the capital of france is [MASK]', return_tensors='pt') + input = input.to(device) + label = tokenizer('the capital of france is paris', return_tensors='pt')['input_ids'] + label = label.to(device) + output = model(**input, labels=label) + else: + prompt = 'In Italy, pizza served in formal settings is presented unsliced.' + next_sentence = 'The sky is blue due to the shorter wavelength of blue light.' + input = tokenizer(prompt, next_sentence, return_tensors='pt') + input = input.to(device) + label = torch.LongTensor([1]) + label = label.to(device) + + # Run the model. + output = model(**input, labels=label) + + # Outputs. + print('loss: ', output.loss) + print('logits: ', output.logits) + +#################################################################################################### + +if __name__ == '__main__': + main() + +#################################################################################################### + diff --git a/examples/megatron-models/run_gpt2.py b/examples/megatron-models/run_gpt2.py new file mode 100644 index 00000000000000..6f3ce7dddcd829 --- /dev/null +++ b/examples/megatron-models/run_gpt2.py @@ -0,0 +1,86 @@ +#################################################################################################### + +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#################################################################################################### + +import argparse +import os +import torch +from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel + +#################################################################################################### + +def main(): + + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument('checkpoint', type=str) + args = parser.parse_args() + + # The tokenizer. Megatron was trained with standard tokenizer(s). + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + + # The config file. + config_file = os.path.join(args.checkpoint, 'config.json') + # Load the GPT2 config. + config = GPT2Config.from_pretrained(config_file) + + # The checkpoint file. + checkpoint_file = os.path.join(args.checkpoint, 'checkpoint.pt') + # Load GPT2 model from transformers. + model = GPT2LMHeadModel.from_pretrained(checkpoint_file, config=config) + + # Do not run backward. 
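+    # (eval() mainly disables dropout; no backward pass is requested anywhere in this script.)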
+ model.eval() + + # Copy to the device and use FP16. + assert torch.cuda.is_available() + device = torch.device('cuda') + model.to(device) + model.half() + + # Create an empty sentence. + input = tokenizer.encode('', return_tensors='pt') + input = input.to(device) + + # The token ids. + if input.size()[-1] == 0: + input_ids = None + else: + input_ids = input + + # Generate the sentence. + output = model.generate(input_ids=input_ids, + max_length=128, + temperature=1.0, + top_k=0, + top_p=0.9, + do_sample=True, + num_return_sequences=1) + + # Output the text. + for sentence in output: + sentence = sentence.tolist() + text = tokenizer.decode(sentence, clean_up_tokenization_spaces=True) + print(text) + +#################################################################################################### + +if __name__ == '__main__': + main() + +#################################################################################################### + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index f5954696e9ba00..e9613489476b40 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -187,6 +187,7 @@ "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"], "models.marian": ["MarianConfig"], "models.mbart": ["MBartConfig"], + "models.megatron_bert": ["MegatronBertConfig"], "models.mmbt": ["MMBTConfig"], "models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"], "models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"], @@ -736,6 +737,16 @@ "MBartModel", ] ) + _import_structure["models.megatron_bert"].extend( + [ + "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "MegatronBertForCausalLM", + "MegatronBertForConditionalGeneration", + "MegatronBertForQuestionAnswering", + "MegatronBertForSequenceClassification", + "MegatronBertModel", + ] + ) _import_structure["models.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"]) _import_structure["models.mobilebert"].extend( [ @@ -1463,6 +1474,7 @@ from .models.m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config from .models.marian import MarianConfig from .models.mbart import MBartConfig + from .models.megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig from .models.mmbt import MMBTConfig from .models.mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig, MobileBertTokenizer from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer @@ -1930,6 +1942,18 @@ MBartForSequenceClassification, MBartModel, ) + from .models.megatron_bert import ( + MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MegatronBertLayer, + MegatronBertModel, + MegatronBertForMaskedLM, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForSequenceClassification, + MegatronBertForMultipleChoice, + MegatronBertForTokenClassification, + MegatronBertForQuestionAnswering, + ) from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings from .models.mobilebert import ( MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index efc6aedef39105..97b8c8de890faa 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -50,6 +50,7 @@ m2m_100, marian, mbart, + megatron_bert, mmbt, mobilebert, mpnet, diff --git a/src/transformers/models/auto/configuration_auto.py 
b/src/transformers/models/auto/configuration_auto.py index b32140c7c1c11c..52026b743eac6c 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -50,6 +50,7 @@ from ..m2m_100.configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config from ..marian.configuration_marian import MarianConfig from ..mbart.configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig +from ..megatron_bert.configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig from ..mobilebert.configuration_mobilebert import MobileBertConfig from ..mpnet.configuration_mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig from ..mt5.configuration_mt5 import MT5Config @@ -85,6 +86,7 @@ # Add archive maps here GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, + MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -155,6 +157,7 @@ ("pegasus", PegasusConfig), ("marian", MarianConfig), ("mbart", MBartConfig), + ("megatron_bert", MegatronBertConfig), ("mpnet", MPNetConfig), ("bart", BartConfig), ("blenderbot", BlenderbotConfig), @@ -211,6 +214,7 @@ ("blenderbot", "Blenderbot"), ("marian", "Marian"), ("mbart", "mBART"), + ("megatron_bert", "MegatronBert"), ("bart", "BART"), ("reformer", "Reformer"), ("longformer", "Longformer"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index aecd7aa96715be..101399ab6584ce 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -176,6 +176,16 @@ MBartForSequenceClassification, MBartModel, ) +from ..megatron_bert.modeling_megatron_bert import ( + MegatronBertModel, + MegatronBertForMaskedLM, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForSequenceClassification, + MegatronBertForMultipleChoice, + MegatronBertForTokenClassification, + MegatronBertForQuestionAnswering, +) from ..mobilebert.modeling_mobilebert import ( MobileBertForMaskedLM, MobileBertForMultipleChoice, @@ -300,6 +310,7 @@ M2M100Config, MarianConfig, MBartConfig, + MegatronBertConfig, MobileBertConfig, MPNetConfig, MT5Config, @@ -358,6 +369,7 @@ (BertConfig, BertModel), (OpenAIGPTConfig, OpenAIGPTModel), (GPT2Config, GPT2Model), + (MegatronBertConfig, MegatronBertModel), (MobileBertConfig, MobileBertModel), (TransfoXLConfig, TransfoXLModel), (XLNetConfig, XLNetModel), @@ -401,6 +413,7 @@ (BigBirdConfig, BigBirdForPreTraining), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), + (MegatronBertConfig, MegatronBertForPreTraining), (MobileBertConfig, MobileBertForPreTraining), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), @@ -444,6 +457,7 @@ (BertConfig, BertForMaskedLM), (OpenAIGPTConfig, OpenAIGPTLMHeadModel), (GPT2Config, GPT2LMHeadModel), + (MegatronBertConfig, MegatronBertForMaskedLM), (MobileBertConfig, MobileBertForMaskedLM), (TransfoXLConfig, TransfoXLLMHeadModel), (XLNetConfig, XLNetLMHeadModel), @@ -517,6 +531,7 @@ (RobertaConfig, RobertaForMaskedLM), (SqueezeBertConfig, SqueezeBertForMaskedLM), (BertConfig, BertForMaskedLM), + (MegatronBertConfig, MegatronBertForMaskedLM), (MobileBertConfig, MobileBertForMaskedLM), (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), @@ -569,6 +584,7 @@ (LayoutLMConfig, 
LayoutLMForSequenceClassification), (BertConfig, BertForSequenceClassification), (XLNetConfig, XLNetForSequenceClassification), + (MegatronBertConfig, MegatronBertForSequenceClassification), (MobileBertConfig, MobileBertForSequenceClassification), (FlaubertConfig, FlaubertForSequenceClassification), (XLMConfig, XLMForSequenceClassification), @@ -605,6 +621,7 @@ (BertConfig, BertForQuestionAnswering), (XLNetConfig, XLNetForQuestionAnsweringSimple), (FlaubertConfig, FlaubertForQuestionAnsweringSimple), + (MegatronBertConfig, MegatronBertForQuestionAnswering), (MobileBertConfig, MobileBertForQuestionAnswering), (XLMConfig, XLMForQuestionAnsweringSimple), (ElectraConfig, ElectraForQuestionAnswering), @@ -640,6 +657,7 @@ (RobertaConfig, RobertaForTokenClassification), (SqueezeBertConfig, SqueezeBertForTokenClassification), (BertConfig, BertForTokenClassification), + (MegatronBertConfig, MegatronBertForTokenClassification), (MobileBertConfig, MobileBertForTokenClassification), (XLNetConfig, XLNetForTokenClassification), (AlbertConfig, AlbertForTokenClassification), @@ -666,6 +684,7 @@ (SqueezeBertConfig, SqueezeBertForMultipleChoice), (BertConfig, BertForMultipleChoice), (DistilBertConfig, DistilBertForMultipleChoice), + (MegatronBertConfig, MegatronBertForMultipleChoice), (MobileBertConfig, MobileBertForMultipleChoice), (XLNetConfig, XLNetForMultipleChoice), (AlbertConfig, AlbertForMultipleChoice), @@ -680,6 +699,7 @@ MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = OrderedDict( [ (BertConfig, BertForNextSentencePrediction), + (MegatronBertConfig, MegatronBertForNextSentencePrediction), (MobileBertConfig, MobileBertForNextSentencePrediction), ] ) diff --git a/src/transformers/models/megatron_bert/__init__.py b/src/transformers/models/megatron_bert/__init__.py new file mode 100644 index 00000000000000..5458505249f1c5 --- /dev/null +++ b/src/transformers/models/megatron_bert/__init__.py @@ -0,0 +1,95 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2021 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING +from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available, is_tokenizers_available +_import_structure = { + "configuration_megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], + "tokenization_megatron_bert": ["MegatronBertTokenizer"], +} + +if is_tokenizers_available(): + _import_structure["tokenization_megatron_bert_fast"] = ["MegatronBertTokenizerFast"] + +if is_torch_available(): + _import_structure["modeling_megatron_bert"] = [ + "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "MegatronBertForMaskedLM", + "MegatronBertForNextSentencePrediction", + "MegatronBertForConditionalGeneration", + "MegatronBertForQuestionAnswering", + "MegatronBertForSequenceClassification", + "MegatronBertForCausalLM", + "MegatronBertModel", + "MegatronBertLMHeadModel", + "MegatronBertPreTrainedModel", + ] + +# if is_tf_available(): +# _import_structure["modeling_tf_megatron_bert"] = [ +# "TFMegatronBertForConditionalGeneration", +# "TFMegatronBertModel", +# "TFMegatronBertPreTrainedModel", +# ] + +if TYPE_CHECKING: + from .configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig + from .tokenization_megatron_bert import MegatronBertTokenizer + + if is_tokenizers_available(): + from .tokenization_megatron_bert_fast import MegatronBertTokenizerFast + + if is_torch_available(): + from .modeling_megatron_bert import ( + MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MegatronBertForMaskedLM, + MegatronBertForNextSentencePrediction, + MegatronBertForConditionalGeneration, + MegatronBertForCausalLM, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertModel, + MegatronBertLMHeadModel, + MegatronBertPreTrainedModel, + ) + +# if is_tf_available(): +# from .modeling_tf_megatron_bert import ( +# TFMegatronBertForConditionalGeneration, +# TFMegatronBertModel, +# TFMegatronBertPreTrainedModel, +# ) + +else: + import importlib + import os + import sys + + class _LazyModule(_BaseLazyModule): + """ + Module class that surfaces all objects but only performs associated imports when the objects are requested. + """ + + __file__ = globals()["__file__"] + __path__ = [os.path.dirname(__file__)] + + def _get_module(self, module_name: str): + return importlib.import_module("." + module_name, self.__name__) + + sys.modules[__name__] = _LazyModule(__name__, _import_structure) + diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py new file mode 100644 index 00000000000000..fdf92405e19fc9 --- /dev/null +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" MEGATRON_BERT model configuration """ + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + +logger = logging.get_logger(__name__) + +MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + # See all MEGATRON_BERT models at https://huggingface.co/models?filter=bert +} + + +class MegatronBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel` or a + :class:`~transformers.TFMegatronBertModel`. It is used to instantiate a MEGATRON_BERT model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the MEGATRON_BERT `bert-base-uncased `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel` or + :class:`~transformers.TFMegatronBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.MegatronBertModel` or + :class:`~transformers.TFMegatronBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`): + Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`, + :obj:`"relative_key_query"`. 
For positional embeddings use :obj:`"absolute"`. For more information on + :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.) + `__. For more information on :obj:`"relative_key_query"`, please refer to + `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.) + `__. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if ``config.is_decoder=True``. + + Examples:: + + >>> from transformers import MegatronBertModel, MegatronBertConfig + + >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration + >>> configuration = MegatronBertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = MegatronBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "megatron-bert" + + def __init__( + self, + vocab_size=29056, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + gradient_checkpointing=False, + position_embedding_type="absolute", + use_cache=True, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.gradient_checkpointing = gradient_checkpointing + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + + # @property + # def num_attention_heads(self) -> int: + # return self.num_attention_heads + + # @property + # def hidden_size(self) -> int: + # return self.hidden_size diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py new file mode 100644 index 00000000000000..143567885a51e2 --- /dev/null +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -0,0 +1,286 @@ +#################################################################################################### + +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
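+
+# This script converts a Megatron-LM BERT checkpoint (the ZIP file downloaded from NGC) into a
+# `config.json` and a `checkpoint.pt` written to the directory containing the ZIP file, which the
+# MegatronBert* classes can then load. Example invocation (file name as used in the docs):
+#
+#   python3 convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip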
+ +#################################################################################################### + +import argparse +import json +import os +import re +import torch +import zipfile + +#################################################################################################### + +def recursive_print(name, val, spaces=0): + # Format the message. + if name is None: + msg = None + else: + fmt = '.' * max(0, spaces-2) + '# {:' + str(50-spaces) + 's}' + msg = fmt.format(name) + + # Print and recurse (if needed). + if isinstance(val, dict): + if msg is not None: + print(msg) + for k in val.keys(): + recursive_print(k, val[k], spaces+2) + elif isinstance(val, torch.Tensor): + print(msg, ':', val.size()) + else: + print(msg, ':', val) + +#################################################################################################### + +def convert_megatron_checkpoint(args, input_state_dict): + # The converted output model. + output_state_dict = {} + + # The number of heads. + heads = 16 + # The hidden_size per head. + hidden_size_per_head = 64 + + # The model. + model = input_state_dict['model'] + # The language model. + lm = model['language_model'] + # The embeddings. + embeddings = lm['embedding'] + + # The word embeddings. + word_embeddings = embeddings['word_embeddings']['weight'] + # Trained for 29056 x 1024. + assert word_embeddings.size(0) == 29056 and word_embeddings.size(1) == 1024 + # Store the word embeddings. + output_state_dict['bert.embeddings.word_embeddings.weight'] = word_embeddings + + # The position embeddings. + pos_embeddings = embeddings['position_embeddings']['weight'] + # Trained for 512 x 1024. + assert pos_embeddings.size(0) == 512 and pos_embeddings.size(1) == 1024 + # Store the position embeddings. + output_state_dict['bert.embeddings.position_embeddings.weight'] = pos_embeddings + + # The token-type embeddings. + tokentype_embeddings = embeddings['tokentype_embeddings']['weight'] + # Store the position embeddings. + output_state_dict['bert.embeddings.token_type_embeddings.weight'] = tokentype_embeddings + + # The transformer. + transformer = lm['transformer'] + + # The regex to extract layer names. + layer_re = re.compile('layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)') + + # The simple map of names for "automated" rules. + megatron_to_transformers = { + 'attention.dense' : '.attention.output.dense.', + 'mlp.dense_h_to_4h': '.intermediate.dense.', + 'mlp.dense_4h_to_h': '.output.dense.', + } + + # Keep track of the attention/query/value tensor. + attention_qkv_weight = None + + # Extract the layers. + for key, val in transformer.items(): + # Match the name. + m = layer_re.match(key) + + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + layer_idx = int(m.group(1)) + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + + # The name of the layer. + layer_name = 'bert.encoder.layer.{}'.format(layer_idx) + + # For layernorm(s), simply store the layer norm. + if op_name.endswith('layernorm'): + + ln_name = 'attention.ln' if op_name.startswith('input') else 'ln' + output_state_dict[layer_name + '.' + ln_name + '.' + weight_or_bias] = val + + # Transpose the QKV matrix. + elif op_name == 'attention.query_key_value' and weight_or_bias == 'weight': + + # Make sure the QKV pointer is nil. + assert attention_qkv_weight is None, "" + + # Store the tensor as we need the bias as well to interleave QKV and biases. + attention_qkv_weight = val + + # Transpose the bias. 
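+        # (This branch splits the fused query_key_value weight and bias into separate Q, K and V tensors;
+        #  the slicing below assumes they are stored as three contiguous 1024-row blocks.)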
+ elif op_name == 'attention.query_key_value' and weight_or_bias == 'bias': + + # Make sure we read the weight tensor. + assert attention_qkv_weight is not None, "" + + # Split the QKV matrix into Q, K and V. Megatron stores Q,K,V interleaved. + q = attention_qkv_weight[0*1024:1*1024, :] + k = attention_qkv_weight[1*1024:2*1024, :] + v = attention_qkv_weight[2*1024:3*1024, :] + + # Split the bias. + q_bias = val[0*1024:1*1024] + k_bias = val[1*1024:2*1024] + v_bias = val[2*1024:3*1024] + + # The name of the self attention block. + self_attn_name = layer_name + '.attention.self' + + # Store. + output_state_dict[self_attn_name + '.query.weight'] = q + output_state_dict[self_attn_name + '.query.bias' ] = q_bias + output_state_dict[self_attn_name + '.key.weight' ] = k + output_state_dict[self_attn_name + '.key.bias' ] = k_bias + output_state_dict[self_attn_name + '.value.weight'] = v + output_state_dict[self_attn_name + '.value.bias' ] = v_bias + + # Clear the stored tensor. + attention_qkv_weight = None + + # Copy weights and biases as is. + elif weight_or_bias in ['weight', 'bias']: + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + weight_or_bias] = val + + # The final layernorm. + output_state_dict['bert.encoder.ln.weight'] = transformer['final_layernorm.weight'] + output_state_dict['bert.encoder.ln.bias'] = transformer['final_layernorm.bias'] + + # The config. + output_config = { + 'vocab_size': 29056, + 'hidden_size': 1024, + 'num_hidden_layers': 24, + 'num_attention_heads': 16, + 'hidden_act': 'gelu_new', + 'intermediate_size': 4096, + 'hidden_dropout_prob': 0.1, + 'attention_probs_dropout_prob': 0.1, + 'max_position_embeddings': 512, + 'type_vocab_size': 2, + 'initializer_range': 0.2, + 'layer_norm_eps': 1e-12, + 'gradient_checkpointing': False, + 'position_embedding_type': 'absolute', + 'use_cache': False, + + # 'bos_token_id': 29056, + # 'hidden_dropout_prob': 0.1, + # 'eos_token_id': 50256, + # 'initializer_range': 0.02, + # 'layer_norm_epsilon': 1e-05, + # 'model_type': 'bert', + # 'd_model': 1024, + # 'n_embd': 1024, + # 'encoder_attention_heads': 16, + # 'encoder_layers': 24, + # 'activation_dropout': 0.1, + # 'summary_activation': None, + # 'summary_first_dropout': 0.1, + # 'summary_proj_to_labels': True, + # 'summary_type': 'cls_index', + # 'summary_use_proj': True, + } + + # The pooler. + pooler = lm['pooler'] + + # Store the matrix and the bias. + output_state_dict['bert.pooler.dense.weight'] = pooler['dense.weight'] + output_state_dict['bert.pooler.dense.bias' ] = pooler['dense.bias'] + + # The LM head from Megatron (for RACE). + lm_head = model['lm_head'] + + # The transform matrix. + output_state_dict['cls.predictions.transform.dense.weight'] = lm_head['dense.weight'] + output_state_dict['cls.predictions.transform.dense.bias' ] = lm_head['dense.bias'] + + # The transform LN. + output_state_dict['cls.predictions.transform.LayerNorm.weight'] = lm_head['layernorm.weight'] + output_state_dict['cls.predictions.transform.LayerNorm.bias' ] = lm_head['layernorm.bias'] + + # For the decoder, we replicate the weights. + output_state_dict['cls.predictions.decoder.weight'] = word_embeddings + output_state_dict['cls.predictions.bias'] = lm_head['bias'] + + # The classifier from Megatron (for MLNI). + binary_head = model['binary_head'] + + # Store the classifier. + output_state_dict['cls.seq_relationship.weight'] = binary_head['weight'] + output_state_dict['cls.seq_relationship.bias' ] = binary_head['bias'] + + # It should be done! 
+ return output_state_dict, output_config + +#################################################################################################### + +def main(): + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument('--print-checkpoint-structure', action='store_true') + parser.add_argument("path_to_checkpoint", type=str, + help='Path to the ZIP file containing the checkpoint') + args = parser.parse_args() + + # Extract the basename. + basename = os.path.dirname(args.path_to_checkpoint) + + # Load the model. + print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint)) + with zipfile.ZipFile(args.path_to_checkpoint, 'r') as checkpoint: + with checkpoint.open('release/mp_rank_00/model_optim_rng.pt') as pytorch_dict: + input_state_dict = torch.load(pytorch_dict, map_location='cpu') + + # Convert. + print('Converting') + output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict) + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Store the config to file. + output_config_file = os.path.join(basename, 'config.json') + print('Saving config to "{}"'.format(output_config_file)) + with open(output_config_file, 'w') as f: + json.dump(output_config, f) + + # Store the state_dict to file. + output_checkpoint_file = os.path.join(basename, 'checkpoint.pt') + print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) + torch.save(output_state_dict, output_checkpoint_file) + +#################################################################################################### + +if __name__ == "__main__": + main() + +#################################################################################################### + diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py new file mode 100755 index 00000000000000..fc9c9c3d21d01c --- /dev/null +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -0,0 +1,1812 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch MegatronBERT model. 
""" + + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_megatron_bert import MegatronBertConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MegatronBertConfig" +_TOKENIZER_FOR_DOC = "MegatronBertTokenizer" + + +MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "nvidia/megatron-bert-cased-345m", + # See all MegatronBERT models at https://huggingface.co/models?filter=megatron_bert +] + +def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape 
{array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +class MegatronBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + + # In Megatron, layer-norm is applied after the 1st dropout. + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + # Megatron BERT moves that layer norm after the drop-out (and to each layer). 
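+        # As a consequence, the embedding output is returned after dropout only; the corresponding layer
+        # norms live inside each encoder layer (plus a final one after the last layer) instead of here.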
+ # embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class MegatronBertSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
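+        # Illustrative only: nn.Dropout zeroes each attention probability independently
+        # with probability `attention_probs_dropout_prob` and rescales the survivors by
+        # 1 / (1 - p), so e.g. with p = 0.1 a (partial) row [0.2, 0.3, 0.5] may come out
+        # as roughly [0.22, 0.33, 0.00] for that training forward pass.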
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class MegatronBertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, residual): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return residual + hidden_states + + +class MegatronBertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.self = MegatronBertSelfAttention(config) + self.output = MegatronBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + ln_outputs = self.ln(hidden_states) + self_outputs = self.self( + ln_outputs, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class MegatronBertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class MegatronBertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return input_tensor + hidden_states + + +class MegatronBertLayer(nn.Module): + def 
__init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = MegatronBertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = MegatronBertAttention(config) + self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.intermediate = MegatronBertIntermediate(config) + self.output = MegatronBertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + ln_output = self.ln(attention_output) + intermediate_output = self.intermediate(ln_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class MegatronBertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([MegatronBertLayer(config) for _ in range(config.num_hidden_layers)]) + + # The final layer norm. We removed the 1st LN, moved LN to each hidden layer and this one + # is simply the final LN (Transformer's BERT has it attached to each hidden layer). 
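+        # Rough schematic of one encoder pass under this layout (comment only; the names
+        # below are illustrative):
+        #
+        #   h = embeddings(x)        # not layer-normalized (see MegatronBertEmbeddings)
+        #   for layer in layers:     # each MegatronBertLayer normalizes its own inputs
+        #       h = layer(h)
+        #   h = final_ln(h)          # this module, applied once at the end of forward()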
+ self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + # Because we moved the layer-norm at the end of the hidden layer, we have non-normali- + # zed data here. If that's really needed, we must apply LN to match Transformer's BERT. + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Finalize the hidden states. + hidden_states = self.ln(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class MegatronBertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
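+        # e.g. for hidden_states of shape (batch_size, seq_len, hidden_size),
+        # hidden_states[:, 0] selects the first ([CLS]) position and has shape
+        # (batch_size, hidden_size).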
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class MegatronBertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class MegatronBertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = MegatronBertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class MegatronBertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = MegatronBertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class MegatronBertOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class MegatronBertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = MegatronBertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class MegatronBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = MegatronBertConfig + load_tf_weights = load_tf_weights_in_megatron_bert + base_model_prefix = "megatron-bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +@dataclass +class MegatronBertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.MegatronBertForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +MEGATRON_BERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.MegatronBertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. 
+""" + +MEGATRON_BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.MegatronBertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MegatronBert Model transformer outputting raw hidden-states without any specific head on top.", + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertModel(MegatronBertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. 
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = MegatronBertEmbeddings(config) + self.encoder = MegatronBertEncoder(config) + + self.pooler = MegatronBertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="megatron-bert-uncased-345m", + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
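+
+        Example (illustrative; the ``megatron-bert-uncased-345m`` identifier assumes a checkpoint converted
+        locally as described in the documentation)::
+
+            >>> from transformers import MegatronBertTokenizer, MegatronBertModel
+            >>> import torch
+
+            >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-uncased-345m')
+            >>> model = MegatronBertModel.from_pretrained('megatron-bert-uncased-345m')
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> last_hidden_states = outputs.last_hidden_state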
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + 
last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForPreTraining(MegatronBertPreTrainedModel): + def __init__(self, config, add_binary_head=True): + super().__init__(config) + + self.bert = MegatronBertModel(config) + self.cls = MegatronBertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MegatronBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. 
+ + Returns: + + Example:: + + >>> from transformers import MegatronBertTokenizer, MegatronBertForPreTraining + >>> import torch + + >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-uncased-345m') + >>> model = MegatronBertForPreTraining.from_pretrained('megatron-bert-uncased-345m') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return MegatronBertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """MegatronBert Model with a `language modeling` head on top for CLM fine-tuning. """, MEGATRON_BERT_START_DOCSTRING +) +class MegatronBertLMHeadModel(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `MegatronBertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.cls = MegatronBertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. 
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + + Returns: + + Example:: + + >>> from transformers import MegatronBertTokenizer, MegatronBertLMHeadModel, MegatronBertConfig + >>> import torch + + >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-cased-345m') + >>> config = MegatronBertConfig.from_pretrained("megatron-bert-cased-345m") + >>> config.is_decoder = True + >>> model = MegatronBertLMHeadModel.from_pretrained('megatron-bert-cased-345m', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + 
hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""MegatronBert Model with a `language modeling` head on top. """, MEGATRON_BERT_START_DOCSTRING) +class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `MegatronBertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.cls = MegatronBertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="megatron-bert-uncased-345m", + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """MegatronBert Model with a `next sentence prediction (classification)` head on top. """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"predictions"] + + def __init__(self, config): + super().__init__(config) + + self.bert = MegatronBertModel(config) + self.cls = MegatronBertOnlyNSPHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ + Returns: + + Example:: + + >>> from transformers import MegatronBertTokenizer, MegatronBertForNextSentencePrediction + >>> import torch + + >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-uncased-345m') + >>> model = MegatronBertForNextSentencePrediction.from_pretrained('megatron-bert-uncased-345m') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = MegatronBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="megatron-bert-uncased-345m", + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
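+
+        Example (illustrative; the ``megatron-bert-uncased-345m`` identifier assumes a checkpoint converted
+        locally as described in the documentation)::
+
+            >>> from transformers import MegatronBertTokenizer, MegatronBertForSequenceClassification
+            >>> import torch
+
+            >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-uncased-345m')
+            >>> model = MegatronBertForSequenceClassification.from_pretrained('megatron-bert-uncased-345m')
+
+            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+            >>> labels = torch.tensor([1]).unsqueeze(0)  # batch size 1
+            >>> outputs = model(**inputs, labels=labels)
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits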
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + +@add_start_docstrings( + """ + MegatronBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = MegatronBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="megatron-bert-uncased-345m", + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForTokenClassification(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="megatron-bert-uncased-345m", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + MegatronBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + MEGATRON_BERT_START_DOCSTRING, +) +class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="megatron-bert-uncased-345m", + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
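+
+        Example (illustrative; the ``megatron-bert-uncased-345m`` identifier assumes a checkpoint converted
+        locally as described in the documentation)::
+
+            >>> from transformers import MegatronBertTokenizer, MegatronBertForQuestionAnswering
+            >>> import torch
+
+            >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-uncased-345m')
+            >>> model = MegatronBertForQuestionAnswering.from_pretrained('megatron-bert-uncased-345m')
+
+            >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+            >>> inputs = tokenizer(question, text, return_tensors="pt")
+            >>> start_positions = torch.tensor([1])
+            >>> end_positions = torch.tensor([3])
+
+            >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
+            >>> loss = outputs.loss
+            >>> start_logits = outputs.start_logits
+            >>> end_logits = outputs.end_logits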
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py new file mode 100644 index 00000000000000..7ab681817a37f8 --- /dev/null +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -0,0 +1,232 @@ +#################################################################################################### + +# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#################################################################################################### + +import argparse +import json +import os +import re +import torch +import zipfile + +#################################################################################################### + +def recursive_print(name, val, spaces=0): + # Format the message. + if name is None: + msg = None + else: + fmt = '.' * max(0, spaces-2) + '# {:' + str(50-spaces) + 's}' + msg = fmt.format(name) + + # Print and recurse (if needed). 
+ if isinstance(val, dict): + if msg is not None: + print(msg) + for k in val.keys(): + recursive_print(k, val[k], spaces+2) + elif isinstance(val, torch.Tensor): + print(msg, ':', val.size()) + else: + print(msg, ':', val) + +#################################################################################################### + +def convert_megatron_checkpoint(args, input_state_dict): + # The converted output model. + output_state_dict = {} + + # The number of heads. + heads = 16 + # The hidden_size per head. + hidden_size_per_head = 64 + + # The model. + model = input_state_dict['model'] + # The language model. + lm = model['language_model'] + # The embeddings. + embeddings = lm['embedding'] + + # The word embeddings. + word_embeddings = embeddings['word_embeddings']['weight'] + # Truncate the embedding table to 50257 rows. + word_embeddings = word_embeddings[:50257,:] + # Truncate the embedding table to 50257 rows. + output_state_dict['transformer.wte.weight'] = word_embeddings + + # The position embeddings. + pos_embeddings = embeddings['position_embeddings']['weight'] + # Read the hidden dimension. + hidden_size = pos_embeddings.size(0) + # DEBUG. + assert hidden_size == heads * hidden_size_per_head + # Store the position embeddings. + output_state_dict['transformer.wpe.weight'] = pos_embeddings + + # The transformer. + transformer = lm['transformer'] + + # The regex to extract layer names. + layer_re = re.compile('layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)') + + # The simple map of names for "automated" rules. + megatron_to_transformers = { + 'attention.dense' : '.attn.c_proj.', + 'mlp.dense_h_to_4h': '.mlp.c_fc.', + 'mlp.dense_4h_to_h': '.mlp.c_proj.', + } + + # Extract the layers. + for key, val in transformer.items(): + # Match the name. + m = layer_re.match(key) + + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + layer_idx = int(m.group(1)) + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + + # The name of the layer. + layer_name = 'transformer.h.{}'.format(layer_idx) + + # For layernorm(s), simply store the layer norm. + if op_name.endswith('layernorm'): + + ln_name = 'ln_1' if op_name.startswith('input') else 'ln_2' + output_state_dict[layer_name + '.' + ln_name + '.' + weight_or_bias] = val + + # Transpose the QKV matrix. + elif op_name == 'attention.query_key_value' and weight_or_bias == 'weight': + + # Insert a tensor of 1x1xDxD bias. + zeros = torch.ones(1, 1, hidden_size, hidden_size) + output_state_dict[layer_name + '.attn.bias'] = zeros + + # Insert a "dummy" tensor for masked_bias. + masked_bias = torch.tensor(-1e4) + output_state_dict[layer_name + '.attn.masked_bias'] = masked_bias + + # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. + out_val = val.transpose(0, 1) + # Store. + output_state_dict[layer_name + '.attn.c_attn.weight'] = out_val + + # Transpose the bias. + elif op_name == 'attention.query_key_value' and weight_or_bias == 'bias': + + # Store. No change of shape. + output_state_dict[layer_name + '.attn.c_attn.bias'] = val + + # Transpose the weights. + elif weight_or_bias == 'weight': + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + 'weight'] = val.transpose(0, 1) + + # Copy the bias. + elif weight_or_bias == 'bias': + + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + 'bias'] = val + + # The final layernorm. 
+ output_state_dict['transformer.ln_f.weight'] = transformer['final_layernorm.weight'] + output_state_dict['transformer.ln_f.bias'] = transformer['final_layernorm.bias'] + + # For LM head, transformers' wants the matrix to weight embeddings. + output_state_dict['lm_head.weight'] = word_embeddings + + # The config. + output_config = { + 'activation_function': 'gelu_new', + 'architectures': [ 'GPT2LMHeadModel' ], + 'attn_pdrop': 0.1, + 'bos_token_id': 50256, + 'embd_pdrop': 0.1, + 'eos_token_id': 50256, + 'initializer_range': 0.02, + 'layer_norm_epsilon': 1e-05, + 'model_type': 'gpt2', + 'n_ctx': 1024, + 'n_embd': 1024, + 'n_head': 16, + 'n_layer': 24, + 'n_positions': 1024, + 'resid_pdrop': 0.1, + 'summary_activation': None, + 'summary_first_dropout': 0.1, + 'summary_proj_to_labels': True, + 'summary_type': 'cls_index', + 'summary_use_proj': True, + 'vocab_size': 50257 + } + + # It should be done! + return output_state_dict, output_config + +#################################################################################################### + +def main(): + # Create the argument parser. + parser = argparse.ArgumentParser() + parser.add_argument('--print-checkpoint-structure', action='store_true') + parser.add_argument('path_to_checkpoint', type=str, + help='Path to the ZIP file containing the checkpoint') + args = parser.parse_args() + + # Extract the basename. + basename = os.path.dirname(args.path_to_checkpoint) + + # Load the model. + print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint)) + with zipfile.ZipFile(args.path_to_checkpoint, 'r') as checkpoint: + with checkpoint.open('release/mp_rank_00/model_optim_rng.pt') as pytorch_dict: + input_state_dict = torch.load(pytorch_dict, map_location='cpu') + + # Convert. + print('Converting') + output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict) + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Store the config to file. + output_config_file = os.path.join(basename, 'config.json') + print('Saving config to "{}"'.format(output_config_file)) + with open(output_config_file, 'w') as f: + json.dump(output_config, f) + + # Store the state_dict to file. + output_checkpoint_file = os.path.join(basename, 'checkpoint.pt') + print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) + torch.save(output_state_dict, output_checkpoint_file) + +#################################################################################################### + +if __name__ == "__main__": + main() + +#################################################################################################### + diff --git a/utils/check_repo.py b/utils/check_repo.py index b64f5ae2c761b8..5d0b41eacac0ca 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -45,6 +45,9 @@ "BlenderbotDecoderWrapper", # Building part of bigger (tested) model. "MBartEncoder", # Building part of bigger (tested) model. "MBartDecoderWrapper", # Building part of bigger (tested) model. + "MegatronBertEncoder", # Building part of bigger (tested) model. + "MegatronBertDecoder", # Building part of bigger (tested) model. + "MegatronBertDecoderWrapper", # Building part of bigger (tested) model. "PegasusEncoder", # Building part of bigger (tested) model. "PegasusDecoderWrapper", # Building part of bigger (tested) model. "DPREncoder", # Building part of bigger (tested) model. 
@@ -105,6 +108,9 @@ "MBartEncoder", "MBartDecoder", "MBartDecoderWrapper", + "MegatronBertEncoder", + "MegatronBertDecoder", + "MegatronBertDecoderWrapper", "OpenAIGPTDoubleHeadsModel", "PegasusEncoder", "PegasusDecoder", From f943ed0614803a47cc368919782302355330ccfd Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 25 Mar 2021 13:35:32 -0700 Subject: [PATCH 02/42] Add support for NVIDIA Megatron GPT2 and BERT Add the megatron_gpt2 model. That model reuses the existing GPT2 model. This commit includes a script to convert a Megatron-GPT2 checkpoint downloaded from NVIDIA GPU Cloud. See examples/megatron-models/README.md for details. Add the megatron_bert model. That model is implemented as a modification of the existing BERT model in Transformers. This commit includes a script to convert a Megatron-BERT checkpoint downloaded from NVIDIA GPU Cloud. See examples/megatron-models/README.md for details. --- docs/source/index.rst | 2 + docs/source/model_doc/megatron_bert.rst | 116 +++--- docs/source/model_doc/megatron_gpt2.rst | 74 ++-- examples/megatron-models/README.md | 45 ++- examples/megatron-models/run_bert.py | 50 +-- examples/megatron-models/run_gpt2.py | 39 +- src/transformers/__init__.py | 15 +- src/transformers/models/auto/modeling_auto.py | 6 +- .../models/megatron_bert/__init__.py | 34 +- .../configuration_megatron_bert.py | 18 +- .../convert_megatron_bert_checkpoint.py | 192 ++++----- .../megatron_bert/modeling_megatron_bert.py | 35 +- .../convert_megatron_gpt2_checkpoint.py | 142 +++---- src/transformers/utils/dummy_pt_objects.py | 72 ++++ .../utils/modeling_auto_mapping.py | 1 + tests/test_modeling_megatron_bert.py | 370 ++++++++++++++++++ utils/check_repo.py | 4 +- 17 files changed, 828 insertions(+), 387 deletions(-) create mode 100644 tests/test_modeling_megatron_bert.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 16164a761ae4c6..e78af6ae294c9a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -304,6 +304,8 @@ TensorFlow and/or Flax. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Marian | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst index 600c357dd8d580..a306547d4fc6ee 100644 --- a/docs/source/model_doc/megatron_bert.rst +++ b/docs/source/model_doc/megatron_bert.rst @@ -18,72 +18,62 @@ MegatronBERT Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion -Parameter Language Models Using Model Parallelism -`__ by Mohammad Shoeybi, Mostofa Patwary, -Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model +Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, +Jared Casper and Bryan Catanzaro. 
The abstract from the paper is the following: -*Recent work in language modeling demonstrates that training large transformer -models advances the state of the art in Natural Language Processing -applications. However, very large models can be quite difficult to train due to -memory constraints. In this work, we present our techniques for training very -large transformer models and implement a simple, efficient intra-layer model -parallel approach that enables training transformer models with billions of -parameters. Our approach does not require a new compiler or library changes, is -orthogonal and complimentary to pipeline model parallelism, and can be fully -implemented with the insertion of a few communication operations in native -PyTorch. We illustrate this approach by converging transformer based models up -to 8.3 billion parameters using 512 GPUs. We sustain 15.1 PetaFLOPs across the -entire application with 76% scaling efficiency when compared to a strong single -GPU baseline that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To -demonstrate that large language models can further advance the state of the art -(SOTA), we train an 8.3 billion parameter transformer language model similar to -GPT-2 and a 3.9 billion parameter model similar to BERT. We show that careful -attention to the placement of layer normalization in BERT-like models is -critical to achieving increased performance as the model size grows. Using the -GPT-2 model we achieve SOTA results on the WikiText103 (10.8 compared to SOTA -perplexity of 15.8) and LAMBADA (66.5% compared to SOTA accuracy of 63.2%) -datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% -compared to SOTA accuracy of 89.4%).* +*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in +Natural Language Processing applications. However, very large models can be quite difficult to train due to memory +constraints. In this work, we present our techniques for training very large transformer models and implement a simple, +efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our +approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model +parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We +illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain +15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline +that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance +the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9 +billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in +BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we +achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA +accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy +of 89.4%).* Tips: -We have provided pretrained `BERT-345M -`__ checkpoints -for use to evaluate or finetuning downstream tasks. 
+We have provided pretrained `BERT-345M `__ checkpoints +for use to evaluate or finetuning downstream tasks. -To access these checkpoints, first `sign up `__ -for and setup the NVIDIA GPU Cloud (NGC) Registry CLI. Further documentation -for downloading models can be found in the `NGC documentation +To access these checkpoints, first `sign up `__ for and setup the NVIDIA GPU Cloud (NGC) +Registry CLI. Further documentation for downloading models can be found in the `NGC documentation `__. Alternatively, you can directly download the checkpoints using: BERT-345M-uncased:: - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O megatron_bert_345m_v0_1_uncased.zip + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip + -O megatron_bert_345m_v0_1_uncased.zip BERT-345M-cased:: - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O megatron_bert_345m_v0_1_cased.zip + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O + megatron_bert_345m_v0_1_cased.zip -Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to -convert them to a format that will easily be loaded by Hugging Face -Transformers and our port of the BERT code. +Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will +easily be loaded by Hugging Face Transformers and our port of the BERT code. -The following commands allow you to do the conversion. We assume that the -folder ``models/megatron_bert`` contains ``megatron_bert_345m_v0_1_{cased, -uncased}.zip`` and that the commands are run from inside that folder:: +The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains +``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder:: - python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip - python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py + megatron_bert_345m_v0_1_uncased.zip python3 + $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip -The original code can be found `here -`__. That repository -contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, -it contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. +The original code can be found `here `__. That repository contains a multi-GPU +and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel +approach using "tensor parallel" and "pipeline parallel" techniques. MegatronBertConfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -99,10 +89,24 @@ MegatronBertModel :members: forward -MegatronBertForConditionalGeneration +MegatronBertForMaskedLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MegatronBertForConditionalGeneration +.. 
autoclass:: transformers.MegatronBertForMaskedLM + :members: forward + + +MegatronBertForNextSentencePrediction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForNextSentencePrediction + :members: forward + + +MegatronBertForPreTraining +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForPreTraining :members: forward @@ -113,17 +117,25 @@ MegatronBertForSequenceClassification :members: forward -MegatronBertForQuestionAnswering +MegatronBertForMultipleChoice ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MegatronBertForQuestionAnswering +.. autoclass:: transformers.MegatronBertForMultipleChoice + :members: forward + + +MegatronBertForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForTokenClassification :members: forward -MegatronBertForCausalLM +MegatronBertForQuestionAnswering ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.MegatronBertForCausalLM +.. autoclass:: transformers.MegatronBertForQuestionAnswering :members: forward + diff --git a/docs/source/model_doc/megatron_gpt2.rst b/docs/source/model_doc/megatron_gpt2.rst index 9f41fb9c73d2ea..23e4b242507efb 100644 --- a/docs/source/model_doc/megatron_gpt2.rst +++ b/docs/source/model_doc/megatron_gpt2.rst @@ -18,64 +18,52 @@ MegatronGPT2 Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion -Parameter Language Models Using Model Parallelism -`__ by Mohammad Shoeybi, Mostofa Patwary, -Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model +Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, +Jared Casper and Bryan Catanzaro. The abstract from the paper is the following: -*Recent work in language modeling demonstrates that training large transformer -models advances the state of the art in Natural Language Processing -applications. However, very large models can be quite difficult to train due to -memory constraints. In this work, we present our techniques for training very -large transformer models and implement a simple, efficient intra-layer model -parallel approach that enables training transformer models with billions of -parameters. Our approach does not require a new compiler or library changes, is -orthogonal and complimentary to pipeline model parallelism, and can be fully -implemented with the insertion of a few communication operations in native -PyTorch. We illustrate this approach by converging transformer based models up -to 8.3 billion parameters using 512 GPUs. We sustain 15.1 PetaFLOPs across the -entire application with 76% scaling efficiency when compared to a strong single -GPU baseline that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. 
To -demonstrate that large language models can further advance the state of the art -(SOTA), we train an 8.3 billion parameter transformer language model similar to -GPT-2 and a 3.9 billion parameter model similar to BERT. We show that careful -attention to the placement of layer normalization in BERT-like models is -critical to achieving increased performance as the model size grows. Using the -GPT-2 model we achieve SOTA results on the WikiText103 (10.8 compared to SOTA -perplexity of 15.8) and LAMBADA (66.5% compared to SOTA accuracy of 63.2%) -datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% -compared to SOTA accuracy of 89.4%).* +*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in +Natural Language Processing applications. However, very large models can be quite difficult to train due to memory +constraints. In this work, we present our techniques for training very large transformer models and implement a simple, +efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our +approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model +parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We +illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain +15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline +that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance +the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9 +billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in +BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we +achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA +accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy +of 89.4%).* Tips: -We have provided pretrained `GPT2-345M -`__ checkpoints -for use to evaluate or finetuning downstream tasks. +We have provided pretrained `GPT2-345M `__ checkpoints +for use to evaluate or finetuning downstream tasks. -To access these checkpoints, first `sign up `__ -for and setup the NVIDIA GPU Cloud (NGC) Registry CLI. Further documentation -for downloading models can be found in the `NGC documentation +To access these checkpoints, first `sign up `__ for and setup the NVIDIA GPU Cloud (NGC) +Registry CLI. Further documentation for downloading models can be found in the `NGC documentation `__. Alternatively, you can directly download the checkpoints using:: - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_gpt2_345m_v0_0.zip + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O + megatron_gpt2_345m_v0_0.zip -Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to -convert it to a format that will easily be loaded by Hugging Face Transformers -GPT2 implementation. 
+Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily +be loaded by Hugging Face Transformers GPT2 implementation. -The following command allows you to do the conversion. We assume that the -folder ``models/megatron_gpt2`` contains ``megatron_gpt2_345m_v0_0.zip`` and -that the command is run from that folder:: +The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains +``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder:: python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip -The original code can be found `here -`__. That repository -contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, -it contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. +The original code can be found `here `__. That repository contains a multi-GPU +and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel +approach using "tensor parallel" and "pipeline parallel" techniques. diff --git a/examples/megatron-models/README.md b/examples/megatron-models/README.md index ad16d34fa94eb3..4a774aedc4e598 100644 --- a/examples/megatron-models/README.md +++ b/examples/megatron-models/README.md @@ -22,37 +22,36 @@ ## Get the checkpoints from the NVIDIA GPU Cloud -The first step is to create a directory in the current folder (`examples/megatron-lm`) to store the -checkpoints. +The first step is to create a directory called `models` from the `examples/megatron-models` folder. ``` -mkdir -p models/{bert, gpt2} +mkdir models ``` -Then, you can download the checkpoints from the NVIDIA GPU Cloud (NGC). For that you have to -[sign up](https://ngc.nvidia.com/signup) for and setup the NVIDIA GPU Cloud (NGC) Registry CLI. -Further documentation for downloading models can be found in the -[NGC documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1). - +You can download the checkpoints from the NVIDIA GPU Cloud (NGC). For that you +have to [sign up](https://ngc.nvidia.com/signup) for and setup the NVIDIA GPU +Cloud (NGC) Registry CLI. Further documentation for downloading models can be +found in the [NGC +documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1). 
Alternatively, you can directly download the checkpoints using: ### BERT 345M cased ``` -wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O models/bert/megatron_bert_345m_v0_1_cased.zip +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O models/megatron_bert_345m_v0_1_cased.zip ``` ### BERT 345M uncased ``` -wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O models/bert/megatron_bert_345m_v0_1_uncased.zip +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O models/megatron_bert_345m_v0_1_uncased.zip ``` ### GPT2 345M ``` -wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O models/gpt2/megatron_gpt2_345m_v0_0.zip +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O models/megatron_gpt2_345m_v0_0.zip ``` ## Converting the checkpoints @@ -60,22 +59,30 @@ wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_ In order to be loaded into `Transformers`, the checkpoints have to be converted. You should run the following commands for that purpose. +For the conversion, we use scripts stored in +`src/transformers/models/megatron_bert` and +`src/transformers/models/megatron_gpt2`. We define the relative path as: + +``` +export PATH_TO_TRANSFORMERS=../../src/transformers +``` + ### BERT 345M cased ``` -python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/bert/megatron_bert_345m_v0_1_cased.zip +python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/megatron_bert_345m_v0_1_cased.zip ``` ### BERT 345M uncased ``` -python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/bert/megatron_bert_345m_v0_1_uncased.zip +python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/megatron_bert_345m_v0_1_uncased.zip ``` ### GPT2 345M ``` -python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py models/gpt2/megatron_gpt2_345m_v0_0.zip +python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py models/megatron_gpt2_345m_v0_0.zip ``` ## Running the samples @@ -87,15 +94,15 @@ the Transformers API. The first task is `MegatronBERTForMaskedLM` and the second ### Masked LM ``` -python3 ./run_bert.py --masked-lm ./models/bert/megatron_bert_345m_v0_1_cased -python3 ./run_bert.py --masked-lm ./models/bert/megatron_bert_345m_v0_1_uncased +python3 ./run_bert.py --masked-lm ./models/megatron_bert_345m_v0_1_cased +python3 ./run_bert.py --masked-lm ./models/megatron_bert_345m_v0_1_uncased ``` ### Next sentence prediction ``` -python3 ./run_bert.py ./models/bert/megatron_bert_345m_v0_1_cased -python3 ./run_bert.py ./models/bert/megatron_bert_345m_v0_1_uncased +python3 ./run_bert.py ./models/megatron_bert_345m_v0_1_cased +python3 ./run_bert.py ./models/megatron_bert_345m_v0_1_uncased ``` ### Text generation @@ -103,6 +110,6 @@ python3 ./run_bert.py ./models/bert/megatron_bert_345m_v0_1_uncased For GPT2, we created a simple for text generation. 
``` -python3 ./run_gpt2.py models/gpt2/megatron_gpt2_345m_v0_0 +python3 ./run_gpt2.py models/megatron_gpt2_345m_v0_0 ``` diff --git a/examples/megatron-models/run_bert.py b/examples/megatron-models/run_bert.py index 6d04aa6bbe732b..93bcf1eba773e5 100644 --- a/examples/megatron-models/run_bert.py +++ b/examples/megatron-models/run_bert.py @@ -17,43 +17,45 @@ #################################################################################################### import argparse -import os + import torch -from transformers import BertTokenizer -from transformers import MegatronBertConfig -from transformers import MegatronBertForMaskedLM, MegatronBertForNextSentencePrediction + +from transformers import ( + BertTokenizer, + MegatronBertConfig, + MegatronBertForMaskedLM, + MegatronBertForNextSentencePrediction, +) + #################################################################################################### + def main(): # Create the argument parser. parser = argparse.ArgumentParser() - parser.add_argument('--masked-lm', action='store_true') - parser.add_argument('checkpoint', type=str, - help="""Path to the folder containing the config.json and checkpoint.pt extracted from the - NGC checkpoint using the convert_megatron_bert_checkpoint.py script.""") + parser.add_argument("--masked-lm", action="store_true") + parser.add_argument("checkpoint", type=str, help="See examples in README.md.") args = parser.parse_args() # Do we use the cased/uncased model. - is_uncased = 'uncased' in args.checkpoint + is_uncased = "uncased" in args.checkpoint # The base model. - bert = 'bert-base-' + ('uncased' if is_uncased else 'cased') + bert = "bert-base-" + ("uncased" if is_uncased else "cased") # The tokenizer. Megatron was trained with standard tokenizer(s). tokenizer = BertTokenizer.from_pretrained(bert) # The config file. - config_file = os.path.join(args.checkpoint, 'config.json') + config_file = args.checkpoint + "_config.json" # Load the config. config = MegatronBertConfig.from_pretrained(config_file) - # Make sure we do not try to tie embeddings. - config.tie_word_embeddings = False # The model class. model_cls = MegatronBertForMaskedLM if args.masked_lm else MegatronBertForNextSentencePrediction # The checkpoint file. - checkpoint_file = os.path.join(args.checkpoint, 'checkpoint.pt') + checkpoint_file = args.checkpoint + "_checkpoint.pt" # Load the model from transformers. model = model_cls.from_pretrained(checkpoint_file, config=config) @@ -62,7 +64,7 @@ def main(): # Copy to the device and use FP16. assert torch.cuda.is_available() - device = torch.device('cuda') + device = torch.device("cuda") model.to(device) model.half() @@ -70,15 +72,15 @@ def main(): # Create a dummy sentence (from the BERT example page). if args.masked_lm: - input = tokenizer('the capital of france is [MASK]', return_tensors='pt') + input = tokenizer("The capital of France is [MASK]", return_tensors="pt") input = input.to(device) - label = tokenizer('the capital of france is paris', return_tensors='pt')['input_ids'] + label = tokenizer("The capital of France is Paris", return_tensors="pt")["input_ids"] label = label.to(device) output = model(**input, labels=label) else: - prompt = 'In Italy, pizza served in formal settings is presented unsliced.' - next_sentence = 'The sky is blue due to the shorter wavelength of blue light.' - input = tokenizer(prompt, next_sentence, return_tensors='pt') + prompt = "In Italy, pizza served in formal settings is presented unsliced." 
+ next_sentence = "The sky is blue due to the shorter wavelength of blue light." + input = tokenizer(prompt, next_sentence, return_tensors="pt") input = input.to(device) label = torch.LongTensor([1]) label = label.to(device) @@ -87,13 +89,13 @@ def main(): output = model(**input, labels=label) # Outputs. - print('loss: ', output.loss) - print('logits: ', output.logits) + print("loss: ", output.loss) + print("logits: ", output.logits) + #################################################################################################### -if __name__ == '__main__': +if __name__ == "__main__": main() #################################################################################################### - diff --git a/examples/megatron-models/run_gpt2.py b/examples/megatron-models/run_gpt2.py index 6f3ce7dddcd829..816bd1fe64b111 100644 --- a/examples/megatron-models/run_gpt2.py +++ b/examples/megatron-models/run_gpt2.py @@ -17,29 +17,32 @@ #################################################################################################### import argparse -import os + import torch -from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel + +from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer + #################################################################################################### + def main(): # Create the argument parser. parser = argparse.ArgumentParser() - parser.add_argument('checkpoint', type=str) + parser.add_argument("checkpoint", type=str, help="See examples in README.md.") args = parser.parse_args() # The tokenizer. Megatron was trained with standard tokenizer(s). - tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") # The config file. - config_file = os.path.join(args.checkpoint, 'config.json') + config_file = args.checkpoint + "_config.json" # Load the GPT2 config. config = GPT2Config.from_pretrained(config_file) # The checkpoint file. - checkpoint_file = os.path.join(args.checkpoint, 'checkpoint.pt') + checkpoint_file = args.checkpoint + "_checkpoint.pt" # Load GPT2 model from transformers. model = GPT2LMHeadModel.from_pretrained(checkpoint_file, config=config) @@ -48,12 +51,12 @@ def main(): # Copy to the device and use FP16. assert torch.cuda.is_available() - device = torch.device('cuda') + device = torch.device("cuda") model.to(device) model.half() # Create an empty sentence. - input = tokenizer.encode('', return_tensors='pt') + input = tokenizer.encode("", return_tensors="pt") input = input.to(device) # The token ids. @@ -63,13 +66,15 @@ def main(): input_ids = input # Generate the sentence. - output = model.generate(input_ids=input_ids, - max_length=128, - temperature=1.0, - top_k=0, - top_p=0.9, - do_sample=True, - num_return_sequences=1) + output = model.generate( + input_ids=input_ids, + max_length=128, + temperature=1.0, + top_k=0, + top_p=0.9, + do_sample=True, + num_return_sequences=1, + ) # Output the text. 
for sentence in output: @@ -77,10 +82,10 @@ def main(): text = tokenizer.decode(sentence, clean_up_tokenization_spaces=True) print(text) + #################################################################################################### -if __name__ == '__main__': +if __name__ == "__main__": main() #################################################################################################### - diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e9613489476b40..39b6bff008594c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -740,10 +740,13 @@ _import_structure["models.megatron_bert"].extend( [ "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", - "MegatronBertForCausalLM", - "MegatronBertForConditionalGeneration", + "MegatronBertForMaskedLM", + "MegatronBertForMultipleChoice", + "MegatronBertForNextSentencePrediction", + "MegatronBertForPreTraining", "MegatronBertForQuestionAnswering", "MegatronBertForSequenceClassification", + "MegatronBertForTokenClassification", "MegatronBertModel", ] ) @@ -1944,15 +1947,15 @@ ) from .models.megatron_bert import ( MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, - MegatronBertLayer, - MegatronBertModel, MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, MegatronBertForSequenceClassification, - MegatronBertForMultipleChoice, MegatronBertForTokenClassification, - MegatronBertForQuestionAnswering, + MegatronBertLayer, + MegatronBertModel, ) from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings from .models.mobilebert import ( diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 101399ab6584ce..3b620c7478a6ad 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -177,14 +177,14 @@ MBartModel, ) from ..megatron_bert.modeling_megatron_bert import ( - MegatronBertModel, MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, MegatronBertForSequenceClassification, - MegatronBertForMultipleChoice, MegatronBertForTokenClassification, - MegatronBertForQuestionAnswering, + MegatronBertModel, ) from ..mobilebert.modeling_mobilebert import ( MobileBertForMaskedLM, diff --git a/src/transformers/models/megatron_bert/__init__.py b/src/transformers/models/megatron_bert/__init__.py index 5458505249f1c5..5b9eccaff9e6f5 100644 --- a/src/transformers/models/megatron_bert/__init__.py +++ b/src/transformers/models/megatron_bert/__init__.py @@ -17,7 +17,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
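# (Illustrative note, not part of the original patch.) Once this lazy-import
# module is in place, the MegatronBERT classes listed in `_import_structure`
# below can be imported directly from the top-level package, for example:
#
#     from transformers import MegatronBertConfig, MegatronBertForMaskedLM
#
# The submodules are only imported the first time one of these names is
# actually accessed, which keeps `import transformers` itself cheap.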
from typing import TYPE_CHECKING -from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available, is_tokenizers_available + +from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available + + _import_structure = { "configuration_megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], "tokenization_megatron_bert": ["MegatronBertTokenizer"], @@ -30,23 +33,15 @@ _import_structure["modeling_megatron_bert"] = [ "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", "MegatronBertForMaskedLM", + "MegatronBertForMultipleChoice", "MegatronBertForNextSentencePrediction", - "MegatronBertForConditionalGeneration", + "MegatronBertForPreTraining", "MegatronBertForQuestionAnswering", "MegatronBertForSequenceClassification", - "MegatronBertForCausalLM", + "MegatronBertForTokenClassification", "MegatronBertModel", - "MegatronBertLMHeadModel", - "MegatronBertPreTrainedModel", ] -# if is_tf_available(): -# _import_structure["modeling_tf_megatron_bert"] = [ -# "TFMegatronBertForConditionalGeneration", -# "TFMegatronBertModel", -# "TFMegatronBertPreTrainedModel", -# ] - if TYPE_CHECKING: from .configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig from .tokenization_megatron_bert import MegatronBertTokenizer @@ -58,23 +53,15 @@ from .modeling_megatron_bert import ( MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, - MegatronBertForConditionalGeneration, - MegatronBertForCausalLM, + MegatronBertForPreTraining, MegatronBertForQuestionAnswering, MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, MegatronBertModel, - MegatronBertLMHeadModel, - MegatronBertPreTrainedModel, ) -# if is_tf_available(): -# from .modeling_tf_megatron_bert import ( -# TFMegatronBertForConditionalGeneration, -# TFMegatronBertModel, -# TFMegatronBertPreTrainedModel, -# ) - else: import importlib import os @@ -92,4 +79,3 @@ def _get_module(self, module_name: str): return importlib.import_module("." + module_name, self.__name__) sys.modules[__name__] = _LazyModule(__name__, _import_structure) - diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index fdf92405e19fc9..911e6281485e91 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -18,6 +18,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging + logger = logging.get_logger(__name__) MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { @@ -28,9 +29,10 @@ class MegatronBertConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel` or a - :class:`~transformers.TFMegatronBertModel`. It is used to instantiate a MEGATRON_BERT model according to the specified arguments, - defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration - to that of the MEGATRON_BERT `bert-base-uncased `__ architecture. + :class:`~transformers.TFMegatronBertModel`. It is used to instantiate a MEGATRON_BERT model according to the + specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield a + similar configuration to that of the MEGATRON_BERT `bert-base-uncased `__ + architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. @@ -38,8 +40,8 @@ class MegatronBertConfig(PretrainedConfig): Args: vocab_size (:obj:`int`, `optional`, defaults to 30522): - Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel` or + Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented + by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel` or :class:`~transformers.TFMegatronBertModel`. hidden_size (:obj:`int`, `optional`, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. @@ -60,8 +62,8 @@ class MegatronBertConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.MegatronBertModel` or - :class:`~transformers.TFMegatronBertModel`. + The vocabulary size of the :obj:`token_type_ids` passed when calling + :class:`~transformers.MegatronBertModel` or :class:`~transformers.TFMegatronBertModel`. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): @@ -131,7 +133,7 @@ def __init__( self.gradient_checkpointing = gradient_checkpointing self.position_embedding_type = position_embedding_type self.use_cache = use_cache - + # @property # def num_attention_heads(self) -> int: # return self.num_attention_heads diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 143567885a51e2..30ef01a57c22d1 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -20,17 +20,20 @@ import json import os import re -import torch import zipfile +import torch + + #################################################################################################### + def recursive_print(name, val, spaces=0): # Format the message. if name is None: msg = None else: - fmt = '.' * max(0, spaces-2) + '# {:' + str(50-spaces) + 's}' + fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" msg = fmt.format(name) # Print and recurse (if needed). @@ -38,60 +41,55 @@ def recursive_print(name, val, spaces=0): if msg is not None: print(msg) for k in val.keys(): - recursive_print(k, val[k], spaces+2) + recursive_print(k, val[k], spaces + 2) elif isinstance(val, torch.Tensor): - print(msg, ':', val.size()) + print(msg, ":", val.size()) else: - print(msg, ':', val) + print(msg, ":", val) + #################################################################################################### + def convert_megatron_checkpoint(args, input_state_dict): # The converted output model. 
output_state_dict = {} - # The number of heads. - heads = 16 - # The hidden_size per head. - hidden_size_per_head = 64 - # The model. - model = input_state_dict['model'] + model = input_state_dict["model"] # The language model. - lm = model['language_model'] + lm = model["language_model"] # The embeddings. - embeddings = lm['embedding'] + embeddings = lm["embedding"] # The word embeddings. - word_embeddings = embeddings['word_embeddings']['weight'] - # Trained for 29056 x 1024. - assert word_embeddings.size(0) == 29056 and word_embeddings.size(1) == 1024 + word_embeddings = embeddings["word_embeddings"]["weight"] # Store the word embeddings. - output_state_dict['bert.embeddings.word_embeddings.weight'] = word_embeddings + output_state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings # The position embeddings. - pos_embeddings = embeddings['position_embeddings']['weight'] + pos_embeddings = embeddings["position_embeddings"]["weight"] # Trained for 512 x 1024. assert pos_embeddings.size(0) == 512 and pos_embeddings.size(1) == 1024 # Store the position embeddings. - output_state_dict['bert.embeddings.position_embeddings.weight'] = pos_embeddings + output_state_dict["bert.embeddings.position_embeddings.weight"] = pos_embeddings # The token-type embeddings. - tokentype_embeddings = embeddings['tokentype_embeddings']['weight'] + tokentype_embeddings = embeddings["tokentype_embeddings"]["weight"] # Store the position embeddings. - output_state_dict['bert.embeddings.token_type_embeddings.weight'] = tokentype_embeddings + output_state_dict["bert.embeddings.token_type_embeddings.weight"] = tokentype_embeddings # The transformer. - transformer = lm['transformer'] + transformer = lm["transformer"] # The regex to extract layer names. - layer_re = re.compile('layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)') + layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") # The simple map of names for "automated" rules. megatron_to_transformers = { - 'attention.dense' : '.attention.output.dense.', - 'mlp.dense_h_to_4h': '.intermediate.dense.', - 'mlp.dense_4h_to_h': '.output.dense.', + "attention.dense": ".attention.output.dense.", + "mlp.dense_h_to_4h": ".intermediate.dense.", + "mlp.dense_4h_to_h": ".output.dense.", } # Keep track of the attention/query/value tensor. @@ -114,16 +112,16 @@ def convert_megatron_checkpoint(args, input_state_dict): weight_or_bias = m.group(3) # The name of the layer. - layer_name = 'bert.encoder.layer.{}'.format(layer_idx) + layer_name = "bert.encoder.layer.{}".format(layer_idx) # For layernorm(s), simply store the layer norm. - if op_name.endswith('layernorm'): + if op_name.endswith("layernorm"): - ln_name = 'attention.ln' if op_name.startswith('input') else 'ln' - output_state_dict[layer_name + '.' + ln_name + '.' + weight_or_bias] = val + ln_name = "attention.ln" if op_name.startswith("input") else "ln" + output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val # Transpose the QKV matrix. - elif op_name == 'attention.query_key_value' and weight_or_bias == 'weight': + elif op_name == "attention.query_key_value" and weight_or_bias == "weight": # Make sure the QKV pointer is nil. assert attention_qkv_weight is None, "" @@ -132,134 +130,118 @@ def convert_megatron_checkpoint(args, input_state_dict): attention_qkv_weight = val # Transpose the bias. - elif op_name == 'attention.query_key_value' and weight_or_bias == 'bias': + elif op_name == "attention.query_key_value" and weight_or_bias == "bias": # Make sure we read the weight tensor. 
assert attention_qkv_weight is not None, "" # Split the QKV matrix into Q, K and V. Megatron stores Q,K,V interleaved. - q = attention_qkv_weight[0*1024:1*1024, :] - k = attention_qkv_weight[1*1024:2*1024, :] - v = attention_qkv_weight[2*1024:3*1024, :] + q = attention_qkv_weight[0 * 1024 : 1 * 1024, :] + k = attention_qkv_weight[1 * 1024 : 2 * 1024, :] + v = attention_qkv_weight[2 * 1024 : 3 * 1024, :] # Split the bias. - q_bias = val[0*1024:1*1024] - k_bias = val[1*1024:2*1024] - v_bias = val[2*1024:3*1024] + q_bias = val[0 * 1024 : 1 * 1024] + k_bias = val[1 * 1024 : 2 * 1024] + v_bias = val[2 * 1024 : 3 * 1024] # The name of the self attention block. - self_attn_name = layer_name + '.attention.self' + self_attn_name = layer_name + ".attention.self" # Store. - output_state_dict[self_attn_name + '.query.weight'] = q - output_state_dict[self_attn_name + '.query.bias' ] = q_bias - output_state_dict[self_attn_name + '.key.weight' ] = k - output_state_dict[self_attn_name + '.key.bias' ] = k_bias - output_state_dict[self_attn_name + '.value.weight'] = v - output_state_dict[self_attn_name + '.value.bias' ] = v_bias + output_state_dict[self_attn_name + ".query.weight"] = q + output_state_dict[self_attn_name + ".query.bias"] = q_bias + output_state_dict[self_attn_name + ".key.weight"] = k + output_state_dict[self_attn_name + ".key.bias"] = k_bias + output_state_dict[self_attn_name + ".value.weight"] = v + output_state_dict[self_attn_name + ".value.bias"] = v_bias # Clear the stored tensor. attention_qkv_weight = None # Copy weights and biases as is. - elif weight_or_bias in ['weight', 'bias']: + elif weight_or_bias in ["weight", "bias"]: out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + weight_or_bias] = val # The final layernorm. - output_state_dict['bert.encoder.ln.weight'] = transformer['final_layernorm.weight'] - output_state_dict['bert.encoder.ln.bias'] = transformer['final_layernorm.bias'] + output_state_dict["bert.encoder.ln.weight"] = transformer["final_layernorm.weight"] + output_state_dict["bert.encoder.ln.bias"] = transformer["final_layernorm.bias"] # The config. output_config = { - 'vocab_size': 29056, - 'hidden_size': 1024, - 'num_hidden_layers': 24, - 'num_attention_heads': 16, - 'hidden_act': 'gelu_new', - 'intermediate_size': 4096, - 'hidden_dropout_prob': 0.1, - 'attention_probs_dropout_prob': 0.1, - 'max_position_embeddings': 512, - 'type_vocab_size': 2, - 'initializer_range': 0.2, - 'layer_norm_eps': 1e-12, - 'gradient_checkpointing': False, - 'position_embedding_type': 'absolute', - 'use_cache': False, - - # 'bos_token_id': 29056, - # 'hidden_dropout_prob': 0.1, - # 'eos_token_id': 50256, - # 'initializer_range': 0.02, - # 'layer_norm_epsilon': 1e-05, - # 'model_type': 'bert', - # 'd_model': 1024, - # 'n_embd': 1024, - # 'encoder_attention_heads': 16, - # 'encoder_layers': 24, - # 'activation_dropout': 0.1, - # 'summary_activation': None, - # 'summary_first_dropout': 0.1, - # 'summary_proj_to_labels': True, - # 'summary_type': 'cls_index', - # 'summary_use_proj': True, + "vocab_size": word_embeddings.size(0), + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "hidden_act": "gelu_new", + "intermediate_size": 4096, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.2, + "layer_norm_eps": 1e-12, + "gradient_checkpointing": False, + "position_embedding_type": "absolute", + "use_cache": False, } # The pooler. 
- pooler = lm['pooler'] + pooler = lm["pooler"] # Store the matrix and the bias. - output_state_dict['bert.pooler.dense.weight'] = pooler['dense.weight'] - output_state_dict['bert.pooler.dense.bias' ] = pooler['dense.bias'] + output_state_dict["bert.pooler.dense.weight"] = pooler["dense.weight"] + output_state_dict["bert.pooler.dense.bias"] = pooler["dense.bias"] # The LM head from Megatron (for RACE). - lm_head = model['lm_head'] + lm_head = model["lm_head"] # The transform matrix. - output_state_dict['cls.predictions.transform.dense.weight'] = lm_head['dense.weight'] - output_state_dict['cls.predictions.transform.dense.bias' ] = lm_head['dense.bias'] + output_state_dict["cls.predictions.transform.dense.weight"] = lm_head["dense.weight"] + output_state_dict["cls.predictions.transform.dense.bias"] = lm_head["dense.bias"] # The transform LN. - output_state_dict['cls.predictions.transform.LayerNorm.weight'] = lm_head['layernorm.weight'] - output_state_dict['cls.predictions.transform.LayerNorm.bias' ] = lm_head['layernorm.bias'] + output_state_dict["cls.predictions.transform.LayerNorm.weight"] = lm_head["layernorm.weight"] + output_state_dict["cls.predictions.transform.LayerNorm.bias"] = lm_head["layernorm.bias"] # For the decoder, we replicate the weights. - output_state_dict['cls.predictions.decoder.weight'] = word_embeddings - output_state_dict['cls.predictions.bias'] = lm_head['bias'] + output_state_dict["cls.predictions.decoder.weight"] = word_embeddings + output_state_dict["cls.predictions.bias"] = lm_head["bias"] # The classifier from Megatron (for MLNI). - binary_head = model['binary_head'] + binary_head = model["binary_head"] # Store the classifier. - output_state_dict['cls.seq_relationship.weight'] = binary_head['weight'] - output_state_dict['cls.seq_relationship.bias' ] = binary_head['bias'] + output_state_dict["cls.seq_relationship.weight"] = binary_head["weight"] + output_state_dict["cls.seq_relationship.bias"] = binary_head["bias"] # It should be done! return output_state_dict, output_config + #################################################################################################### + def main(): # Create the argument parser. parser = argparse.ArgumentParser() - parser.add_argument('--print-checkpoint-structure', action='store_true') - parser.add_argument("path_to_checkpoint", type=str, - help='Path to the ZIP file containing the checkpoint') + parser.add_argument("--print-checkpoint-structure", action="store_true") + parser.add_argument("path_to_checkpoint", type=str, help="Path to the ZIP file containing the checkpoint") args = parser.parse_args() # Extract the basename. - basename = os.path.dirname(args.path_to_checkpoint) + basename = os.path.splitext(args.path_to_checkpoint)[0] # Load the model. print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint)) - with zipfile.ZipFile(args.path_to_checkpoint, 'r') as checkpoint: - with checkpoint.open('release/mp_rank_00/model_optim_rng.pt') as pytorch_dict: - input_state_dict = torch.load(pytorch_dict, map_location='cpu') + with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint: + with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict: + input_state_dict = torch.load(pytorch_dict, map_location="cpu") # Convert. - print('Converting') + print("Converting") output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict) # Print the structure of converted state dict. 
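# (Illustrative note, not part of the original patch.) After conversion, the
# script writes "<basename>_config.json" and "<basename>_checkpoint.pt" next to
# the input ZIP file (see the saving code in the hunk below). A converted BERT
# checkpoint can then be loaded along the lines of
# examples/megatron-models/run_bert.py, for instance:
#
#     config = MegatronBertConfig.from_pretrained("megatron_bert_345m_v0_1_uncased_config.json")
#     model = MegatronBertForMaskedLM.from_pretrained(
#         "megatron_bert_345m_v0_1_uncased_checkpoint.pt", config=config
#     )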
@@ -267,20 +249,20 @@ def main(): recursive_print(None, output_state_dict) # Store the config to file. - output_config_file = os.path.join(basename, 'config.json') + output_config_file = basename + "_config.json" print('Saving config to "{}"'.format(output_config_file)) - with open(output_config_file, 'w') as f: + with open(output_config_file, "w") as f: json.dump(output_config, f) # Store the state_dict to file. - output_checkpoint_file = os.path.join(basename, 'checkpoint.pt') + output_checkpoint_file = basename + "_checkpoint.pt" print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) torch.save(output_state_dict, output_checkpoint_file) + #################################################################################################### if __name__ == "__main__": main() #################################################################################################### - diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index fc9c9c3d21d01c..96868f29b90a48 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -67,6 +67,7 @@ # See all MegatronBERT models at https://huggingface.co/models?filter=megatron_bert ] + def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path): """Load tf checkpoints in a pytorch model.""" try: @@ -635,13 +636,10 @@ class MegatronBertLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() self.transform = MegatronBertPredictionHeadTransform(config) - # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias @@ -691,7 +689,7 @@ class MegatronBertPreTrainedModel(PreTrainedModel): config_class = MegatronBertConfig load_tf_weights = load_tf_weights_in_megatron_bert - base_model_prefix = "megatron-bert" + base_model_prefix = "bert" _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): @@ -986,8 +984,8 @@ def forward( @add_start_docstrings( """ - MegatronBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next - sentence prediction (classification)` head. + MegatronBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a + `next sentence prediction (classification)` head. """, MEGATRON_BERT_START_DOCSTRING, ) @@ -1090,7 +1088,8 @@ def forward( @add_start_docstrings( - """MegatronBert Model with a `language modeling` head on top for CLM fine-tuning. """, MEGATRON_BERT_START_DOCSTRING + """MegatronBert Model with a `language modeling` head on top for CLM fine-tuning. """, + MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertLMHeadModel(MegatronBertPreTrainedModel): @@ -1445,8 +1444,8 @@ def forward( @add_start_docstrings( """ - MegatronBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled - output) e.g. for GLUE tasks. + MegatronBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
""", MEGATRON_BERT_START_DOCSTRING, ) @@ -1527,10 +1526,11 @@ def forward( attentions=outputs.attentions, ) + @add_start_docstrings( """ - MegatronBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a - softmax) e.g. for RocStories/SWAG tasks. + MegatronBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output + and a softmax) e.g. for RocStories/SWAG tasks. """, MEGATRON_BERT_START_DOCSTRING, ) @@ -1544,7 +1544,9 @@ def __init__(self, config): self.init_weights() - @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_start_docstrings_to_model_forward( + MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="megatron-bert-uncased-345m", @@ -1620,8 +1622,8 @@ def forward( @add_start_docstrings( """ - MegatronBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. + MegatronBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. """, MEGATRON_BERT_START_DOCSTRING, ) @@ -1711,8 +1713,8 @@ def forward( @add_start_docstrings( """ - MegatronBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + MegatronBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, MEGATRON_BERT_START_DOCSTRING, ) @@ -1809,4 +1811,3 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index 7ab681817a37f8..d70e1e207d9338 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -20,17 +20,20 @@ import json import os import re -import torch import zipfile +import torch + + #################################################################################################### + def recursive_print(name, val, spaces=0): # Format the message. if name is None: msg = None else: - fmt = '.' * max(0, spaces-2) + '# {:' + str(50-spaces) + 's}' + fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" msg = fmt.format(name) # Print and recurse (if needed). @@ -38,14 +41,16 @@ def recursive_print(name, val, spaces=0): if msg is not None: print(msg) for k in val.keys(): - recursive_print(k, val[k], spaces+2) + recursive_print(k, val[k], spaces + 2) elif isinstance(val, torch.Tensor): - print(msg, ':', val.size()) + print(msg, ":", val.size()) else: - print(msg, ':', val) + print(msg, ":", val) + #################################################################################################### + def convert_megatron_checkpoint(args, input_state_dict): # The converted output model. output_state_dict = {} @@ -56,39 +61,39 @@ def convert_megatron_checkpoint(args, input_state_dict): hidden_size_per_head = 64 # The model. 
-    model = input_state_dict['model']
+    model = input_state_dict["model"]

     # The language model.
-    lm = model['language_model']
+    lm = model["language_model"]

     # The embeddings.
-    embeddings = lm['embedding']
+    embeddings = lm["embedding"]

     # The word embeddings.
-    word_embeddings = embeddings['word_embeddings']['weight']
+    word_embeddings = embeddings["word_embeddings"]["weight"]

     # Truncate the embedding table to 50257 rows.
-    word_embeddings = word_embeddings[:50257,:]
+    word_embeddings = word_embeddings[:50257, :]

     # Store the word embeddings.
-    output_state_dict['transformer.wte.weight'] = word_embeddings
+    output_state_dict["transformer.wte.weight"] = word_embeddings

     # The position embeddings.
-    pos_embeddings = embeddings['position_embeddings']['weight']
+    pos_embeddings = embeddings["position_embeddings"]["weight"]

     # Read the hidden dimension.
     hidden_size = pos_embeddings.size(0)
     # DEBUG.
     assert hidden_size == heads * hidden_size_per_head

     # Store the position embeddings.
-    output_state_dict['transformer.wpe.weight'] = pos_embeddings
+    output_state_dict["transformer.wpe.weight"] = pos_embeddings

     # The transformer.
-    transformer = lm['transformer']
+    transformer = lm["transformer"]

     # The regex to extract layer names.
-    layer_re = re.compile('layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)')
+    layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")

     # The simple map of names for "automated" rules.
     megatron_to_transformers = {
-        'attention.dense' : '.attn.c_proj.',
-        'mlp.dense_h_to_4h': '.mlp.c_fc.',
-        'mlp.dense_4h_to_h': '.mlp.c_proj.',
+        "attention.dense": ".attn.c_proj.",
+        "mlp.dense_h_to_4h": ".mlp.c_fc.",
+        "mlp.dense_4h_to_h": ".mlp.c_proj.",
     }

     # Extract the layers.
@@ -108,104 +113,105 @@ def convert_megatron_checkpoint(args, input_state_dict):
         weight_or_bias = m.group(3)

         # The name of the layer.
-        layer_name = 'transformer.h.{}'.format(layer_idx)
+        layer_name = "transformer.h.{}".format(layer_idx)

         # For layernorm(s), simply store the layer norm.
-        if op_name.endswith('layernorm'):
+        if op_name.endswith("layernorm"):
-            ln_name = 'ln_1' if op_name.startswith('input') else 'ln_2'
-            output_state_dict[layer_name + '.' + ln_name + '.' + weight_or_bias] = val
+            ln_name = "ln_1" if op_name.startswith("input") else "ln_2"
+            output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val

         # Transpose the QKV matrix.
-        elif op_name == 'attention.query_key_value' and weight_or_bias == 'weight':
+        elif op_name == "attention.query_key_value" and weight_or_bias == "weight":

             # Insert a tensor of 1x1xDxD bias.
             zeros = torch.ones(1, 1, hidden_size, hidden_size)
-            output_state_dict[layer_name + '.attn.bias'] = zeros
+            output_state_dict[layer_name + ".attn.bias"] = zeros

             # Insert a "dummy" tensor for masked_bias.
             masked_bias = torch.tensor(-1e4)
-            output_state_dict[layer_name + '.attn.masked_bias'] = masked_bias
+            output_state_dict[layer_name + ".attn.masked_bias"] = masked_bias

             # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D.
             out_val = val.transpose(0, 1)

             # Store.
-            output_state_dict[layer_name + '.attn.c_attn.weight'] = out_val
+            output_state_dict[layer_name + ".attn.c_attn.weight"] = out_val

         # Transpose the bias.
-        elif op_name == 'attention.query_key_value' and weight_or_bias == 'bias':
+        elif op_name == "attention.query_key_value" and weight_or_bias == "bias":

             # Store. No change of shape.
-            output_state_dict[layer_name + '.attn.c_attn.bias'] = val
+            output_state_dict[layer_name + ".attn.c_attn.bias"] = val

         # Transpose the weights.
-        elif weight_or_bias == 'weight':
+        elif weight_or_bias == "weight":
             out_name = megatron_to_transformers[op_name]
-            output_state_dict[layer_name + out_name + 'weight'] = val.transpose(0, 1)
+            output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1)

         # Copy the bias.
-        elif weight_or_bias == 'bias':
+        elif weight_or_bias == "bias":
             out_name = megatron_to_transformers[op_name]
-            output_state_dict[layer_name + out_name + 'bias'] = val
+            output_state_dict[layer_name + out_name + "bias"] = val

     # The final layernorm.
-    output_state_dict['transformer.ln_f.weight'] = transformer['final_layernorm.weight']
-    output_state_dict['transformer.ln_f.bias'] = transformer['final_layernorm.bias']
+    output_state_dict["transformer.ln_f.weight"] = transformer["final_layernorm.weight"]
+    output_state_dict["transformer.ln_f.bias"] = transformer["final_layernorm.bias"]

     # For the LM head, transformers wants the matrix tied to the word embeddings.
-    output_state_dict['lm_head.weight'] = word_embeddings
+    output_state_dict["lm_head.weight"] = word_embeddings

     # The config.
     output_config = {
-        'activation_function': 'gelu_new',
-        'architectures': [ 'GPT2LMHeadModel' ],
-        'attn_pdrop': 0.1,
-        'bos_token_id': 50256,
-        'embd_pdrop': 0.1,
-        'eos_token_id': 50256,
-        'initializer_range': 0.02,
-        'layer_norm_epsilon': 1e-05,
-        'model_type': 'gpt2',
-        'n_ctx': 1024,
-        'n_embd': 1024,
-        'n_head': 16,
-        'n_layer': 24,
-        'n_positions': 1024,
-        'resid_pdrop': 0.1,
-        'summary_activation': None,
-        'summary_first_dropout': 0.1,
-        'summary_proj_to_labels': True,
-        'summary_type': 'cls_index',
-        'summary_use_proj': True,
-        'vocab_size': 50257
+        "activation_function": "gelu_new",
+        "architectures": ["GPT2LMHeadModel"],
+        "attn_pdrop": 0.1,
+        "bos_token_id": 50256,
+        "embd_pdrop": 0.1,
+        "eos_token_id": 50256,
+        "initializer_range": 0.02,
+        "layer_norm_epsilon": 1e-05,
+        "model_type": "gpt2",
+        "n_ctx": 1024,
+        "n_embd": 1024,
+        "n_head": 16,
+        "n_layer": 24,
+        "n_positions": 1024,
+        "resid_pdrop": 0.1,
+        "summary_activation": None,
+        "summary_first_dropout": 0.1,
+        "summary_proj_to_labels": True,
+        "summary_type": "cls_index",
+        "summary_use_proj": True,
+        "vocab_size": 50257,
     }

     # It should be done!
     return output_state_dict, output_config

+####################################################################################################
+
 def main():
     # Create the argument parser.
     parser = argparse.ArgumentParser()
-    parser.add_argument('--print-checkpoint-structure', action='store_true')
-    parser.add_argument('path_to_checkpoint', type=str,
-                        help='Path to the ZIP file containing the checkpoint')
+    parser.add_argument("--print-checkpoint-structure", action="store_true")
+    parser.add_argument("path_to_checkpoint", type=str, help="Path to the ZIP file containing the checkpoint")
     args = parser.parse_args()

     # Extract the basename.
-    basename = os.path.dirname(args.path_to_checkpoint)
+    basename = os.path.splitext(args.path_to_checkpoint)[0]

     # Load the model.
     print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint))
-    with zipfile.ZipFile(args.path_to_checkpoint, 'r') as checkpoint:
-        with checkpoint.open('release/mp_rank_00/model_optim_rng.pt') as pytorch_dict:
-            input_state_dict = torch.load(pytorch_dict, map_location='cpu')
+    with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
+        with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
+            input_state_dict = torch.load(pytorch_dict, map_location="cpu")

     # Convert.
- print('Converting') + print("Converting") output_state_dict, output_config = convert_megatron_checkpoint(args, input_state_dict) # Print the structure of converted state dict. @@ -213,20 +219,20 @@ def main(): recursive_print(None, output_state_dict) # Store the config to file. - output_config_file = os.path.join(basename, 'config.json') + output_config_file = basename + "_config.json" print('Saving config to "{}"'.format(output_config_file)) - with open(output_config_file, 'w') as f: + with open(output_config_file, "w") as f: json.dump(output_config, f) # Store the state_dict to file. - output_checkpoint_file = os.path.join(basename, 'checkpoint.pt') + output_checkpoint_file = basename + "_checkpoint.pt" print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) torch.save(output_state_dict, output_checkpoint_file) + #################################################################################################### if __name__ == "__main__": main() #################################################################################################### - diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 59649a3c02bd88..988f49447ece6c 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1811,6 +1811,78 @@ def from_pretrained(self, *args, **kwargs): requires_pytorch(self) +MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class MegatronBertForMaskedLM: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertForMultipleChoice: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertForNextSentencePrediction: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertForPreTraining: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertForTokenClassification: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertLayer: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + +class MegatronBertModel: + def __init__(self, *args, **kwargs): + requires_pytorch(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_pytorch(self) + + class MMBTForClassification: def __init__(self, *args, **kwargs): requires_pytorch(self) diff --git a/src/transformers/utils/modeling_auto_mapping.py b/src/transformers/utils/modeling_auto_mapping.py index 189b2e1959f4fd..0a05ac24d795ee 100644 --- a/src/transformers/utils/modeling_auto_mapping.py +++ b/src/transformers/utils/modeling_auto_mapping.py @@ -21,6 +21,7 @@ ("BertConfig", "BertForQuestionAnswering"), ("XLNetConfig", "XLNetForQuestionAnsweringSimple"), ("FlaubertConfig", "FlaubertForQuestionAnsweringSimple"), + ("MegatronBertConfig", "MegatronBertForQuestionAnswering"), ("MobileBertConfig", 
"MobileBertForQuestionAnswering"), ("XLMConfig", "XLMForQuestionAnsweringSimple"), ("ElectraConfig", "ElectraForQuestionAnswering"), diff --git a/tests/test_modeling_megatron_bert.py b/tests/test_modeling_megatron_bert.py new file mode 100644 index 00000000000000..57c4ec05c25564 --- /dev/null +++ b/tests/test_modeling_megatron_bert.py @@ -0,0 +1,370 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch MegatronBERT model. """ + + +import math +import unittest + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + MODEL_FOR_PRETRAINING_MAPPING, + MegatronBertConfig, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + MegatronBertModel, + ) + + +class MegatronBertModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=64, + embedding_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embedding_size = embedding_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, 
self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = MegatronBertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + embedding_size=self.embedding_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_megatron_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertModel(config=config) + model.to(torch_device) + model.half() + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_megatron_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForMaskedLM(config=config) + model.to(torch_device) + model.half() + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_megatron_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForNextSentencePrediction(config=config) + model.to(torch_device) + model.half() + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=sequence_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, 2)) + + def create_and_check_megatron_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForPreTraining(config=config) + model.to(torch_device) + model.half() + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + labels=token_labels, + next_sentence_label=sequence_labels, + ) + self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + self.parent.assertEqual(result.seq_relationship_logits.shape, (self.batch_size, 2)) + + def create_and_check_megatron_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = 
MegatronBertForQuestionAnswering(config=config) + model.to(torch_device) + model.half() + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_megatron_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MegatronBertForSequenceClassification(config) + model.to(torch_device) + model.half() + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_megatron_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MegatronBertForTokenClassification(config=config) + model.to(torch_device) + model.half() + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_megatron_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = MegatronBertForMultipleChoice(config=config) + model.to(torch_device) + model.half() + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + MegatronBertModel, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + ) + + # test_resize_embeddings = False + test_head_masking = False + + # special case for ForPreTraining model + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class in MODEL_FOR_PRETRAINING_MAPPING.values(): + inputs_dict["labels"] = torch.zeros( + 
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs_dict["next_sentence_label"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + return inputs_dict + + def setUp(self): + self.model_tester = MegatronBertModelTester(self) + self.config_tester = ConfigTester(self, config_class=MegatronBertConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_megatron_bert_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_multiple_choice(*config_and_inputs) + + def test_for_next_sequence_prediction(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_next_sequence_prediction(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_pretraining(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_megatron_bert_for_token_classification(*config_and_inputs) + + +def _long_tensor(tok_lst): + return torch.tensor( + tok_lst, + dtype=torch.long, + device=torch_device, + ) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +class MegatronBertModelIntegrationTests(unittest.TestCase): + @slow + def test_inference_no_head(self): + basename = "examples/megatron-models/models/megatron_bert_345m_v0_1_uncased" + config_file = basename + "_config.json" + config = MegatronBertConfig.from_pretrained(config_file) + checkpoint_file = basename + "_checkpoint.pt" + model = MegatronBertModel.from_pretrained(checkpoint_file, config=config) + model.to(torch_device) + model.half() + input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]]) + with torch.no_grad(): + output = model(input_ids)[0] + expected_shape = torch.Size((1, 9, 1024)) + self.assertEqual(output.shape, expected_shape) + + expected = [-0.6040, -0.2517, -0.1025, 0.3420, -0.6758, -0.0017, -0.1089, -0.1990, 0.5728] + for ii in range(3): + for jj in range(3): + a = output[0, ii, jj] + b = expected[3 * ii + jj] + msg = "ii={} jj={} a={} b={}".format(ii, jj, a, b) + self.assertTrue(math.isclose(a, b, rel_tol=TOLERANCE, abs_tol=TOLERANCE), msg=msg) diff --git a/utils/check_repo.py b/utils/check_repo.py index 5d0b41eacac0ca..b3a230066f7e82 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -45,9 +45,10 @@ "BlenderbotDecoderWrapper", # Building part of bigger (tested) model. 
"MBartEncoder", # Building part of bigger (tested) model. "MBartDecoderWrapper", # Building part of bigger (tested) model. + "MegatronBertLMHeadModel", # Building part of bigger (tested) model. "MegatronBertEncoder", # Building part of bigger (tested) model. "MegatronBertDecoder", # Building part of bigger (tested) model. - "MegatronBertDecoderWrapper", # Building part of bigger (tested) model. + "MegatronBertDecoderWrapper", # Building part of bigger (tested) model. "PegasusEncoder", # Building part of bigger (tested) model. "PegasusDecoderWrapper", # Building part of bigger (tested) model. "DPREncoder", # Building part of bigger (tested) model. @@ -108,6 +109,7 @@ "MBartEncoder", "MBartDecoder", "MBartDecoderWrapper", + "MegatronBertLMHeadModel", "MegatronBertEncoder", "MegatronBertDecoder", "MegatronBertDecoderWrapper", From 685479d9d1be6465eb5e49fc034defe1c2b3d8a9 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Mon, 29 Mar 2021 12:37:13 +0200 Subject: [PATCH 03/42] Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut --- .../models/megatron_bert/configuration_megatron_bert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 911e6281485e91..23a84f5882c57d 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -28,8 +28,8 @@ class MegatronBertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel` or a - :class:`~transformers.TFMegatronBertModel`. It is used to instantiate a MEGATRON_BERT model according to the + This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. + It is used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT `bert-base-uncased `__ architecture. From e347036911e82a95f08e1e7ed56caf99829cadd3 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Mon, 29 Mar 2021 12:37:34 +0200 Subject: [PATCH 04/42] Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut --- .../models/megatron_bert/configuration_megatron_bert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 23a84f5882c57d..d6fb410a592057 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -41,8 +41,7 @@ class MegatronBertConfig(PretrainedConfig): Args: vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented - by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel` or - :class:`~transformers.TFMegatronBertModel`. + by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel`. hidden_size (:obj:`int`, `optional`, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. 
num_hidden_layers (:obj:`int`, `optional`, defaults to 24): From 0af41688bf5b64b3f3b3f6fe19c649c75087cde9 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Mon, 29 Mar 2021 12:37:48 +0200 Subject: [PATCH 05/42] Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut --- .../models/megatron_bert/configuration_megatron_bert.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index d6fb410a592057..163b515003df96 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -132,11 +132,3 @@ def __init__( self.gradient_checkpointing = gradient_checkpointing self.position_embedding_type = position_embedding_type self.use_cache = use_cache - - # @property - # def num_attention_heads(self) -> int: - # return self.num_attention_heads - - # @property - # def hidden_size(self) -> int: - # return self.hidden_size From 435c33e7576237347836d3f73e9fdb0bb4ee2748 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Mon, 29 Mar 2021 04:30:04 -0700 Subject: [PATCH 06/42] Remove model.half in tests + add "# Copied ..." Remove the model.half() instruction which makes tests fail on the CPU. Add a comment "# Copied ..." before many classes in the model to enable automatic tracking in CI between the new Megatron classes and the original Bert ones. --- .../megatron_bert/modeling_megatron_bert.py | 64 +++++++++++++------ tests/test_modeling_megatron_bert.py | 9 --- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 96868f29b90a48..bf49e1356af4ee 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -59,7 +59,7 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "MegatronBertConfig" -_TOKENIZER_FOR_DOC = "MegatronBertTokenizer" +_TOKENIZER_FOR_DOC = "BertTokenizer" MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ @@ -142,6 +142,7 @@ def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path): return model +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->MegatronBert class MegatronBertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -193,6 +194,7 @@ def forward( return embeddings +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->MegatronBert class MegatronBertSelfAttention(nn.Module): def __init__(self, config): super().__init__() @@ -317,7 +319,7 @@ def forward( outputs = outputs + (past_key_value,) return outputs - +# Based transformers.models.bert.modeling_bert.BertSelfOutput. Moved LayerNorm to MegatronBertAttention below. class MegatronBertSelfOutput(nn.Module): def __init__(self, config): super().__init__() @@ -329,7 +331,7 @@ def forward(self, hidden_states, residual): hidden_states = self.dropout(hidden_states) return residual + hidden_states - +# Based transformers.models.bert.modeling_bert.BertAttention. Added LayerNorm. 
class MegatronBertAttention(nn.Module): def __init__(self, config): super().__init__() @@ -381,6 +383,7 @@ def forward( return outputs +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->MegatronBert class MegatronBertIntermediate(nn.Module): def __init__(self, config): super().__init__() @@ -395,7 +398,7 @@ def forward(self, hidden_states): hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states - +# Based transformers.models.bert.modeling_bert.BertOutput. Moved LayerNorm to MegatronBertLayer below. class MegatronBertOutput(nn.Module): def __init__(self, config): super().__init__() @@ -408,6 +411,7 @@ def forward(self, hidden_states, input_tensor): return input_tensor + hidden_states +# Based transformers.models.bert.modeling_bert.BertLayer. Added LayerNorm. class MegatronBertLayer(nn.Module): def __init__(self, config): super().__init__() @@ -493,6 +497,7 @@ def feed_forward_chunk(self, attention_output): return layer_output +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->MegatronBert class MegatronBertEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -600,6 +605,7 @@ def custom_forward(*inputs): ) +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->MegatronBert class MegatronBertPooler(nn.Module): def __init__(self, config): super().__init__() @@ -615,6 +621,7 @@ def forward(self, hidden_states): return pooled_output +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->MegatronBert class MegatronBertPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() @@ -632,6 +639,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->MegatronBert class MegatronBertLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() @@ -649,6 +657,7 @@ def forward(self, hidden_states): return hidden_states +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->MegatronBert class MegatronBertOnlyMLMHead(nn.Module): def __init__(self, config): super().__init__() @@ -659,6 +668,7 @@ def forward(self, sequence_output): return prediction_scores +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->MegatronBert class MegatronBertOnlyNSPHead(nn.Module): def __init__(self, config): super().__init__() @@ -669,6 +679,7 @@ def forward(self, pooled_output): return seq_relationship_score +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->MegatronBert class MegatronBertPreTrainingHeads(nn.Module): def __init__(self, config): super().__init__() @@ -681,6 +692,7 @@ def forward(self, sequence_output, pooled_output): return prediction_scores, seq_relationship_score +# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->MegatronBert class MegatronBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -705,6 +717,7 @@ def _init_weights(self, module): module.bias.data.zero_() +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingOutput with Bert->MegatronBert @dataclass class MegatronBertForPreTrainingOutput(ModelOutput): """ @@ -761,7 +774,7 @@ class MegatronBertForPreTrainingOutput(ModelOutput): input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the 
vocabulary. - Indices can be obtained using :class:`~transformers.MegatronBertTokenizer`. See + Indices can be obtained using :class:`~transformers.BertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details. @@ -807,6 +820,7 @@ class MegatronBertForPreTrainingOutput(ModelOutput): """ +# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->MegatronBert @add_start_docstrings( "The bare MegatronBert Model transformer outputting raw hidden-states without any specific head on top.", MEGATRON_BERT_START_DOCSTRING, @@ -853,7 +867,7 @@ class PreTrainedModel @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert-uncased-345m", output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) @@ -982,6 +996,7 @@ def forward( ) +# Copied from transformers.models.bert.modeling_bert.BertForPreTraining with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a @@ -1038,11 +1053,11 @@ def forward( Example:: - >>> from transformers import MegatronBertTokenizer, MegatronBertForPreTraining + >>> from transformers import BertTokenizer, MegatronBertForPreTraining >>> import torch - >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-uncased-345m') - >>> model = MegatronBertForPreTraining.from_pretrained('megatron-bert-uncased-345m') + >>> tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') + >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert-uncased-345m') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1087,6 +1102,7 @@ def forward( ) +# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel with Bert->MegatronBert @add_start_docstrings( """MegatronBert Model with a `language modeling` head on top for CLM fine-tuning. """, MEGATRON_BERT_START_DOCSTRING, @@ -1160,13 +1176,13 @@ def forward( Example:: - >>> from transformers import MegatronBertTokenizer, MegatronBertLMHeadModel, MegatronBertConfig + >>> from transformers import BertTokenizer, MegatronBertLMHeadModel, MegatronBertConfig >>> import torch - >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-cased-345m') - >>> config = MegatronBertConfig.from_pretrained("megatron-bert-cased-345m") + >>> tokenizer = BertTokenizer.from_pretrained('bert-large-cased') + >>> config = MegatronBertConfig.from_pretrained("nvidia/megatron-bert-cased-345m") >>> config.is_decoder = True - >>> model = MegatronBertLMHeadModel.from_pretrained('megatron-bert-cased-345m', config=config) + >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert-cased-345m', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1236,6 +1252,7 @@ def _reorder_cache(self, past, beam_idx): return reordered_past +# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM with Bert->MegatronBert @add_start_docstrings("""MegatronBert Model with a `language modeling` head on top. 
""", MEGATRON_BERT_START_DOCSTRING) class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): @@ -1265,7 +1282,7 @@ def set_output_embeddings(self, new_embeddings): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert-uncased-345m", output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1341,6 +1358,7 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ return {"input_ids": input_ids, "attention_mask": attention_mask} +# Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction with Bert->MegatronBert @add_start_docstrings( """MegatronBert Model with a `next sentence prediction (classification)` head on top. """, MEGATRON_BERT_START_DOCSTRING, @@ -1385,11 +1403,11 @@ def forward( Example:: - >>> from transformers import MegatronBertTokenizer, MegatronBertForNextSentencePrediction + >>> from transformers import BertTokenizer, MegatronBertForNextSentencePrediction >>> import torch - >>> tokenizer = MegatronBertTokenizer.from_pretrained('megatron-bert-uncased-345m') - >>> model = MegatronBertForNextSentencePrediction.from_pretrained('megatron-bert-uncased-345m') + >>> tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') + >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert-uncased-345m') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." @@ -1442,6 +1460,7 @@ def forward( ) +# Copied from transformers.models.bert.modeling_bert.BertForSentenceClassification with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the @@ -1463,7 +1482,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert-uncased-345m", output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1527,6 +1546,7 @@ def forward( ) +# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output @@ -1549,7 +1569,7 @@ def __init__(self, config): ) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert-uncased-345m", output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1620,6 +1640,7 @@ def forward( ) +# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
@@ -1644,7 +1665,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert-uncased-345m", output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1711,6 +1732,7 @@ def forward( ) +# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a @@ -1734,7 +1756,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert-uncased-345m", output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) diff --git a/tests/test_modeling_megatron_bert.py b/tests/test_modeling_megatron_bert.py index 57c4ec05c25564..c76c52ca922429 100644 --- a/tests/test_modeling_megatron_bert.py +++ b/tests/test_modeling_megatron_bert.py @@ -136,7 +136,6 @@ def create_and_check_megatron_bert_model( ): model = MegatronBertModel(config=config) model.to(torch_device) - model.half() model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) result = model(input_ids, token_type_ids=token_type_ids) @@ -150,7 +149,6 @@ def create_and_check_megatron_bert_for_masked_lm( ): model = MegatronBertForMaskedLM(config=config) model.to(torch_device) - model.half() model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) @@ -160,7 +158,6 @@ def create_and_check_megatron_bert_for_next_sequence_prediction( ): model = MegatronBertForNextSentencePrediction(config=config) model.to(torch_device) - model.half() model.eval() result = model( input_ids, @@ -175,7 +172,6 @@ def create_and_check_megatron_bert_for_pretraining( ): model = MegatronBertForPreTraining(config=config) model.to(torch_device) - model.half() model.eval() result = model( input_ids, @@ -192,7 +188,6 @@ def create_and_check_megatron_bert_for_question_answering( ): model = MegatronBertForQuestionAnswering(config=config) model.to(torch_device) - model.half() model.eval() result = model( input_ids, @@ -210,7 +205,6 @@ def create_and_check_megatron_bert_for_sequence_classification( config.num_labels = self.num_labels model = MegatronBertForSequenceClassification(config) model.to(torch_device) - model.half() model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) @@ -221,7 +215,6 @@ def create_and_check_megatron_bert_for_token_classification( config.num_labels = self.num_labels model = MegatronBertForTokenClassification(config=config) model.to(torch_device) - model.half() model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) @@ -232,7 +225,6 @@ def create_and_check_megatron_bert_for_multiple_choice( config.num_choices = self.num_choices model 
= MegatronBertForMultipleChoice(config=config) model.to(torch_device) - model.half() model.eval() multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() @@ -354,7 +346,6 @@ def test_inference_no_head(self): checkpoint_file = basename + "_checkpoint.pt" model = MegatronBertModel.from_pretrained(checkpoint_file, config=config) model.to(torch_device) - model.half() input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]]) with torch.no_grad(): output = model(input_ids)[0] From 343f68d5a5c5db832a27465d1b13e4a1ba5392e7 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Wed, 31 Mar 2021 04:14:49 -0700 Subject: [PATCH 07/42] Fix issues --- README.md | 2 + docs/source/index.rst | 44 ++++--- examples/megatron-models/README.md | 115 ------------------ examples/megatron-models/run_bert.py | 101 --------------- examples/megatron-models/run_gpt2.py | 91 -------------- .../configuration_megatron_bert.py | 9 +- .../convert_megatron_bert_checkpoint.py | 6 +- .../megatron_bert/modeling_megatron_bert.py | 38 +++--- .../convert_megatron_gpt2_checkpoint.py | 6 +- tests/test_modeling_megatron_bert.py | 12 +- 10 files changed, 61 insertions(+), 363 deletions(-) delete mode 100644 examples/megatron-models/README.md delete mode 100644 examples/megatron-models/run_bert.py delete mode 100644 examples/megatron-models/run_gpt2.py diff --git a/README.md b/README.md index dd535688cb9333..372492d329e81b 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,8 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. +1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. 
**[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. diff --git a/docs/source/index.rst b/docs/source/index.rst index e78af6ae294c9a..96395db6398ad1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -178,58 +178,64 @@ and conversion utilities for the following models: 32. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning `__ by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -33. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted +33. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training + Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad + Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +34. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training + Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad + Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. +35. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -34. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained +36. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -35. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted +37. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization `__> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -36. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting +38. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -37. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient +39. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -38. 
:doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT +40. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -39. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper +41. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -40. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP +42. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -41. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +43. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -42. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via +44. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -43. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: +45. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -44. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 +46. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -45. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for +47. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -46. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model +48. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -47. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: +49. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -48. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised +50. 
:doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -49. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive +51. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -50. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised +52. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -450,6 +456,8 @@ TensorFlow and/or Flax. model_doc/marian model_doc/m2m_100 model_doc/mbart + model_doc/megatron_bert + model_doc/megatron_gpt2 model_doc/mobilebert model_doc/mpnet model_doc/mt5 diff --git a/examples/megatron-models/README.md b/examples/megatron-models/README.md deleted file mode 100644 index 4a774aedc4e598..00000000000000 --- a/examples/megatron-models/README.md +++ /dev/null @@ -1,115 +0,0 @@ - - -# How to run Megatron BERT and GPT2 using Transformers - -## Get the checkpoints from the NVIDIA GPU Cloud - -The first step is to create a directory called `models` from the `examples/megatron-models` folder. - -``` -mkdir models -``` - -You can download the checkpoints from the NVIDIA GPU Cloud (NGC). For that you -have to [sign up](https://ngc.nvidia.com/signup) for and setup the NVIDIA GPU -Cloud (NGC) Registry CLI. Further documentation for downloading models can be -found in the [NGC -documentation](https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1). - -Alternatively, you can directly download the checkpoints using: - -### BERT 345M cased - -``` -wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O models/megatron_bert_345m_v0_1_cased.zip -``` - -### BERT 345M uncased - -``` -wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O models/megatron_bert_345m_v0_1_uncased.zip -``` - -### GPT2 345M - -``` -wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O models/megatron_gpt2_345m_v0_0.zip -``` - -## Converting the checkpoints - -In order to be loaded into `Transformers`, the checkpoints have to be converted. You should run the following -commands for that purpose. - -For the conversion, we use scripts stored in -`src/transformers/models/megatron_bert` and -`src/transformers/models/megatron_gpt2`. 
We define the relative path as: - -``` -export PATH_TO_TRANSFORMERS=../../src/transformers -``` - -### BERT 345M cased - -``` -python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/megatron_bert_345m_v0_1_cased.zip -``` - -### BERT 345M uncased - -``` -python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py models/megatron_bert_345m_v0_1_uncased.zip -``` - -### GPT2 345M - -``` -python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py models/megatron_gpt2_345m_v0_0.zip -``` - -## Running the samples - -For BERT, we created a simple example that runs two tasks using the Megatron BERT checkpoints using -the Transformers API. The first task is `MegatronBERTForMaskedLM` and the second one is -`MegatronBERTForNextSentencePrediction`. - -### Masked LM - -``` -python3 ./run_bert.py --masked-lm ./models/megatron_bert_345m_v0_1_cased -python3 ./run_bert.py --masked-lm ./models/megatron_bert_345m_v0_1_uncased -``` - -### Next sentence prediction - -``` -python3 ./run_bert.py ./models/megatron_bert_345m_v0_1_cased -python3 ./run_bert.py ./models/megatron_bert_345m_v0_1_uncased -``` - -### Text generation - -For GPT2, we created a simple for text generation. - -``` -python3 ./run_gpt2.py models/megatron_gpt2_345m_v0_0 -``` - diff --git a/examples/megatron-models/run_bert.py b/examples/megatron-models/run_bert.py deleted file mode 100644 index 93bcf1eba773e5..00000000000000 --- a/examples/megatron-models/run_bert.py +++ /dev/null @@ -1,101 +0,0 @@ -#################################################################################################### - -# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#################################################################################################### - -import argparse - -import torch - -from transformers import ( - BertTokenizer, - MegatronBertConfig, - MegatronBertForMaskedLM, - MegatronBertForNextSentencePrediction, -) - - -#################################################################################################### - - -def main(): - - # Create the argument parser. - parser = argparse.ArgumentParser() - parser.add_argument("--masked-lm", action="store_true") - parser.add_argument("checkpoint", type=str, help="See examples in README.md.") - args = parser.parse_args() - - # Do we use the cased/uncased model. - is_uncased = "uncased" in args.checkpoint - - # The base model. - bert = "bert-base-" + ("uncased" if is_uncased else "cased") - # The tokenizer. Megatron was trained with standard tokenizer(s). - tokenizer = BertTokenizer.from_pretrained(bert) - - # The config file. - config_file = args.checkpoint + "_config.json" - # Load the config. - config = MegatronBertConfig.from_pretrained(config_file) - - # The model class. - model_cls = MegatronBertForMaskedLM if args.masked_lm else MegatronBertForNextSentencePrediction - # The checkpoint file. 
- checkpoint_file = args.checkpoint + "_checkpoint.pt" - # Load the model from transformers. - model = model_cls.from_pretrained(checkpoint_file, config=config) - - # Do not run backward. - model.eval() - - # Copy to the device and use FP16. - assert torch.cuda.is_available() - device = torch.device("cuda") - model.to(device) - model.half() - - # The input sentence. - - # Create a dummy sentence (from the BERT example page). - if args.masked_lm: - input = tokenizer("The capital of France is [MASK]", return_tensors="pt") - input = input.to(device) - label = tokenizer("The capital of France is Paris", return_tensors="pt")["input_ids"] - label = label.to(device) - output = model(**input, labels=label) - else: - prompt = "In Italy, pizza served in formal settings is presented unsliced." - next_sentence = "The sky is blue due to the shorter wavelength of blue light." - input = tokenizer(prompt, next_sentence, return_tensors="pt") - input = input.to(device) - label = torch.LongTensor([1]) - label = label.to(device) - - # Run the model. - output = model(**input, labels=label) - - # Outputs. - print("loss: ", output.loss) - print("logits: ", output.logits) - - -#################################################################################################### - -if __name__ == "__main__": - main() - -#################################################################################################### diff --git a/examples/megatron-models/run_gpt2.py b/examples/megatron-models/run_gpt2.py deleted file mode 100644 index 816bd1fe64b111..00000000000000 --- a/examples/megatron-models/run_gpt2.py +++ /dev/null @@ -1,91 +0,0 @@ -#################################################################################################### - -# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#################################################################################################### - -import argparse - -import torch - -from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer - - -#################################################################################################### - - -def main(): - - # Create the argument parser. - parser = argparse.ArgumentParser() - parser.add_argument("checkpoint", type=str, help="See examples in README.md.") - args = parser.parse_args() - - # The tokenizer. Megatron was trained with standard tokenizer(s). - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - - # The config file. - config_file = args.checkpoint + "_config.json" - # Load the GPT2 config. - config = GPT2Config.from_pretrained(config_file) - - # The checkpoint file. - checkpoint_file = args.checkpoint + "_checkpoint.pt" - # Load GPT2 model from transformers. - model = GPT2LMHeadModel.from_pretrained(checkpoint_file, config=config) - - # Do not run backward. - model.eval() - - # Copy to the device and use FP16. 
- assert torch.cuda.is_available() - device = torch.device("cuda") - model.to(device) - model.half() - - # Create an empty sentence. - input = tokenizer.encode("", return_tensors="pt") - input = input.to(device) - - # The token ids. - if input.size()[-1] == 0: - input_ids = None - else: - input_ids = input - - # Generate the sentence. - output = model.generate( - input_ids=input_ids, - max_length=128, - temperature=1.0, - top_k=0, - top_p=0.9, - do_sample=True, - num_return_sequences=1, - ) - - # Output the text. - for sentence in output: - sentence = sentence.tolist() - text = tokenizer.decode(sentence, clean_up_tokenization_spaces=True) - print(text) - - -#################################################################################################### - -if __name__ == "__main__": - main() - -#################################################################################################### diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 163b515003df96..b8664dec2b2160 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -28,11 +28,10 @@ class MegatronBertConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. - It is used to instantiate a MEGATRON_BERT model according to the - specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a - similar configuration to that of the MEGATRON_BERT `bert-base-uncased `__ - architecture. + This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. It is + used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT + `bert-base-uncased `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 30ef01a57c22d1..b8ac627d7eb94f 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -232,7 +232,7 @@ def main(): args = parser.parse_args() # Extract the basename. - basename = os.path.splitext(args.path_to_checkpoint)[0] + basename = os.path.dirname(args.path_to_checkpoint) # Load the model. print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint)) @@ -249,13 +249,13 @@ def main(): recursive_print(None, output_state_dict) # Store the config to file. - output_config_file = basename + "_config.json" + output_config_file = os.path.join(basename, "config.json") print('Saving config to "{}"'.format(output_config_file)) with open(output_config_file, "w") as f: json.dump(output_config, f) # Store the state_dict to file. 
- output_checkpoint_file = basename + "_checkpoint.pt" + output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) torch.save(output_state_dict, output_checkpoint_file) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index bf49e1356af4ee..0cb2a2d0d1e996 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -63,7 +63,7 @@ MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "nvidia/megatron-bert-cased-345m", + "nvidia/megatron-bert", # See all MegatronBERT models at https://huggingface.co/models?filter=megatron_bert ] @@ -142,7 +142,6 @@ def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path): return model -# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->MegatronBert class MegatronBertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -200,8 +199,8 @@ def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" ) self.num_attention_heads = config.num_attention_heads @@ -319,6 +318,7 @@ def forward( outputs = outputs + (past_key_value,) return outputs + # Based transformers.models.bert.modeling_bert.BertSelfOutput. Moved LayerNorm to MegatronBertAttention below. class MegatronBertSelfOutput(nn.Module): def __init__(self, config): @@ -331,6 +331,7 @@ def forward(self, hidden_states, residual): hidden_states = self.dropout(hidden_states) return residual + hidden_states + # Based transformers.models.bert.modeling_bert.BertAttention. Added LayerNorm. class MegatronBertAttention(nn.Module): def __init__(self, config): @@ -398,6 +399,7 @@ def forward(self, hidden_states): hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states + # Based transformers.models.bert.modeling_bert.BertOutput. Moved LayerNorm to MegatronBertLayer below. class MegatronBertOutput(nn.Module): def __init__(self, config): @@ -497,7 +499,6 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->MegatronBert class MegatronBertEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -644,10 +645,13 @@ class MegatronBertLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() self.transform = MegatronBertPredictionHeadTransform(config) + # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. 
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias @@ -692,7 +696,6 @@ def forward(self, sequence_output, pooled_output): return prediction_scores, seq_relationship_score -# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert->MegatronBert class MegatronBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -717,8 +720,8 @@ def _init_weights(self, module): module.bias.data.zero_() -# Copied from transformers.models.bert.modeling_bert.BertPreTrainingOutput with Bert->MegatronBert @dataclass +# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->MegatronBert class MegatronBertForPreTrainingOutput(ModelOutput): """ Output type of :class:`~transformers.MegatronBertForPreTraining`. @@ -820,7 +823,6 @@ class MegatronBertForPreTrainingOutput(ModelOutput): """ -# Copied from transformers.models.bert.modeling_bert.BertModel with Bert->MegatronBert @add_start_docstrings( "The bare MegatronBert Model transformer outputting raw hidden-states without any specific head on top.", MEGATRON_BERT_START_DOCSTRING, @@ -867,7 +869,7 @@ class PreTrainedModel @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert", output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) @@ -996,7 +998,6 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertForPreTraining with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a @@ -1102,7 +1103,6 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel with Bert->MegatronBert @add_start_docstrings( """MegatronBert Model with a `language modeling` head on top for CLM fine-tuning. """, MEGATRON_BERT_START_DOCSTRING, @@ -1252,7 +1252,6 @@ def _reorder_cache(self, past, beam_idx): return reordered_past -# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM with Bert->MegatronBert @add_start_docstrings("""MegatronBert Model with a `language modeling` head on top. """, MEGATRON_BERT_START_DOCSTRING) class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): @@ -1282,7 +1281,7 @@ def set_output_embeddings(self, new_embeddings): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert", output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1358,7 +1357,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ return {"input_ids": input_ids, "attention_mask": attention_mask} -# Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction with Bert->MegatronBert @add_start_docstrings( """MegatronBert Model with a `next sentence prediction (classification)` head on top. 
""", MEGATRON_BERT_START_DOCSTRING, @@ -1460,7 +1458,6 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertForSentenceClassification with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the @@ -1482,7 +1479,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert", output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1546,7 +1543,6 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output @@ -1569,7 +1565,7 @@ def __init__(self, config): ) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert", output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1640,7 +1636,6 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. @@ -1665,7 +1660,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert", output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) @@ -1732,7 +1727,6 @@ def forward( ) -# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MegatronBert @add_start_docstrings( """ MegatronBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a @@ -1756,7 +1750,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert-uncased-345m", + checkpoint="nvidia/megatron-bert", output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index d70e1e207d9338..ece2909b634206 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -202,7 +202,7 @@ def main(): args = parser.parse_args() # Extract the basename. - basename = os.path.splitext(args.path_to_checkpoint)[0] + basename = os.path.dirname(args.path_to_checkpoint) # Load the model. print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint)) @@ -219,13 +219,13 @@ def main(): recursive_print(None, output_state_dict) # Store the config to file. 
- output_config_file = basename + "_config.json" + output_config_file = os.path.join(basename, "config.json") print('Saving config to "{}"'.format(output_config_file)) with open(output_config_file, "w") as f: json.dump(output_config, f) # Store the state_dict to file. - output_checkpoint_file = basename + "_checkpoint.pt" + output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) torch.save(output_state_dict, output_checkpoint_file) diff --git a/tests/test_modeling_megatron_bert.py b/tests/test_modeling_megatron_bert.py index c76c52ca922429..4a816cdd4ba737 100644 --- a/tests/test_modeling_megatron_bert.py +++ b/tests/test_modeling_megatron_bert.py @@ -17,6 +17,7 @@ import math +import os import unittest from transformers import is_torch_available @@ -43,6 +44,7 @@ ) +@require_torch class MegatronBertModelTester: def __init__( self, @@ -340,12 +342,12 @@ def _long_tensor(tok_lst): class MegatronBertModelIntegrationTests(unittest.TestCase): @slow def test_inference_no_head(self): - basename = "examples/megatron-models/models/megatron_bert_345m_v0_1_uncased" - config_file = basename + "_config.json" - config = MegatronBertConfig.from_pretrained(config_file) - checkpoint_file = basename + "_checkpoint.pt" - model = MegatronBertModel.from_pretrained(checkpoint_file, config=config) + directory = "nvidia/megatron-bert-uncased-345m" + if "MYDIR" in os.environ: + directory = os.path.join(os.environ["MYDIR"], directory) + model = MegatronBertModel.from_pretrained(directory) model.to(torch_device) + model.half() input_ids = _long_tensor([[101, 7110, 1005, 1056, 2023, 11333, 17413, 1029, 102]]) with torch.no_grad(): output = model(input_ids)[0] From 6b551fa16635ec67b4b933029ef7d03984abaa61 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Wed, 31 Mar 2021 05:51:56 -0700 Subject: [PATCH 08/42] Fix Flax/TF tests --- tests/test_modeling_megatron_bert.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/test_modeling_megatron_bert.py b/tests/test_modeling_megatron_bert.py index 4a816cdd4ba737..e0946c3985f7c9 100644 --- a/tests/test_modeling_megatron_bert.py +++ b/tests/test_modeling_megatron_bert.py @@ -44,7 +44,6 @@ ) -@require_torch class MegatronBertModelTester: def __init__( self, @@ -258,14 +257,18 @@ def prepare_config_and_inputs_for_common(self): class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( - MegatronBertModel, - MegatronBertForMaskedLM, - MegatronBertForMultipleChoice, - MegatronBertForNextSentencePrediction, - MegatronBertForPreTraining, - MegatronBertForQuestionAnswering, - MegatronBertForSequenceClassification, - MegatronBertForTokenClassification, + ( + MegatronBertModel, + MegatronBertForMaskedLM, + MegatronBertForMultipleChoice, + MegatronBertForNextSentencePrediction, + MegatronBertForPreTraining, + MegatronBertForQuestionAnswering, + MegatronBertForSequenceClassification, + MegatronBertForTokenClassification, + ) + if is_torch_available() + else () ) # test_resize_embeddings = False From 4236f00425383fa590f2f7fb84f23c426c510a6c Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Wed, 31 Mar 2021 23:44:51 -0700 Subject: [PATCH 09/42] Fix copyright --- .../models/megatron_bert/configuration_megatron_bert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 
b8664dec2b2160..d114c71c90b933 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2021-, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021- NVIDIA Corporation and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From d2c48dee8076e841b43bcc1a22e66603991951c3 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 08:47:06 +0200 Subject: [PATCH 10/42] Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut --- .../models/megatron_bert/configuration_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index d114c71c90b933..1f980d73a80d4e 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -60,7 +60,7 @@ class MegatronBertConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): The vocabulary size of the :obj:`token_type_ids` passed when calling - :class:`~transformers.MegatronBertModel` or :class:`~transformers.TFMegatronBertModel`. + :class:`~transformers.MegatronBertModel`. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): From 2f80114826daac0ad78f654517c3bad08fff6a8e Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 08:47:19 +0200 Subject: [PATCH 11/42] Update src/transformers/models/megatron_bert/configuration_megatron_bert.py Co-authored-by: Lysandre Debut --- .../models/megatron_bert/configuration_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 1f980d73a80d4e..58cdcb68c63ba4 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -37,7 +37,7 @@ class MegatronBertConfig(PretrainedConfig): Args: - vocab_size (:obj:`int`, `optional`, defaults to 30522): + vocab_size (:obj:`int`, `optional`, defaults to 29056): Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel`. 
hidden_size (:obj:`int`, `optional`, defaults to 1024): From 691466c60f98de96105eddff2c0e8488e054b78c Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 08:50:14 +0200 Subject: [PATCH 12/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 0cb2a2d0d1e996..ffdf26dfe9022b 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1058,7 +1058,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') - >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert-uncased-345m') + >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) From 35c91b89c3103b00f520f4912e9b33c287e93faf Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 08:50:36 +0200 Subject: [PATCH 13/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index ffdf26dfe9022b..4abd1c4d68df24 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1180,7 +1180,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-large-cased') - >>> config = MegatronBertConfig.from_pretrained("nvidia/megatron-bert-cased-345m") + >>> config = MegatronBertConfig.from_pretrained("nvidia/megatron-bert") >>> config.is_decoder = True >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert-cased-345m', config=config) From b159513366af0844fd1a22ef945309426d7c644a Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 08:50:50 +0200 Subject: [PATCH 14/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 4abd1c4d68df24..e95f170227ead6 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1182,7 +1182,7 @@ def forward( >>> tokenizer = BertTokenizer.from_pretrained('bert-large-cased') >>> config = MegatronBertConfig.from_pretrained("nvidia/megatron-bert") >>> config.is_decoder = True - >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert-cased-345m', config=config) + >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) From 75dbd92101cf740d04b9feca64764d1f1524a1b3 Mon Sep 17 00:00:00 2001 From: Julien Demouth 
Date: Thu, 1 Apr 2021 08:50:58 +0200 Subject: [PATCH 15/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Lysandre Debut --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index e95f170227ead6..81b185ac1c9177 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1405,7 +1405,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') - >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert-uncased-345m') + >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." From ba47704a60fa52cb52052554f9de6d723be18b42 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:55:24 +0200 Subject: [PATCH 16/42] Update docs/source/model_doc/megatron_bert.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/model_doc/megatron_bert.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst index a306547d4fc6ee..5140c57e41cb31 100644 --- a/docs/source/model_doc/megatron_bert.rst +++ b/docs/source/model_doc/megatron_bert.rst @@ -1,7 +1,5 @@ .. - Copyright 2020 The HuggingFace Team. All rights reserved. - - Copyright 2021 NVIDIA Corporation + Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -138,4 +136,3 @@ MegatronBertForQuestionAnswering :members: forward - From 7c69ccafbb0e7ed347f649ee16d8bf42c44ecef2 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:56:12 +0200 Subject: [PATCH 17/42] Update docs/source/model_doc/megatron_gpt2.rst Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/model_doc/megatron_gpt2.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/source/model_doc/megatron_gpt2.rst b/docs/source/model_doc/megatron_gpt2.rst index 23e4b242507efb..909bafe2a62570 100644 --- a/docs/source/model_doc/megatron_gpt2.rst +++ b/docs/source/model_doc/megatron_gpt2.rst @@ -1,7 +1,5 @@ .. - Copyright 2020 The HuggingFace Team. All rights reserved. - - Copyright 2021 NVIDIA Corporation + Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -66,4 +64,3 @@ The original code can be found `here `__. and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. 
- From ef5a4dd7d844af18dc7bf72167378f24f925c09f Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:57:08 +0200 Subject: [PATCH 18/42] Update src/transformers/models/megatron_bert/__init__.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/megatron_bert/__init__.py b/src/transformers/models/megatron_bert/__init__.py index 5b9eccaff9e6f5..4fe50c5c1c9409 100644 --- a/src/transformers/models/megatron_bert/__init__.py +++ b/src/transformers/models/megatron_bert/__init__.py @@ -2,8 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2020 The HuggingFace Team. All rights reserved. -# Copyright 2021 NVIDIA CORPORATION. All rights reserved. +# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 934bc8d4b09160507068201867dc4020233a7e4c Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:57:23 +0200 Subject: [PATCH 19/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 81b185ac1c9177..522fe53a01c66f 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1750,7 +1750,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert", + checkpoint=_CHECKPOINT_FOR_DOC, output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) From e3b4c2b615653dd357519b22a04cd296658e6149 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:57:46 +0200 Subject: [PATCH 20/42] Update src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index ece2909b634206..11bbe9f7b862c8 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -113,7 +113,7 @@ def convert_megatron_checkpoint(args, input_state_dict): weight_or_bias = m.group(3) # The name of the layer. - layer_name = "transformer.h.{}".format(layer_idx) + layer_name = f"transformer.h.{layer_idx}" # For layernorm(s), simply store the layer norm. 
if op_name.endswith("layernorm"): From f1efe7afa6f13cfe30e9d2a2cbc5be40b10ce6a8 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:57:59 +0200 Subject: [PATCH 21/42] Update src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index 11bbe9f7b862c8..dd7980fb73c602 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -220,7 +220,7 @@ def main(): # Store the config to file. output_config_file = os.path.join(basename, "config.json") - print('Saving config to "{}"'.format(output_config_file)) + print(f'Saving config to "{output_config_file}"') with open(output_config_file, "w") as f: json.dump(output_config, f) From 30164e90c9d13afca4edb95863ba51c02ac83022 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:58:12 +0200 Subject: [PATCH 22/42] Update src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index dd7980fb73c602..2d2d54b8123a99 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -226,7 +226,7 @@ def main(): # Store the state_dict to file. output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") - print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) + print(f'Saving checkpoint to "{output_checkpoint_file}"') torch.save(output_state_dict, output_checkpoint_file) From 4b4eb7cf025805eb61df2e081d0f17417a659a73 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:58:53 +0200 Subject: [PATCH 23/42] Update src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/megatron_bert/convert_megatron_bert_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index b8ac627d7eb94f..5ce505b19ec66b 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -235,7 +235,7 @@ def main(): basename = os.path.dirname(args.path_to_checkpoint) # Load the model. 
- print('Extracting PyTorch state dictionary from "{}"'.format(args.path_to_checkpoint)) + print(f'Extracting PyTorch state dictionary from "{args.path_to_checkpoint}"') with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint: with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict: input_state_dict = torch.load(pytorch_dict, map_location="cpu") From d20e62820b93ab98cb5d4b3cb39281182fc2b028 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:59:21 +0200 Subject: [PATCH 24/42] Update src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/megatron_bert/convert_megatron_bert_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 5ce505b19ec66b..2a100082323126 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -250,7 +250,7 @@ def main(): # Store the config to file. output_config_file = os.path.join(basename, "config.json") - print('Saving config to "{}"'.format(output_config_file)) + print(f'Saving config to "{output_config_file}"') with open(output_config_file, "w") as f: json.dump(output_config, f) From 1b02b4ebe363ebec043e3f77a4491675c071bd72 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 17:59:52 +0200 Subject: [PATCH 25/42] Update src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/megatron_bert/convert_megatron_bert_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 2a100082323126..0005018a028af7 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -256,7 +256,7 @@ def main(): # Store the state_dict to file. 
output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") - print('Saving checkpoint to "{}"'.format(output_checkpoint_file)) + print(f'Saving checkpoint to "{output_checkpoint_file}"') torch.save(output_state_dict, output_checkpoint_file) From 5a2b5553786a9a08c5b996a137a4293a0e745180 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:00:26 +0200 Subject: [PATCH 26/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 522fe53a01c66f..305d3b20777f46 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -88,7 +88,7 @@ def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path): names = [] arrays = [] for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) + logger.info(f"Loading TF weight {name} with shape {shape}") array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) From 19206aaea18692add4fb7e0f69928115627ddea5 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:00:57 +0200 Subject: [PATCH 27/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 305d3b20777f46..1dec2e777a89cc 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -101,7 +101,7 @@ def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path): n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): - logger.info("Skipping {}".format("/".join(name))) + logger.info(f"Skipping {'/'.join(name)}") continue pointer = model for m_name in name: From
1bf4b51728039ee93fd43fabee9890eb7d3bef4c Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:02:00 +0200 Subject: [PATCH 29/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 8b3010004ffa65..cf873507e0c817 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -400,7 +400,7 @@ def forward(self, hidden_states): return hidden_states -# Based transformers.models.bert.modeling_bert.BertOutput. Moved LayerNorm to MegatronBertLayer below. +# Based on transformers.models.bert.modeling_bert.BertOutput. Moved LayerNorm to MegatronBertLayer below. class MegatronBertOutput(nn.Module): def __init__(self, config): super().__init__() From 92d461d1b95b21cd6a8dbeac2f3ac8a3eff37ce3 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:02:37 +0200 Subject: [PATCH 30/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index cf873507e0c817..f2c6a00ccb3b76 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -413,7 +413,7 @@ def forward(self, hidden_states, input_tensor): return input_tensor + hidden_states -# Based transformers.models.bert.modeling_bert.BertLayer. Added LayerNorm. +# Based on transformers.models.bert.modeling_bert.BertLayer. Added LayerNorm. 
class MegatronBertLayer(nn.Module): def __init__(self, config): super().__init__() From 93096e7f7bbd5427270ea7fc287ec34e763dc0fc Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:03:12 +0200 Subject: [PATCH 31/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index f2c6a00ccb3b76..fa6034bfc469e1 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -869,7 +869,7 @@ class PreTrainedModel @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert", + checkpoint=_CHECKPOINT_FOR_DOC, output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC, ) From 11072bc13a6da8bbe0884da6689eea798fc0c43b Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:03:57 +0200 Subject: [PATCH 32/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../models/megatron_bert/modeling_megatron_bert.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index fa6034bfc469e1..5cfe7b02fd7cf1 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1180,9 +1180,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-large-cased') - >>> config = MegatronBertConfig.from_pretrained("nvidia/megatron-bert") - >>> config.is_decoder = True - >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert', config=config) + >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert', is_decoder=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) From acd1ee824dd9e974cfd693dc6c85a7338abc3e5d Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:04:23 +0200 Subject: [PATCH 33/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 5cfe7b02fd7cf1..3d70a8f3e14b98 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1279,7 +1279,7 @@ def set_output_embeddings(self, new_embeddings): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert", + checkpoint=_CHECKPOINT_FOR_DOC, output_type=MaskedLMOutput, 
config_class=_CONFIG_FOR_DOC, ) From 5f616b7fdb80965d70f7dc88bef5b79fff17996a Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:04:51 +0200 Subject: [PATCH 34/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 3d70a8f3e14b98..2bf72362b043ec 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1477,7 +1477,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert", + checkpoint=_CHECKPOINT_FOR_DOC, output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) From 5e24d731be0088c6b4051162441ad8dfa7892314 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:05:11 +0200 Subject: [PATCH 35/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 2bf72362b043ec..d843f72a8d323c 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1563,7 +1563,7 @@ def __init__(self, config): ) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert", + checkpoint=_CHECKPOINT_FOR_DOC, output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC, ) From 57ea6d33865c744c4793ca230a484f0577397b06 Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 18:05:34 +0200 Subject: [PATCH 36/42] Update src/transformers/models/megatron_bert/modeling_megatron_bert.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- src/transformers/models/megatron_bert/modeling_megatron_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index d843f72a8d323c..f66f48db4a51fb 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1658,7 +1658,7 @@ def __init__(self, config): @add_start_docstrings_to_model_forward(MEGATRON_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="nvidia/megatron-bert", + checkpoint=_CHECKPOINT_FOR_DOC, output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) From 74a82057fb38dcab2818bd092ee811e664702c7a Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 09:16:49 -0700 Subject: [PATCH 37/42] Resolve most of 'sgugger' comments --- docs/source/model_doc/megatron_bert.rst | 14 +++++++++++--- docs/source/model_doc/megatron_gpt2.rst | 4 
++++ src/transformers/__init__.py | 1 - .../convert_megatron_bert_checkpoint.py | 16 ++++++++-------- .../megatron_bert/modeling_megatron_bert.py | 2 +- 5 files changed, 24 insertions(+), 13 deletions(-) diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst index 5140c57e41cb31..07b50d99cb6974 100644 --- a/docs/source/model_doc/megatron_bert.rst +++ b/docs/source/model_doc/megatron_bert.rst @@ -51,11 +51,15 @@ Alternatively, you can directly download the checkpoints using: BERT-345M-uncased:: +.. code-block:: bash + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip -O megatron_bert_345m_v0_1_uncased.zip BERT-345M-cased:: +.. code-block:: bash + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O megatron_bert_345m_v0_1_cased.zip @@ -65,9 +69,13 @@ easily be loaded by Hugging Face Transformers and our port of the BERT code. The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains ``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder:: - python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py - megatron_bert_345m_v0_1_uncased.zip python3 - $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip +.. code-block:: bash + + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip + +.. code-block:: bash + + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip The original code can be found `here `__. That repository contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel diff --git a/docs/source/model_doc/megatron_gpt2.rst b/docs/source/model_doc/megatron_gpt2.rst index 909bafe2a62570..8a7659acd7ab89 100644 --- a/docs/source/model_doc/megatron_gpt2.rst +++ b/docs/source/model_doc/megatron_gpt2.rst @@ -49,6 +49,8 @@ Registry CLI. Further documentation for downloading models can be found in the ` Alternatively, you can directly download the checkpoints using:: +.. code-block:: bash + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_gpt2_345m_v0_0.zip @@ -58,6 +60,8 @@ be loaded by Hugging Face Transformers GPT2 implementation. The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains ``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder:: +.. code-block:: bash + python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip The original code can be found `here `__. 
That repository contains a multi-GPU diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 39b6bff008594c..51011a9822b6e3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1954,7 +1954,6 @@ MegatronBertForQuestionAnswering, MegatronBertForSequenceClassification, MegatronBertForTokenClassification, - MegatronBertLayer, MegatronBertModel, ) from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 0005018a028af7..5e65911cbc71e8 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -112,7 +112,7 @@ def convert_megatron_checkpoint(args, input_state_dict): weight_or_bias = m.group(3) # The name of the layer. - layer_name = "bert.encoder.layer.{}".format(layer_idx) + layer_name = f"bert.encoder.layer.{layer_idx}" # For layernorm(s), simply store the layer norm. if op_name.endswith("layernorm"): @@ -149,12 +149,12 @@ def convert_megatron_checkpoint(args, input_state_dict): self_attn_name = layer_name + ".attention.self" # Store. - output_state_dict[self_attn_name + ".query.weight"] = q - output_state_dict[self_attn_name + ".query.bias"] = q_bias - output_state_dict[self_attn_name + ".key.weight"] = k - output_state_dict[self_attn_name + ".key.bias"] = k_bias - output_state_dict[self_attn_name + ".value.weight"] = v - output_state_dict[self_attn_name + ".value.bias"] = v_bias + output_state_dict[f"self_attn_name.{query.weight}"] = q + output_state_dict[f"self_attn_name.{query.bias}"] = q_bias + output_state_dict[f"self_attn_name.{key.weight}"] = k + output_state_dict[f"self_attn_name.{key.bias}"] = k_bias + output_state_dict[f"self_attn_name.{value.weight}"] = v + output_state_dict[f"self_attn_name.{value.bias}"] = v_bias # Clear the stored tensor. attention_qkv_weight = None @@ -256,7 +256,7 @@ def main(): # Store the state_dict to file. 
output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") - print('Saving checkpoint to "{output_checkpoint_file}"') + print(f'Saving checkpoint to "{output_checkpoint_file}"') torch.save(output_state_dict, output_checkpoint_file) diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index f66f48db4a51fb..69dd5ec0c6e5ee 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -60,7 +60,7 @@ _CONFIG_FOR_DOC = "MegatronBertConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" - +_CHECKPOINT_FOR_DOC = "nvidia/megatron-bert" MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "nvidia/megatron-bert", From f155bc48ffd4606aab7461035e49c0318e57fd9c Mon Sep 17 00:00:00 2001 From: Julien Demouth Date: Thu, 1 Apr 2021 11:30:01 -0700 Subject: [PATCH 38/42] Fix conversion issue + Run make fix-copies/quality/docs --- .../convert_megatron_bert_checkpoint.py | 15 ++++++--------- src/transformers/utils/dummy_pt_objects.py | 5 ----- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 5e65911cbc71e8..3d7f03dcbb767c 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -145,16 +145,13 @@ def convert_megatron_checkpoint(args, input_state_dict): k_bias = val[1 * 1024 : 2 * 1024] v_bias = val[2 * 1024 : 3 * 1024] - # The name of the self attention block. - self_attn_name = layer_name + ".attention.self" - # Store. - output_state_dict[f"self_attn_name.{query.weight}"] = q - output_state_dict[f"self_attn_name.{query.bias}"] = q_bias - output_state_dict[f"self_attn_name.{key.weight}"] = k - output_state_dict[f"self_attn_name.{key.bias}"] = k_bias - output_state_dict[f"self_attn_name.{value.weight}"] = v - output_state_dict[f"self_attn_name.{value.bias}"] = v_bias + output_state_dict[f"{layer_name}.attention.self.query.weight"] = q + output_state_dict[f"{layer_name}.attention.self.query.bias"] = q_bias + output_state_dict[f"{layer_name}.attention.self.key.weight"] = k + output_state_dict[f"{layer_name}.attention.self.key.bias"] = k_bias + output_state_dict[f"{layer_name}.attention.self.value.weight"] = v + output_state_dict[f"{layer_name}.attention.self.value.bias"] = v_bias # Clear the stored tensor. 
attention_qkv_weight = None diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 988f49447ece6c..980cd009ff77da 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1869,11 +1869,6 @@ def from_pretrained(self, *args, **kwargs): requires_pytorch(self) -class MegatronBertLayer: - def __init__(self, *args, **kwargs): - requires_pytorch(self) - - class MegatronBertModel: def __init__(self, *args, **kwargs): requires_pytorch(self) From 487c5a0f9a79954869e3ada208a5ffdfeaa96380 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Wed, 7 Apr 2021 18:39:18 -0400 Subject: [PATCH 39/42] Apply suggestions from code review --- .../megatron_bert/configuration_megatron_bert.py | 2 +- .../megatron_bert/modeling_megatron_bert.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 58cdcb68c63ba4..19171e70da1bc2 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -30,7 +30,7 @@ class MegatronBertConfig(PretrainedConfig): This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. It is used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT - `bert-base-uncased `__ architecture. + `megatron-bert-uncased-345m `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. 
diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 69dd5ec0c6e5ee..71b7a4e2d656d1 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -60,10 +60,10 @@ _CONFIG_FOR_DOC = "MegatronBertConfig" _TOKENIZER_FOR_DOC = "BertTokenizer" -_CHECKPOINT_FOR_DOC = "nvidia/megatron-bert" +_CHECKPOINT_FOR_DOC = "nvidia/megatron-bert-cased-345m" MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "nvidia/megatron-bert", + "nvidia/megatron-bert-cased-345m", # See all MegatronBERT models at https://huggingface.co/models?filter=megatron_bert ] @@ -1057,8 +1057,8 @@ def forward( >>> from transformers import BertTokenizer, MegatronBertForPreTraining >>> import torch - >>> tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') - >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert') + >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m') + >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert-cased-345m') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1179,8 +1179,8 @@ def forward( >>> from transformers import BertTokenizer, MegatronBertLMHeadModel, MegatronBertConfig >>> import torch - >>> tokenizer = BertTokenizer.from_pretrained('bert-large-cased') - >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert', is_decoder=True) + >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m') + >>> model = MegatronBertLMHeadModel.from_pretrained('nvidia/megatron-bert-cased-345m', is_decoder=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1402,8 +1402,8 @@ def forward( >>> from transformers import BertTokenizer, MegatronBertForNextSentencePrediction >>> import torch - >>> tokenizer = BertTokenizer.from_pretrained('bert-large-uncased') - >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert') + >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m') + >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert-cased-345m') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
From 6a4367ee940ed89c8f8a06deed7f5f5de81505f7 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 7 Apr 2021 18:57:16 -0400 Subject: [PATCH 40/42] Causal LM & merge --- docs/source/model_doc/megatron_bert.rst | 7 ++++ src/transformers/__init__.py | 2 ++ src/transformers/models/auto/modeling_auto.py | 2 ++ .../models/megatron_bert/__init__.py | 12 ++----- .../megatron_bert/modeling_megatron_bert.py | 6 ++-- src/transformers/utils/dummy_pt_objects.py | 33 +++++++++++-------- tests/test_modeling_megatron_bert.py | 11 +++++++ utils/check_repo.py | 1 - 8 files changed, 47 insertions(+), 27 deletions(-) diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst index 07b50d99cb6974..853f09b9b42042 100644 --- a/docs/source/model_doc/megatron_bert.rst +++ b/docs/source/model_doc/megatron_bert.rst @@ -102,6 +102,13 @@ MegatronBertForMaskedLM :members: forward +MegatronBertForCausalLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.MegatronBertForCausalLM + :members: forward + + MegatronBertForNextSentencePrediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index aeb0408e16f9ce..2bc787f654c84e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -769,6 +769,7 @@ _import_structure["models.megatron_bert"].extend( [ "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "MegatronBertForCausalLM", "MegatronBertForMaskedLM", "MegatronBertForMultipleChoice", "MegatronBertForNextSentencePrediction", @@ -2016,6 +2017,7 @@ ) from .models.megatron_bert import ( MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MegatronBertForCausalLM, MegatronBertForMaskedLM, MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3d993bfeb43641..7d915f8515c074 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -175,6 +175,7 @@ MBartModel, ) from ..megatron_bert.modeling_megatron_bert import ( + MegatronBertForCausalLM, MegatronBertForMaskedLM, MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, @@ -470,6 +471,7 @@ (DebertaConfig, DebertaForMaskedLM), (DebertaV2Config, DebertaV2ForMaskedLM), (IBertConfig, IBertForMaskedLM), + (MegatronBertConfig, MegatronBertForCausalLM), ] ) diff --git a/src/transformers/models/megatron_bert/__init__.py b/src/transformers/models/megatron_bert/__init__.py index 4fe50c5c1c9409..714f1b1ecc78ad 100644 --- a/src/transformers/models/megatron_bert/__init__.py +++ b/src/transformers/models/megatron_bert/__init__.py @@ -17,20 +17,17 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...file_utils import _BaseLazyModule, is_tf_available, is_tokenizers_available, is_torch_available +from ...file_utils import _BaseLazyModule, is_torch_available _import_structure = { "configuration_megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], - "tokenization_megatron_bert": ["MegatronBertTokenizer"], } -if is_tokenizers_available(): - _import_structure["tokenization_megatron_bert_fast"] = ["MegatronBertTokenizerFast"] - if is_torch_available(): _import_structure["modeling_megatron_bert"] = [ "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "MegatronBertForCausalLM", "MegatronBertForMaskedLM", "MegatronBertForMultipleChoice", "MegatronBertForNextSentencePrediction", @@ -43,14 +40,11 @@ if TYPE_CHECKING: from .configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig - from .tokenization_megatron_bert import MegatronBertTokenizer - - if is_tokenizers_available(): - from .tokenization_megatron_bert_fast import MegatronBertTokenizerFast if is_torch_available(): from .modeling_megatron_bert import ( MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, + MegatronBertForCausalLM, MegatronBertForMaskedLM, MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 71b7a4e2d656d1..ce4ece3d32fb98 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -1107,7 +1107,7 @@ def forward( """MegatronBert Model with a `language modeling` head on top for CLM fine-tuning. """, MEGATRON_BERT_START_DOCSTRING, ) -class MegatronBertLMHeadModel(MegatronBertPreTrainedModel): +class MegatronBertForCausalLM(MegatronBertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1116,7 +1116,7 @@ def __init__(self, config): super().__init__(config) if not config.is_decoder: - logger.warning("If you want to use `MegatronBertLMHeadModel` as a standalone, add `is_decoder=True.`") + logger.warning("If you want to use `MegatronBertForCausalLM` as a standalone, add `is_decoder=True.`") self.bert = MegatronBertModel(config, add_pooling_layer=False) self.cls = MegatronBertOnlyMLMHead(config) @@ -1176,7 +1176,7 @@ def forward( Example:: - >>> from transformers import BertTokenizer, MegatronBertLMHeadModel, MegatronBertConfig + >>> from transformers import BertTokenizer, MegatronBertForCausalLM, MegatronBertConfig >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m') diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 79f4a674cc0950..ac8ee4d488c19d 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1843,68 +1843,73 @@ def from_pretrained(self, *args, **kwargs): MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None +class MegatronBertForCausalLM: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MegatronBertForMaskedLM: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MegatronBertForMultipleChoice: def __init__(self, 
*args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MegatronBertForNextSentencePrediction: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MegatronBertForPreTraining: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MegatronBertForQuestionAnswering: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MegatronBertForSequenceClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MegatronBertForTokenClassification: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MegatronBertModel: def __init__(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) @classmethod def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) + requires_backends(self, ["torch"]) class MMBTForClassification: diff --git a/tests/test_modeling_megatron_bert.py b/tests/test_modeling_megatron_bert.py index e0946c3985f7c9..3423f2d6f1aaf7 100644 --- a/tests/test_modeling_megatron_bert.py +++ b/tests/test_modeling_megatron_bert.py @@ -33,6 +33,7 @@ from transformers import ( MODEL_FOR_PRETRAINING_MAPPING, MegatronBertConfig, + MegatronBertForCausalLM, MegatronBertForMaskedLM, MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, @@ -154,6 +155,15 @@ def create_and_check_megatron_bert_for_masked_lm( result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + def create_and_check_for_causal_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MegatronBertForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + def create_and_check_megatron_bert_for_next_sequence_prediction( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): @@ -260,6 +270,7 @@ class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): ( MegatronBertModel, MegatronBertForMaskedLM, + MegatronBertForCausalLM, MegatronBertForMultipleChoice, MegatronBertForNextSentencePrediction, MegatronBertForPreTraining, diff --git a/utils/check_repo.py b/utils/check_repo.py index a99c62517789d3..9869133ce05657 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -88,7 +88,6 @@ "FlaubertForQuestionAnswering", "FunnelBaseModel", "GPT2DoubleHeadsModel", - "MegatronBertLMHeadModel", "OpenAIGPTDoubleHeadsModel", "RagModel", "RagSequenceForGeneration", From bae4340b0a787107b17d66e9e6ee30861e217ed9 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 7 Apr 2021 
19:12:16 -0400 Subject: [PATCH 41/42] Fix init --- src/transformers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2bc787f654c84e..60de5a817d5399 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -191,6 +191,7 @@ "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"], "models.marian": ["MarianConfig"], "models.mbart": ["MBartConfig"], + "models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], "models.megatron_bert": ["MegatronBertConfig"], "models.mmbt": ["MMBTConfig"], "models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"], From 8f7a942d3dab2bcf980dce83291bf6e609cbe3de Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 7 Apr 2021 20:36:44 -0400 Subject: [PATCH 42/42] Add CausalLM to last auto class --- src/transformers/__init__.py | 1 - src/transformers/models/auto/modeling_auto.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 60de5a817d5399..9108904b9c92b6 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -192,7 +192,6 @@ "models.marian": ["MarianConfig"], "models.mbart": ["MBartConfig"], "models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"], - "models.megatron_bert": ["MegatronBertConfig"], "models.mmbt": ["MMBTConfig"], "models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"], "models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"], diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 7d915f8515c074..64ff826a8ecaf4 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -503,6 +503,7 @@ (MarianConfig, MarianForCausalLM), (BlenderbotConfig, BlenderbotForCausalLM), (BlenderbotSmallConfig, BlenderbotSmallForCausalLM), + (MegatronBertConfig, MegatronBertForCausalLM), ] )
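
For reference, the attention handling that PATCH 37 broke and PATCH 38 repairs amounts to slicing the fused Megatron QKV tensor and storing each slice under the query/key/value keys that the Transformers state dict expects. The following is a minimal sketch of that step only, not the full conversion script: the hidden size matches the 345M checkpoints, `layer_name` is a hypothetical example of the `f"bert.encoder.layer.{layer_idx}"` prefix built by the script, and a random tensor stands in for the value read from the checkpoint.

    import torch

    hidden_size = 1024                    # hidden size of the 345M Megatron-BERT checkpoints
    layer_name = "bert.encoder.layer.0"   # example of the f"bert.encoder.layer.{layer_idx}" prefix

    # Placeholder for the fused QKV bias tensor read from the Megatron checkpoint.
    fused_qkv_bias = torch.randn(3 * hidden_size)

    # Slice the fused tensor into its query/key/value parts, as in PATCH 38.
    q_bias = fused_qkv_bias[0 * hidden_size : 1 * hidden_size]
    k_bias = fused_qkv_bias[1 * hidden_size : 2 * hidden_size]
    v_bias = fused_qkv_bias[2 * hidden_size : 3 * hidden_size]

    # Store each slice under the key layout MegatronBertModel expects.
    output_state_dict = {
        f"{layer_name}.attention.self.query.bias": q_bias,
        f"{layer_name}.attention.self.key.bias": k_bias,
        f"{layer_name}.attention.self.value.bias": v_bias,
    }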
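
Once a checkpoint has been converted, the classes introduced and renamed in this series load it like any other Transformers model. The sketch below is based on the final docstring examples; `nvidia/megatron-bert-cased-345m` is the identifier used there and would normally be replaced with the local folder written by the conversion script.

    import torch
    from transformers import BertTokenizer, MegatronBertForCausalLM

    checkpoint = "nvidia/megatron-bert-cased-345m"  # or the local folder produced by the conversion script

    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    # Per the warning added in this series, the standalone causal LM needs is_decoder=True.
    model = MegatronBertForCausalLM.from_pretrained(checkpoint, is_decoder=True)

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)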