From 3de37b60c907be2aff675d08775725ac15693bb9 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 13 Apr 2021 13:52:38 -0400 Subject: [PATCH 1/3] Indent code block --- utils/style_doc.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/utils/style_doc.py b/utils/style_doc.py index 57179e6347e9a4..b5f61f76e38d7d 100644 --- a/utils/style_doc.py +++ b/utils/style_doc.py @@ -49,6 +49,7 @@ _re_table = re.compile(r"(\+-+)+\+\s*$") # Matches a code block in rst `:: `. _re_code_block = re.compile(r"^\s*::\s*$") +_re_code_block_explicit = re.compile(r"^\.\.\s+code\-block::") # Matches any block of the form `.. something::` or `.. something:: bla`. _re_ignore = re.compile(r"^\s*\.\.\s+(.*?)\s*::\s*\S*\s*$") # Matches comment introduction in rst. @@ -374,6 +375,29 @@ def init_in_block(self, text): doc_styler = DocstringStyler() +def _reindent_code_blocks(text): + """Checks indent in code blocks is of four""" + lines = text.split("\n") + in_code_block = False + idx = 0 + while idx < len(lines): + # Detect if the line is the start of a new code-block. + if _re_code_block.search(lines[idx]) is not None or _re_code_block_explicit.search(lines[idx]) is not None: + while len(get_indent(lines[idx])) == 0: + idx += 1 + indent = len(get_indent(lines[idx])) + should_continue = True + while should_continue: + if len(lines[idx]) > 0 and indent < 4: + lines[idx] = " " * 4 + lines[idx][indent:] + idx += 1 + should_continue = (idx < len(lines)) and (len(lines[idx]) == 0 or len(get_indent(lines[idx])) > 0) + else: + idx += 1 + + return "\n".join(lines) + + def _add_new_lines_before_list(text): """Add a new empty line before a list begins.""" lines = text.split("\n") @@ -412,8 +436,10 @@ def style_rst_file(doc_file, max_len=119, check_only=False): with open(doc_file, "r", encoding="utf-8", newline="\n") as f: doc = f.read() + # Make sure code blocks are indented at 4 + clean_doc = _reindent_code_blocks(doc) # Add missing new lines before lists - clean_doc = _add_new_lines_before_list(doc) + clean_doc = _add_new_lines_before_list(clean_doc) # Style clean_doc = rst_styler.style(clean_doc, max_len=max_len) From c369b5f78b20f84b03cf2cd19ff0019513315230 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 13 Apr 2021 13:53:34 -0400 Subject: [PATCH 2/3] Indent code blocks version 2 --- docs/source/add_new_model.rst | 18 +- docs/source/converting_tensorflow_models.rst | 92 +++--- docs/source/glossary.rst | 2 +- docs/source/main_classes/trainer.rst | 316 +++++++++---------- docs/source/model_doc/bert_japanese.rst | 36 +-- docs/source/model_doc/bertgeneration.rst | 38 +-- docs/source/model_doc/bertweet.rst | 30 +- docs/source/model_doc/herbert.rst | 20 +- docs/source/model_doc/layoutlm.rst | 20 +- docs/source/model_doc/megatron_bert.rst | 12 +- docs/source/model_doc/megatron_gpt2.rst | 6 +- docs/source/model_doc/phobert.rst | 24 +- docs/source/model_doc/reformer.rst | 4 +- docs/source/model_doc/t5.rst | 16 +- docs/source/testing.rst | 78 ++--- 15 files changed, 356 insertions(+), 356 deletions(-) diff --git a/docs/source/add_new_model.rst b/docs/source/add_new_model.rst index c1474471c0ab31..a7d47b600e914f 100644 --- a/docs/source/add_new_model.rst +++ b/docs/source/add_new_model.rst @@ -388,7 +388,7 @@ Next, you can finally start adding new code to 🤗 Transformers. 
Go into the cl :: - cd transformers + cd transformers In the special case that you are adding a model whose architecture exactly matches the model architecture of an existing model you only have to add a conversion script as described in `this section <#write-a-conversion-script>`__. @@ -417,27 +417,27 @@ You should do the following: :: - git checkout -b add_brand_new_bert + git checkout -b add_brand_new_bert 2. Commit the automatically generated code: :: - git add . - git commit + git add . + git commit 3. Fetch and rebase to current master :: - git fetch upstream - git rebase upstream/master + git fetch upstream + git rebase upstream/master 4. Push the changes to your account using: :: - git push -u origin a-descriptive-name-for-my-changes + git push -u origin a-descriptive-name-for-my-changes 5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for @@ -451,8 +451,8 @@ time to time by doing: :: - git fetch upstream - git merge upstream/master + git fetch upstream + git merge upstream/master In general, all questions you might have regarding the model or your implementation should be asked in your PR and discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst index e04ccdee2a209b..95c0c15371d120 100644 --- a/docs/source/converting_tensorflow_models.rst +++ b/docs/source/converting_tensorflow_models.rst @@ -47,12 +47,12 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas .. code-block:: shell - export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 + export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12 - transformers-cli convert --model_type bert \ - --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ - --config $BERT_BASE_DIR/bert_config.json \ - --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin + transformers-cli convert --model_type bert \ + --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \ + --config $BERT_BASE_DIR/bert_config.json \ + --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin You can download Google's pre-trained models for the conversion `here `__. @@ -72,12 +72,12 @@ Here is an example of the conversion process for the pre-trained ``ALBERT Base`` .. code-block:: shell - export ALBERT_BASE_DIR=/path/to/albert/albert_base + export ALBERT_BASE_DIR=/path/to/albert/albert_base - transformers-cli convert --model_type albert \ - --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ - --config $ALBERT_BASE_DIR/albert_config.json \ - --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin + transformers-cli convert --model_type albert \ + --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \ + --config $ALBERT_BASE_DIR/albert_config.json \ + --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin You can download Google's pre-trained models for the conversion `here `__. 
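Once the conversion above has produced ``$BERT_BASE_DIR/pytorch_model.bin``, the checkpoint can be consumed with the regular ``transformers`` API. The snippet below is a sketch and not part of the documentation being patched; it assumes the same ``BERT_BASE_DIR`` layout as the shell example above and that the conversion used the ``BertForPreTraining`` architecture employed by the conversion script.

.. code-block:: python

    import os

    from transformers import BertConfig, BertForPreTraining

    # Same directory as in the shell example above; bert_config.json and
    # pytorch_model.bin are assumed to live there after the conversion.
    bert_base_dir = os.environ["BERT_BASE_DIR"]

    config = BertConfig.from_json_file(os.path.join(bert_base_dir, "bert_config.json"))
    model = BertForPreTraining.from_pretrained(
        os.path.join(bert_base_dir, "pytorch_model.bin"), config=config
    )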
@@ -91,13 +91,13 @@ save as the same format than OpenAI pretrained model (see `here >> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP] + >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP] We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two arguments (and not a list, like before) like this: diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index aae325076cec8a..a046f5a485e3bd 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -293,33 +293,33 @@ with it, you may want to try one of: .. code-block:: bash - pip install fairscale --no-build-isolation . + pip install fairscale --no-build-isolation . or: .. code-block:: bash - git clone https://github.com/facebookresearch/fairscale/ - cd fairscale - rm -r dist build - python setup.py bdist_wheel - pip uninstall -y fairscale - pip install dist/fairscale-*.whl + git clone https://github.com/facebookresearch/fairscale/ + cd fairscale + rm -r dist build + python setup.py bdist_wheel + pip uninstall -y fairscale + pip install dist/fairscale-*.whl ``fairscale`` also has issues with building against pytorch-nightly, so if you use it you may have to try one of: .. code-block:: bash - pip uninstall -y fairscale; pip install fairscale --pre \ - -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html \ - --no-cache --no-build-isolation + pip uninstall -y fairscale; pip install fairscale --pre \ + -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html \ + --no-cache --no-build-isolation or: .. code-block:: bash - pip install -v --disable-pip-version-check . \ - -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html --pre + pip install -v --disable-pip-version-check . \ + -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html --pre Of course, adjust the urls to match the cuda version you use. @@ -447,12 +447,12 @@ To make a local build for DeepSpeed: .. code-block:: bash - git clone https://github.com/microsoft/DeepSpeed/ - cd DeepSpeed - rm -rf build - TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \ - --global-option="build_ext" --global-option="-j8" --no-cache -v \ - --disable-pip-version-check 2>&1 | tee build.log + git clone https://github.com/microsoft/DeepSpeed/ + cd DeepSpeed + rm -rf build + TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \ + --global-option="build_ext" --global-option="-j8" --no-cache -v \ + --disable-pip-version-check 2>&1 | tee build.log Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use. @@ -460,11 +460,11 @@ Or if you need to use the same setup on multiple machines, make a binary wheel: .. code-block:: bash - git clone https://github.com/microsoft/DeepSpeed/ - cd DeepSpeed - rm -rf build - TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \ - python setup.py build_ext -j8 bdist_wheel + git clone https://github.com/microsoft/DeepSpeed/ + cd DeepSpeed + rm -rf build + TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \ + python setup.py build_ext -j8 bdist_wheel it will generate something like ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` which now you can install as ``pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` locally or on any other machine. @@ -478,20 +478,20 @@ You can check the archs pytorch was built with using: .. 
code-block:: bash - python -c "import torch; print(torch.cuda.get_arch_list())" + python -c "import torch; print(torch.cuda.get_arch_list())" Here is how to find out the arch for one of the installed GPU. For example, for GPU 0: .. code-block:: bash - CUDA_VISIBLE_DEVICES=0 python -c "import torch; \ - print(torch.cuda.get_device_properties(torch.device('cuda')))" + CUDA_VISIBLE_DEVICES=0 python -c "import torch; \ + print(torch.cuda.get_device_properties(torch.device('cuda')))" If the output is: .. code-block:: bash - _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82) + _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82) then you know that this card's arch is ``8.6``. @@ -591,18 +591,18 @@ with DeepSpeed is to have at least the following configuration in the configurat .. code-block:: json - { - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "overlap_comm": true, - "contiguous_gradients": true, - "cpu_offload": true - }, - } + { + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "overlap_comm": true, + "contiguous_gradients": true, + "cpu_offload": true + }, + } which enables ``cpu_offload`` and some other important features. You may experiment with the buffer sizes, you will find more details in the discussion below. @@ -710,18 +710,18 @@ shell from a cell. For example, to use ``run_translation.py`` you would launch i .. code-block:: - !git clone https://github.com/huggingface/transformers - !cd transformers; deepspeed examples/seq2seq/run_translation.py ... + !git clone https://github.com/huggingface/transformers + !cd transformers; deepspeed examples/seq2seq/run_translation.py ... or with ``%%bash`` magic, where you can write a multi-line code for the shell program to run: .. code-block:: - %%bash + %%bash - git clone https://github.com/huggingface/transformers - cd transformers - deepspeed examples/seq2seq/run_translation.py ... + git clone https://github.com/huggingface/transformers + cd transformers + deepspeed examples/seq2seq/run_translation.py ... In such case you don't need any of the code presented at the beginning of this section. @@ -743,16 +743,16 @@ repo `__: .. code-block:: bash - git clone https://github.com/microsoft/DeepSpeedExamples - cd DeepSpeedExamples - find . -name '*json' + git clone https://github.com/microsoft/DeepSpeedExamples + cd DeepSpeedExamples + find . -name '*json' Continuing the code from above, let's say you're looking to configure the Lamb optimizer. So you can search through the example ``.json`` files with: .. code-block:: bash - grep -i Lamb $(find . -name '*json') + grep -i Lamb $(find . -name '*json') Some more examples are to be found in the `main repo `__ as well. @@ -1020,49 +1020,49 @@ Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: .. 
code-block:: json - { - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true, - "cpu_offload": true - }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, - "steps_per_print": 2000, - "wall_clock_breakdown": false - } + "steps_per_print": 2000, + "wall_clock_breakdown": false + } @@ -1073,54 +1073,54 @@ Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: .. code-block:: json - { - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, - "zero_optimization": { - "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "cpu_offload_use_pin_memory" : true, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": 1e6, - "stage3_prefetch_bucket_size": 0.94e6, - "stage3_param_persistence_threshold": 1e4, - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - }, + "zero_optimization": { + "stage": 3, + "cpu_offload": true, + "cpu_offload_params": true, + "cpu_offload_use_pin_memory" : true, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 1e6, + "stage3_prefetch_bucket_size": 0.94e6, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 3e-5, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 3e-5, + "warmup_num_steps": 500 + } + }, - "steps_per_print": 2000, - "wall_clock_breakdown": false - } + "steps_per_print": 2000, + "wall_clock_breakdown": false + } Optimizer and Scheduler @@ -1367,26 +1367,26 @@ 
Let's say your checkpoint folder looks like this: .. code-block:: bash - $ ls -l output_dir/checkpoint-1/ - -rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json - drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ - -rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest - -rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt - -rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin - -rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt - -rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json - -rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model - -rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json - -rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json - -rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin - -rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* + $ ls -l output_dir/checkpoint-1/ + -rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json + drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ + -rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest + -rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt + -rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin + -rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt + -rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json + -rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model + -rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json + -rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json + -rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin + -rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* In this example there is just one DeepSpeed checkpoint sub-folder `global_step1`. Therefore to reconstruct the fp32 weights just run: .. code-block:: bash - python zero_to_fp32.py global_step1 pytorch_model.bin + python zero_to_fp32.py global_step1 pytorch_model.bin The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint. @@ -1416,18 +1416,18 @@ be seen in the following example: .. code-block:: python - class ModuleZ3(torch.nn.Module): - def __init__(self, *args): - super().__init__(self, *args) - self.layer1 = SomeLayer() - self.layer2 = OtherLayer() - deepspeed.zero.register_external_parameter(self, self.layer1.weight) + class ModuleZ3(torch.nn.Module): + def __init__(self, *args): + super().__init__(self, *args) + self.layer1 = SomeLayer() + self.layer2 = OtherLayer() + deepspeed.zero.register_external_parameter(self, self.layer1.weight) - def forward(self, input): - x = self.layer1(input) - # self.layer1.weight is needed in ModuleZ3.forward - y = self.layer2(x, self.layer1.weight) - return y + def forward(self, input): + x = self.layer1(input) + # self.layer1.weight is needed in ModuleZ3.forward + y = self.layer2(x, self.layer1.weight) + return y In general ``transformers`` models don't use this style of referring to other layer's weights so most likely you won't need to use it. @@ -1494,7 +1494,7 @@ Also under ZeRO-3, if you write your own code and run into a model parameter wei .. code-block:: python - tensor([1.], device='cuda:0', dtype=torch.float16, requires_grad=True) + tensor([1.], device='cuda:0', dtype=torch.float16, requires_grad=True) stress on ``tensor([1.])``, or if you get an error where it says the parameter is of size ``1``, instead of some much larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder. 
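When you hit such a ZeRO-3 placeholder in your own code, one way to look at the real weight is to gather it temporarily. The following is a minimal sketch, not part of the patched documentation; it assumes ``model`` is already managed by a DeepSpeed ZeRO-3 engine and uses a hypothetical ``layer1`` attribute purely for illustration.

.. code-block:: python

    import deepspeed

    # Outside of this context manager, model.layer1.weight (a hypothetical
    # submodule) is a partitioned ZeRO-3 placeholder such as tensor([1.]).
    # GatheredParameters all-gathers the full parameter for the duration of
    # the block; with modifier_rank=None it is treated as read-only.
    with deepspeed.zero.GatheredParameters(model.layer1.weight, modifier_rank=None):
        print(model.layer1.weight.shape)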
diff --git a/docs/source/model_doc/bert_japanese.rst b/docs/source/model_doc/bert_japanese.rst index b078d4cba70a15..586d26ed66b5f5 100644 --- a/docs/source/model_doc/bert_japanese.rst +++ b/docs/source/model_doc/bert_japanese.rst @@ -33,38 +33,38 @@ Example of using a model with MeCab and WordPiece tokenization: .. code-block:: - >>> import torch - >>> from transformers import AutoModel, AutoTokenizer + >>> import torch + >>> from transformers import AutoModel, AutoTokenizer - >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese") - >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese") + >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese") + >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese") - >>> ## Input Japanese Text - >>> line = "吾輩は猫である。" + >>> ## Input Japanese Text + >>> line = "吾輩は猫である。" - >>> inputs = tokenizer(line, return_tensors="pt") + >>> inputs = tokenizer(line, return_tensors="pt") - >>> print(tokenizer.decode(inputs['input_ids'][0])) - [CLS] 吾輩 は 猫 で ある 。 [SEP] + >>> print(tokenizer.decode(inputs['input_ids'][0])) + [CLS] 吾輩 は 猫 で ある 。 [SEP] - >>> outputs = bertjapanese(**inputs) + >>> outputs = bertjapanese(**inputs) Example of using a model with Character tokenization: .. code-block:: - >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char") - >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char") + >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char") + >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char") - >>> ## Input Japanese Text - >>> line = "吾輩は猫である。" + >>> ## Input Japanese Text + >>> line = "吾輩は猫である。" - >>> inputs = tokenizer(line, return_tensors="pt") + >>> inputs = tokenizer(line, return_tensors="pt") - >>> print(tokenizer.decode(inputs['input_ids'][0])) - [CLS] 吾 輩 は 猫 で あ る 。 [SEP] + >>> print(tokenizer.decode(inputs['input_ids'][0])) + [CLS] 吾 輩 は 猫 で あ る 。 [SEP] - >>> outputs = bertjapanese(**inputs) + >>> outputs = bertjapanese(**inputs) Tips: diff --git a/docs/source/model_doc/bertgeneration.rst b/docs/source/model_doc/bertgeneration.rst index 6099385bea4cd3..7c8433806098da 100644 --- a/docs/source/model_doc/bertgeneration.rst +++ b/docs/source/model_doc/bertgeneration.rst @@ -38,22 +38,22 @@ Usage: .. code-block:: - # leverage checkpoints for Bert2Bert model... - # use BERT's cls token as BOS token and sep token as EOS token - encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102) - # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token - decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102) - bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder) + # leverage checkpoints for Bert2Bert model... + # use BERT's cls token as BOS token and sep token as EOS token + encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102) + # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token + decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102) + bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder) - # create tokenizer... 
- tokenizer = BertTokenizer.from_pretrained("bert-large-uncased") + # create tokenizer... + tokenizer = BertTokenizer.from_pretrained("bert-large-uncased") - input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids - labels = tokenizer('This is a short summary', return_tensors="pt").input_ids + input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids + labels = tokenizer('This is a short summary', return_tensors="pt").input_ids - # train... - loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss - loss.backward() + # train... + loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss + loss.backward() - Pretrained :class:`~transformers.EncoderDecoderModel` are also directly available in the model hub, e.g., @@ -61,15 +61,15 @@ Usage: .. code-block:: - # instantiate sentence fusion model - sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse") - tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse") + # instantiate sentence fusion model + sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse") + tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse") - input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids + input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids - outputs = sentence_fuser.generate(input_ids) + outputs = sentence_fuser.generate(input_ids) - print(tokenizer.decode(outputs[0])) + print(tokenizer.decode(outputs[0])) Tips: diff --git a/docs/source/model_doc/bertweet.rst b/docs/source/model_doc/bertweet.rst index 4fe1470def8329..b1d35d3a68d80f 100644 --- a/docs/source/model_doc/bertweet.rst +++ b/docs/source/model_doc/bertweet.rst @@ -31,28 +31,28 @@ Example of use: .. code-block:: - import torch - from transformers import AutoModel, AutoTokenizer + import torch + from transformers import AutoModel, AutoTokenizer - bertweet = AutoModel.from_pretrained("vinai/bertweet-base") + bertweet = AutoModel.from_pretrained("vinai/bertweet-base") - # For transformers v4.x+: - tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False) + # For transformers v4.x+: + tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False) - # For transformers v3.x: - # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base") + # For transformers v3.x: + # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base") - # INPUT TWEET IS ALREADY NORMALIZED! - line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:" + # INPUT TWEET IS ALREADY NORMALIZED! 
+ line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:" - input_ids = torch.tensor([tokenizer.encode(line)]) + input_ids = torch.tensor([tokenizer.encode(line)]) - with torch.no_grad(): - features = bertweet(input_ids) # Models outputs are now tuples + with torch.no_grad(): + features = bertweet(input_ids) # Models outputs are now tuples - ## With TensorFlow 2.0+: - # from transformers import TFAutoModel - # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base") + ## With TensorFlow 2.0+: + # from transformers import TFAutoModel + # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base") The original code can be found `here `__. diff --git a/docs/source/model_doc/herbert.rst b/docs/source/model_doc/herbert.rst index 1a975897e21796..2b94b957d153f7 100644 --- a/docs/source/model_doc/herbert.rst +++ b/docs/source/model_doc/herbert.rst @@ -40,20 +40,20 @@ Examples of use: .. code-block:: - from transformers import HerbertTokenizer, RobertaModel + from transformers import HerbertTokenizer, RobertaModel - tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1") - model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1") + tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1") + model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1") - encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt') - outputs = model(encoded_input) + encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt') + outputs = model(encoded_input) - # HerBERT can also be loaded using AutoTokenizer and AutoModel: - import torch - from transformers import AutoModel, AutoTokenizer + # HerBERT can also be loaded using AutoTokenizer and AutoModel: + import torch + from transformers import AutoModel, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1") - model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1") + tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1") + model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1") The original code can be found `here `__. diff --git a/docs/source/model_doc/layoutlm.rst b/docs/source/model_doc/layoutlm.rst index 4d4fd34a5dbf2d..6c537f236c43f3 100644 --- a/docs/source/model_doc/layoutlm.rst +++ b/docs/source/model_doc/layoutlm.rst @@ -56,24 +56,24 @@ Tips: .. code-block:: - def normalize_bbox(bbox, width, height): - return [ - int(1000 * (bbox[0] / width)), - int(1000 * (bbox[1] / height)), - int(1000 * (bbox[2] / width)), - int(1000 * (bbox[3] / height)), - ] + def normalize_bbox(bbox, width, height): + return [ + int(1000 * (bbox[0] / width)), + int(1000 * (bbox[1] / height)), + int(1000 * (bbox[2] / width)), + int(1000 * (bbox[3] / height)), + ] Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token occurs. Those can be obtained using the Python Image Library (PIL) library for example, as follows: .. 
code-block:: - from PIL import Image + from PIL import Image - image = Image.open("name_of_your_document - can be a png file, pdf, etc.") + image = Image.open("name_of_your_document - can be a png file, pdf, etc.") - width, height = image.size + width, height = image.size - For a demo which shows how to fine-tune :class:`LayoutLMForTokenClassification` on the `FUNSD dataset `__ (a collection of annotated forms), see `this notebook diff --git a/docs/source/model_doc/megatron_bert.rst b/docs/source/model_doc/megatron_bert.rst index 853f09b9b42042..7e6262981f5248 100644 --- a/docs/source/model_doc/megatron_bert.rst +++ b/docs/source/model_doc/megatron_bert.rst @@ -53,15 +53,15 @@ BERT-345M-uncased:: .. code-block:: bash - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip - -O megatron_bert_345m_v0_1_uncased.zip + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip + -O megatron_bert_345m_v0_1_uncased.zip BERT-345M-cased:: .. code-block:: bash - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O - megatron_bert_345m_v0_1_cased.zip + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O + megatron_bert_345m_v0_1_cased.zip Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will easily be loaded by Hugging Face Transformers and our port of the BERT code. @@ -71,11 +71,11 @@ The following commands allow you to do the conversion. We assume that the folder .. code-block:: bash - python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip .. code-block:: bash - python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip + python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip The original code can be found `here `__. That repository contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel diff --git a/docs/source/model_doc/megatron_gpt2.rst b/docs/source/model_doc/megatron_gpt2.rst index 8a7659acd7ab89..67ec7227fa9ce4 100644 --- a/docs/source/model_doc/megatron_gpt2.rst +++ b/docs/source/model_doc/megatron_gpt2.rst @@ -51,8 +51,8 @@ Alternatively, you can directly download the checkpoints using:: .. code-block:: bash - wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O - megatron_gpt2_345m_v0_0.zip + wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O + megatron_gpt2_345m_v0_0.zip Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily be loaded by Hugging Face Transformers GPT2 implementation. @@ -62,7 +62,7 @@ The following command allows you to do the conversion. We assume that the folder .. 
code-block:: bash - python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip + python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip The original code can be found `here `__. That repository contains a multi-GPU and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel diff --git a/docs/source/model_doc/phobert.rst b/docs/source/model_doc/phobert.rst index 5ef99b40801d2e..95e12877a3922d 100644 --- a/docs/source/model_doc/phobert.rst +++ b/docs/source/model_doc/phobert.rst @@ -31,23 +31,23 @@ Example of use: .. code-block:: - import torch - from transformers import AutoModel, AutoTokenizer + import torch + from transformers import AutoModel, AutoTokenizer - phobert = AutoModel.from_pretrained("vinai/phobert-base") - tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base") + phobert = AutoModel.from_pretrained("vinai/phobert-base") + tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base") - # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED! - line = "Tôi là sinh_viên trường đại_học Công_nghệ ." + # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED! + line = "Tôi là sinh_viên trường đại_học Công_nghệ ." - input_ids = torch.tensor([tokenizer.encode(line)]) + input_ids = torch.tensor([tokenizer.encode(line)]) - with torch.no_grad(): - features = phobert(input_ids) # Models outputs are now tuples + with torch.no_grad(): + features = phobert(input_ids) # Models outputs are now tuples - ## With TensorFlow 2.0+: - # from transformers import TFAutoModel - # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") + ## With TensorFlow 2.0+: + # from transformers import TFAutoModel + # phobert = TFAutoModel.from_pretrained("vinai/phobert-base") The original code can be found `here `__. diff --git a/docs/source/model_doc/reformer.rst b/docs/source/model_doc/reformer.rst index c46bd2bb7480ed..9fa45076b31a3a 100644 --- a/docs/source/model_doc/reformer.rst +++ b/docs/source/model_doc/reformer.rst @@ -145,8 +145,8 @@ For training, the :class:`~transformers.ReformerModelWithLMHead` should be used .. code-block:: - input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt') - loss = model(input_ids, labels=input_ids)[0] + input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt') + loss = model(input_ids, labels=input_ids)[0] ReformerConfig diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst index 27425218d27dfd..b400401ebd171b 100644 --- a/docs/source/model_doc/t5.rst +++ b/docs/source/model_doc/t5.rst @@ -73,10 +73,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash .. code-block:: - input_ids = tokenizer('The walks in park', return_tensors='pt').input_ids - labels = tokenizer(' cute dog the ', return_tensors='pt').input_ids - # the forward function automatically creates the correct decoder_input_ids - loss = model(input_ids=input_ids, labels=labels).loss + input_ids = tokenizer('The walks in park', return_tensors='pt').input_ids + labels = tokenizer(' cute dog the ', return_tensors='pt').input_ids + # the forward function automatically creates the correct decoder_input_ids + loss = model(input_ids=input_ids, labels=labels).loss - Supervised training @@ -86,10 +86,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash .. 
code-block:: - input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids - labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids - # the forward function automatically creates the correct decoder_input_ids - loss = model(input_ids=input_ids, labels=labels).loss + input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids + labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids + # the forward function automatically creates the correct decoder_input_ids + loss = model(input_ids=input_ids, labels=labels).loss T5Config diff --git a/docs/source/testing.rst b/docs/source/testing.rst index 9a4efb06fcb85f..72bd6840c194ae 100644 --- a/docs/source/testing.rst +++ b/docs/source/testing.rst @@ -70,19 +70,19 @@ Run all: .. code-block:: console - pytest + pytest or: .. code-block:: bash - make test + make test Note that the latter is defined as: .. code-block:: bash - python -m pytest -n auto --dist=loadfile -s -v ./tests/ + python -m pytest -n auto --dist=loadfile -s -v ./tests/ which tells pytest to: @@ -100,13 +100,13 @@ All tests of the test suite: .. code-block:: bash - pytest --collect-only -q + pytest --collect-only -q All tests of a given test file: .. code-block:: bash - pytest tests/test_optimization.py --collect-only -q + pytest tests/test_optimization.py --collect-only -q @@ -117,7 +117,7 @@ To run an individual test module: .. code-block:: bash - pytest tests/test_logging.py + pytest tests/test_logging.py Run specific tests @@ -128,7 +128,7 @@ class containing those tests. For example, it could be: .. code-block:: bash - pytest tests/test_optimization.py::OptimizationTest::test_adam_w + pytest tests/test_optimization.py::OptimizationTest::test_adam_w Here: @@ -140,7 +140,7 @@ If the file contains multiple classes, you can choose to run only tests of a giv .. code-block:: bash - pytest tests/test_optimization.py::OptimizationTest + pytest tests/test_optimization.py::OptimizationTest will run all the tests inside that class. @@ -149,7 +149,7 @@ As mentioned earlier you can see what tests are contained inside the ``Optimizat .. code-block:: bash - pytest tests/test_optimization.py::OptimizationTest --collect-only -q + pytest tests/test_optimization.py::OptimizationTest --collect-only -q You can run tests by keyword expressions. @@ -157,7 +157,7 @@ To run only tests whose name contains ``adam``: .. code-block:: bash - pytest -k adam tests/test_optimization.py + pytest -k adam tests/test_optimization.py Logical ``and`` and ``or`` can be used to indicate whether all keywords should match or either. ``not`` can be used to negate. @@ -166,19 +166,19 @@ To run all tests except those whose name contains ``adam``: .. code-block:: bash - pytest -k "not adam" tests/test_optimization.py + pytest -k "not adam" tests/test_optimization.py And you can combine the two patterns in one: .. code-block:: bash - pytest -k "ada and not adam" tests/test_optimization.py + pytest -k "ada and not adam" tests/test_optimization.py For example to run both ``test_adafactor`` and ``test_adam_w`` you can use: .. code-block:: bash - pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py + pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py Note that we use ``or`` here, since we want either of the keywords to match to include both. @@ -186,7 +186,7 @@ If you want to include only tests that include both patterns, ``and`` is to be u .. 
code-block:: bash - pytest -k "test and ada" tests/test_optimization.py + pytest -k "test and ada" tests/test_optimization.py @@ -251,7 +251,7 @@ example, to run all except ``test_modeling_*.py`` tests: .. code-block:: bash - pytest `ls -1 tests/*py | grep -v test_modeling` + pytest `ls -1 tests/*py | grep -v test_modeling` Clearing state @@ -292,13 +292,13 @@ Repeat tests .. code-block:: bash - pip install pytest-flakefinder + pip install pytest-flakefinder And then run every test multiple times (50 by default): .. code-block:: bash - pytest --flake-finder --flake-runs=5 tests/test_failing_test.py + pytest --flake-finder --flake-runs=5 tests/test_failing_test.py .. note:: This plugin doesn't work with ``-n`` flag from ``pytest-xdist``. @@ -322,19 +322,19 @@ As explained earlier this allows detection of coupled tests - where one test's s .. code-block:: bash - pytest tests - [...] - Using --random-order-bucket=module - Using --random-order-seed=573663 + pytest tests + [...] + Using --random-order-bucket=module + Using --random-order-seed=573663 So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.: .. code-block:: bash - pytest --random-order-seed=573663 - [...] - Using --random-order-bucket=module - Using --random-order-seed=573663 + pytest --random-order-seed=573663 + [...] + Using --random-order-bucket=module + Using --random-order-seed=573663 It will only reproduce the exact order if you use the exact same list of tests (or no list at all). Once you start to manually narrowing down the list you can no longer rely on the seed, but have to list them manually in the exact order @@ -342,7 +342,7 @@ they failed and tell pytest to not randomize them instead using ``--random-order .. code-block:: bash - pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py + pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py To disable the shuffling for all tests: @@ -369,7 +369,7 @@ progressbar, and show tests that fail and the assert instantly. It gets activate .. code-block:: bash - pip install pytest-sugar + pip install pytest-sugar To run tests without it, run: @@ -388,7 +388,7 @@ For a single or a group of tests via ``pytest`` (after ``pip install pytest-pspe .. code-block:: bash - pytest --pspec tests/test_optimization.py + pytest --pspec tests/test_optimization.py @@ -490,8 +490,8 @@ Inside tests: .. code-block:: bash - from transformers.testing_utils import get_gpu_count - n_gpu = get_gpu_count() # works with torch and tf + from transformers.testing_utils import get_gpu_count + n_gpu = get_gpu_count() # works with torch and tf @@ -514,8 +514,8 @@ You will need at least 2 GPUs to see these tests in action: .. code-block:: bash - CUDA_VISIBLE_DEVICES="0,1" RUN_SLOW=1 pytest -sv examples/seq2seq/test_finetune_trainer.py \ - examples/seq2seq/test_seq2seq_examples_multi_gpu.py + CUDA_VISIBLE_DEVICES="0,1" RUN_SLOW=1 pytest -sv examples/seq2seq/test_finetune_trainer.py \ + examples/seq2seq/test_seq2seq_examples_multi_gpu.py Output capture @@ -528,13 +528,13 @@ To disable output capturing and to get the ``stdout`` and ``stderr`` normally, u .. code-block:: bash - pytest -s tests/test_logging.py + pytest -s tests/test_logging.py To send test results to JUnit format output: .. code-block:: bash - py.test tests --junitxml=result.xml + py.test tests --junitxml=result.xml Color control @@ -544,7 +544,7 @@ To have no color (e.g., yellow on white background is not readable): .. 
code-block:: bash - pytest --color=no tests/test_logging.py + pytest --color=no tests/test_logging.py @@ -555,7 +555,7 @@ Creating a URL for each test failure: .. code-block:: bash - pytest --pastebin=failed tests/test_logging.py + pytest --pastebin=failed tests/test_logging.py This will submit test run information to a remote Paste service and provide a URL for each failure. You may select tests as usual or add for example -x if you only want to send one particular failure. @@ -564,7 +564,7 @@ Creating a URL for a whole test session log: .. code-block:: bash - pytest --pastebin=all tests/test_logging.py + pytest --pastebin=all tests/test_logging.py @@ -606,13 +606,13 @@ and you could run just the ``negative`` and ``integer`` sets of params with: .. code-block:: bash - pytest -k "negative and integer" tests/test_mytest.py + pytest -k "negative and integer" tests/test_mytest.py or all but ``negative`` sub-tests, with: .. code-block:: bash - pytest -k "not negative" tests/test_mytest.py + pytest -k "not negative" tests/test_mytest.py Besides using the ``-k`` filter that was just mentioned, you can find out the exact name of each sub-test and run any or all of them using their exact names. From f9eb4d48f726e538b01d94f6642f3eed3c110108 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 13 Apr 2021 14:06:02 -0400 Subject: [PATCH 3/3] Quality --- utils/style_doc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/style_doc.py b/utils/style_doc.py index b5f61f76e38d7d..4da47099124306 100644 --- a/utils/style_doc.py +++ b/utils/style_doc.py @@ -378,7 +378,6 @@ def init_in_block(self, text): def _reindent_code_blocks(text): """Checks indent in code blocks is of four""" lines = text.split("\n") - in_code_block = False idx = 0 while idx < len(lines): # Detect if the line is the start of a new code-block.
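As a quick sanity check of the new ``_reindent_code_blocks`` helper, here is a small sketch. It is not part of the patch itself and assumes it is run from the repository root so that ``utils/style_doc.py`` imports cleanly; it shows a literal block indented with two spaces being normalized to the four-space indent enforced above.

.. code-block:: python

    from utils.style_doc import _reindent_code_blocks

    before = "\n".join(
        [
            "Run the test suite:",
            "",
            "::",
            "",
            "  pytest tests",
            "  make test",
            "",
            "Done.",
        ]
    )

    print(_reindent_code_blocks(before))
    # Run the test suite:
    #
    # ::
    #
    #     pytest tests
    #     make test
    #
    # Done.

Note that ``_re_code_block`` only matches a standalone ``::`` line and ``_re_code_block_explicit`` only matches a ``.. code-block::`` directive at the start of a line, so literal blocks introduced by a trailing ``::`` at the end of a sentence are left untouched by this helper.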