From 61c7ec94b563d371cbafde35f3c6822049ebf131 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 23 Apr 2021 22:08:39 -0700 Subject: [PATCH 01/12] adding Z-inf --- docs/source/main_classes/trainer.rst | 204 ++++++++++++++++++--------- tests/deepspeed/ds_config_zero3.json | 11 +- tests/deepspeed/test_deepspeed.py | 19 ++- 3 files changed, 160 insertions(+), 74 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index 106ef3c80ef8..bd02cb06230b 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -400,18 +400,18 @@ DeepSpeed `DeepSpeed `__ implements everything described in the `ZeRO paper `__. Currently it provides full support for: -1. Optimizer State Partitioning (ZeRO stage 1) -2. Gradient Partitioning (ZeRO stage 2) -3. Param Partitioning (ZeRO stage 3) +1. Optimizer state partitioning (ZeRO stage 1) +2. Gradient partitioning (ZeRO stage 2) +3. Parameter partitioning (ZeRO stage 3) 4. Custom mixed precision training handling -5. A range of fast CUDA-extension-based Optimizers -6. ZeRO-Offload +5. A range of fast CUDA-extension-based optimizers +6. ZeRO-Offload to CPU and NVMe ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training -`__. +`__. And NVMe-support is described in the paper `ZeRO-Infinity: Breaking the GPU +Memory Wall for Extreme Scale Deep Learning `__. -DeepSpeed ZeRO-2 is currently used only for training, as all the currently available features are of no use to -inference. +DeepSpeed ZeRO-2 is primarily used only for training, as its features are of no use to inference. DeepSpeed ZeRO-3 can be used for inference as well, since it allows huge models to be loaded on multiple GPUs, which won't be possible on a single GPU. @@ -541,7 +541,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a .. code-block:: bash deepspeed examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config.json \ + --deepspeed tests/deepspeed/ds_config_zero3.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ @@ -566,17 +566,17 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma .. code-block:: bash deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ - --deepspeed tests/deepspeed/ds_config.json \ + --deepspeed tests/deepspeed/ds_config_zero2.json \ --model_name_or_path t5-small --per_device_train_batch_size 1 \ --output_dir output_dir --overwrite_output_dir --fp16 \ --do_train --max_train_samples 500 --num_train_epochs 1 \ --dataset_name wmt16 --dataset_config "ro-en" \ --source_lang en --target_lang ro -This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU. By default, -DeepSpeed deploys all GPUs it can see. If you have only 1 GPU to start with, then you don't need this argument. The -following `documentation `__ discusses the -launcher options. +This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU via +``--num_gpus=1``. By default, DeepSpeed deploys all GPUs it can see on the given node. If you have only 1 GPU to start +with, then you don't need this argument. The following `documentation +`__ discusses the launcher options. Why would you want to use DeepSpeed with just one GPU? 
@@ -610,6 +610,10 @@ find more details in the discussion below.
 For a practical usage example of this type of deployment, please, see this `post
 <https://github.com/huggingface/transformers/issues/8771#issuecomment-759248400>`__.
 
+You may also try the ZeRO-3 with CPU and NVMe offload as explained further in this document.
+
+TODO: Benchmark whether we can get better performance out of ZeRO-3 vs. ZeRO-2 on a single GPU.
+
 Notes:
 
 - if you need to run on a specific GPU, which is different from GPU 0, you can't use ``CUDA_VISIBLE_DEVICES`` to limit
@@ -725,7 +729,7 @@ or with ``%%bash`` magic, where you can write a multi-line code for the shell pr
 
 In such case you don't need any of the code presented at the beginning of this section.
 
-Note: ``%%bash`` magic is neat, but currently it buffers the output so you won't see the logs until the process
+Note: While ``%%bash`` magic is neat, currently it buffers the output so you won't see the logs until the process
 completes.
 
 
@@ -860,10 +864,10 @@ Of course, you will need to adjust the values in this example to your situation.
 ZeRO
 =======================================================================================================================
 
-`Zero Redundancy Optimizer (ZeRO) `__ is the work horse of DeepSpeed. It
+`Zero Redundancy Optimizer (ZeRO) `__ is the workhorse of DeepSpeed. It
 supports 3 different levels (stages) of optimization. The first one is not quite interesting for scalability purposes,
-therefore this document focuses on stages 2 and 3. You will find more indepth information in the DeepSpeed
-documentation.
+therefore this document focuses on stages 2 and 3. Stage 3 is further improved by the latest addition of ZeRO-Infinity.
+You will find more in-depth information in the DeepSpeed documentation.
 
 The ``zero_optimization`` section of the configuration file is the most important part (`docs
 `__), since that is where you define
@@ -916,15 +920,19 @@ ZeRO-3 Config
 
 The following is an example configuration for ZeRO stage 3:
 
-
 .. code-block:: json
 
     {
         "zero_optimization": {
             "stage": 3,
-            "cpu_offload": true,
-            "cpu_offload_params": true,
-            "cpu_offload_use_pin_memory" : true,
+            "offload_optimizer": {
+                "device": "cpu",
+                "pin_memory": true
+            },
+            "offload_param": {
+                "device": "cpu"
+                "pin_memory": true
+            },
             "overlap_comm": true,
             "contiguous_gradients": true,
             "sub_group_size": 1e14,
@@ -937,8 +945,14 @@ The following is an example configuration for ZeRO stage 3:
         }
     }
 
-Note: if you're migrating from ZeRO-2 configuration that: ``allgather_partitions``, ``allgather_bucket_size`` and
-``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these they will just be ignored.
+If you are getting OOMs because your model or activations don't fit into the GPU memory and you have unutilized CPU
+memory, offloading the optimizer states and parameters to CPU memory with ``"device": "cpu"`` may solve this limitation.
+If you don't want to offload to CPU memory, use ``none`` instead of ``cpu`` for the ``device`` entry. Offloading to
+NVMe is discussed further down.
+
+Pinned memory is enabled with ``pin_memory`` set to ``true``. This feature can improve the throughput at the cost of
+making less memory available to other processes. Pinned memory is set aside for the specific process that requested it
+and it's typically accessed much faster than normal CPU memory.
 
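+As a quick illustration of the ``none`` option mentioned above, here is a minimal sketch of the same section with
+offloading disabled, i.e. keeping everything in GPU memory (only the relevant keys are shown; the remaining ZeRO-3
+entries stay as in the full example above):
+
+.. code-block:: json
+
+    {
+        "zero_optimization": {
+            "stage": 3,
+            "offload_optimizer": {
+                "device": "none"
+            },
+            "offload_param": {
+                "device": "none"
+            }
+        }
+    }
+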
 **Performance tuning:**
@@ -972,9 +986,14 @@ shown below and the right configuration will be passed to DeepSpeed:
     {
         "zero_optimization": {
             "stage": 3,
-            "cpu_offload": true,
-            "cpu_offload_params": true,
-            "cpu_offload_use_pin_memory" : true,
+            "offload_optimizer": {
+                "device": "cpu",
+                "pin_memory": true
+            },
+            "offload_param": {
+                "device": "cpu"
+                "pin_memory": true
+            },
             "overlap_comm": true,
             "contiguous_gradients": true,
             "sub_group_size": 1e14,
@@ -992,6 +1011,78 @@ models and multiple GPUs this is an expensive operation both in terms of memory
 you plan to resume the training. Watch out for future updates that will remove this limitation and make things more
 flexible.
 
+Note: if you're migrating from ZeRO-2 configuration that: ``allgather_partitions``, ``allgather_bucket_size`` and
+``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these in the config file they will just
+be ignored.
+
+
+
+
+NVMe Support
+=======================================================================================================================
+
+ZeRO-Infinity allows for training incredibly large models by extending GPU and CPU memory with NVMe memory. Thanks to
+smart partitioning and tiling algorithms, each GPU needs to send and receive only very small amounts of data during
+offloading, so a modern NVMe proves to be a good fit for providing an even larger total memory pool to your training
+process. ZeRO-Infinity requires ZeRO-3 to be enabled.
+
+The following configuration example enables NVMe to offload both optimizer states and params:
+
+.. code-block:: json
+
+    {
+        "zero_optimization": {
+            "stage": 3,
+            "offload_optimizer": {
+                "device": "nvme",
+                "nvme_path": "/local_nvme",
+                "pin_memory": true,
+                "buffer_count": 4,
+                "fast_init": false
+            },
+            "offload_param": {
+                "device": "nvme",
+                "nvme_path": "/local_nvme",
+                "pin_memory": true,
+                "buffer_count": 5,
+                "buffer_size": 1e8,
+                "max_in_cpu": 1e9
+            },
+            "aio": {
+                "block_size": 262144,
+                "queue_depth": 32,
+                "thread_count": 1,
+                "single_submit": false,
+                "overlap_events": true
+            },
+            "overlap_comm": true,
+            "contiguous_gradients": true,
+            "sub_group_size": 1e14,
+            "reduce_bucket_size": 0,
+            "stage3_prefetch_bucket_size": 0,
+            "stage3_param_persistence_threshold": 0,
+            "stage3_max_live_parameters": 1e9,
+            "stage3_max_reuse_distance": 1e9,
+            "stage3_gather_fp16_weights_on_model_save": true
+        }
+    }
+
+You can choose to offload both optimizer states and params to NVMe, or just one of them, or none. For example, if you
+have copious amounts of CPU memory available, by all means offload to CPU memory only as it'd be faster (hint:
+``"device": "cpu"``).
+
+Here is the full documentation for offloading `optimizer states
+`__ and `parameters
+`__.
+
+Make sure that your ``nvme_path`` is actually an NVMe, since it will work with a normal hard drive or SSD but it'll
+be much, much slower. The fast scalable training was designed with modern NVMe transfer speeds in mind (as of this
+writing one can have ~3.5GB/s read, ~3GB/s write peak speeds).
+
+In order to figure out the optimal ``aio`` configuration block you must run a benchmark on your target setup, as
+`explained here `__.
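+
+If, for instance, your CPU memory can hold the optimizer states but not the parameters, a mixed setup is also
+possible. The following is only a sketch that reuses the keys from the full example above (``/local_nvme`` is a
+placeholder path): it keeps the optimizer states in pinned CPU memory and sends just the parameters to NVMe:
+
+.. code-block:: json
+
+    {
+        "zero_optimization": {
+            "stage": 3,
+            "offload_optimizer": {
+                "device": "cpu",
+                "pin_memory": true
+            },
+            "offload_param": {
+                "device": "nvme",
+                "nvme_path": "/local_nvme",
+                "pin_memory": true
+            }
+        }
+    }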
+ + ZeRO-2 vs ZeRO-3 Performance +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1085,9 +1176,14 @@ Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "cpu_offload_use_pin_memory" : true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu" + "pin_memory": true + }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, @@ -1193,7 +1289,6 @@ Scheduler DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here `__. - Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed: * ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup`` @@ -1315,9 +1410,6 @@ setting this value and thus avoid potential subtle errors. - - - Gradient Clipping ======================================================================================================================= @@ -1334,7 +1426,7 @@ Here is an example of the ``gradient_clipping`` configuration: -Getting the model weights out +Getting The Model Weights Out ======================================================================================================================= As long as you continue training and resuming using DeepSpeed you don't need to worry about anything. DeepSpeed stores @@ -1398,44 +1490,18 @@ This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights c Note: currently the script requires 2x general RAM of the final fp32 model weights. -ZeRO 3 Nuances + +ZeRO-3 and Infinity Nuances ======================================================================================================================= -ZeRO 3 is quite different from ZeRO 2 because of its param sharding feature. +ZeRO-3 is quite different from ZeRO-2 because of its param sharding feature. + +ZeRO-Infinity further extends ZeRO-3 to support NVMe memory and multiple other speed and scalability improvements. While all the efforts were made for things to just work without needing any special changes to your models, in certain circumstances you may find the following information to be needed. -Registering External Parameters -+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -If layer A needs to access weights belonging to layer B, currently layer A needs to tell DeepSpeed about it. This is -done with the help of ``deepspeed.zero.register_external_parameter`` that needs to be called in ``A.__init__`` and can -be seen in the following example: - -.. code-block:: python - - class ModuleZ3(torch.nn.Module): - def __init__(self, *args): - super().__init__(self, *args) - self.layer1 = SomeLayer() - self.layer2 = OtherLayer() - deepspeed.zero.register_external_parameter(self, self.layer1.weight) - - def forward(self, input): - x = self.layer1(input) - # self.layer1.weight is needed in ModuleZ3.forward - y = self.layer2(x, self.layer1.weight) - return y - -In general ``transformers`` models don't use this style of referring to other layer's weights so most likely you won't -need to use it. - -For full details on this method please refer to `Registering External Parameters -`__. 
- - Constructing Massive Models +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1475,8 +1541,6 @@ For full details on this method and other related features please refer to `Cons - - Gathering Parameters +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1501,8 +1565,6 @@ larger multi-dimensional shape, this means that the parameter is partitioned and - - Notes ======================================================================================================================= @@ -1514,6 +1576,7 @@ Notes with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration instructions `__. + Main DeepSpeed Resources ======================================================================================================================= @@ -1526,6 +1589,7 @@ Papers: - `ZeRO: Memory Optimizations Toward Training Trillion Parameter Models `__ - `ZeRO-Offload: Democratizing Billion-Scale Model Training `__ +- `ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning `__ Finally, please, remember that, HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed, therefore if you have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed GitHub diff --git a/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3.json index 0f909959521e..cbee8a642724 100644 --- a/tests/deepspeed/ds_config_zero3.json +++ b/tests/deepspeed/ds_config_zero3.json @@ -10,9 +10,14 @@ "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "cpu_offload_use_pin_memory" : true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 9868966a5a32..3691fab70885 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -66,6 +66,9 @@ def require_deepspeed(test_case): return test_case +if is_deepspeed_available(): + from deepspeed.utils import logger # noqa + ZERO2 = "zero2" ZERO3 = "zero3" stages = [ZERO2, ZERO3] @@ -191,6 +194,21 @@ def test_hf_optimizer_with_offload(self): trainer.train() self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception)) + def test_stage3_nvme_offload(self): + + with CaptureLogger(logger) as cs: + with mockenv_context(**self.dist_env_1_gpu): + # this actually doesn't have to be on NVMe, any storage will do since this test only + # runs a simple check that we can use some directory as if it were NVMe + nvme_path = self.get_auto_remove_tmp_dir() + nvme_config = dict(device="nvme", nvme_path=nvme_path) + ds_config_zero3_dict = self.get_config_dict(ZERO3) + ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config + ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero3_dict) + trainer.train() + assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none" + # --- These tests need to run on both zero stages --- # @parameterized.expand(stages) def test_fake_notebook_no_launcher(self, stage): @@ -200,7 +218,6 @@ def test_fake_notebook_no_launcher(self, stage): # 
DeepSpeed log if this test happens to run first in this pytest worker. But it will fail if # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have # to reset `logger.handlers[0].setStream(sys.stdout)` or directly capture from the logger. - from deepspeed.utils import logger with CaptureLogger(logger) as cs: with mockenv_context(**self.dist_env_1_gpu): From 6b2317c8cd5cdc9af9de56e86c6c944eaaf2f90b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 24 Apr 2021 13:59:09 -0700 Subject: [PATCH 02/12] revamp config process --- docs/source/main_classes/trainer.rst | 534 +++++++++++++++++++-------- src/transformers/integrations.py | 191 +++++----- tests/deepspeed/ds_config_zero2.json | 42 ++- tests/deepspeed/ds_config_zero3.json | 50 +-- tests/deepspeed/test_deepspeed.py | 38 +- 5 files changed, 537 insertions(+), 318 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index bd02cb06230b..523351f5ab22 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -612,7 +612,8 @@ For a practical usage example of this type of deployment, please, see this `post You may also try the ZeRO-3 with CPU and NVMe offload as explained further in this document. -TODO: Benchmark whether we can get better performance out of ZeRO-3 vs. ZeRO-2 on a single GPU. +TODO: Benchmark whether we can get better performance out of ZeRO-3 vs. ZeRO-2 on a single GPU, and then recommend +ZeRO-3 config as starting one. Notes: @@ -647,7 +648,7 @@ If you're using only 1 GPU, here is how you'd have to adjust your training code os.environ['WORLD_SIZE'] = "1" # Now proceed as normal, plus pass the deepspeed config file - training_args = TrainingArguments(..., deepspeed="ds_config.json") + training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json") trainer = Trainer(...) trainer.train() @@ -663,47 +664,62 @@ cell with: .. 
code-block:: python %%bash - cat <<'EOT' > ds_config.json + cat <<'EOT' > ds_config_zero3.json { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, + "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true, - "cpu_offload": true - }, - "optimizer": { "type": "AdamW", "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" } }, "scheduler": { "type": "WarmupLR", "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" } }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } EOT @@ -764,48 +780,55 @@ When using DeepSpeed you always need to supply a DeepSpeed configuration file, y to be configured via the command line. You will find the nuances in the rest of this guide. To get an idea of what DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features, -enables FP16, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler: +including optimizer states cpu offload, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler and will enable mixed +precision training if ``--fp16`` is passed: .. 
code-block:: json { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, + "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "cpu_offload": true - }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [ 0.8, 0.999 ], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - } + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", } When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer` @@ -839,25 +862,28 @@ or: Shared Configuration ======================================================================================================================= -Some configuration information is required by both the :class:`~transformers.Trainer` and DeepSpeed to function -correctly, therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to -configure those via the :class:`~transformers.Trainer` command line arguments. - -Therefore, the following DeepSpeed configuration params shouldn't be used with the :class:`~transformers.Trainer`: -* ``train_batch_size`` -* ``train_micro_batch_size_per_gpu`` -* ``gradient_accumulation_steps`` +.. warning:: -as these will be automatically derived from the run time environment and the following 2 command line arguments: + This section is a must-read -.. code-block:: bash +Some configuration values are required by both the :class:`~transformers.Trainer` and DeepSpeed to function correctly, +therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to configure those +via the :class:`~transformers.Trainer` command line arguments. - --per_device_train_batch_size 8 --gradient_accumulation_steps 2 +Additionally, some configuration values are derived automatically based on the model's configuration, so instead of +remembering to manually adjust multiple values, it's the best to let the :class:`~transformers.Trainer` do the majority +of configuration for you. -which are always required to be supplied. +Therefore, in the rest of this guide you will find a special configuration value: ``auto``, which when set will be +automatically replaced with the correct or most efficient value. Please feel free to choose to ignore this +recommendation and set the values explicitly, in which case be very careful that your the +:class:`~transformers.Trainer` arguments and DeepSpeed configurations agree. For example, are you using the same +learning rate, or batch size, or gradient accumulation settings? 
if these mismatch the training may fail in very +difficult to detect ways. You have been warned. -Of course, you will need to adjust the values in this example to your situation. +There are multiple other values that are specific to DeepSpeed-only and those you will have to set manually to suit +your needs. @@ -930,19 +956,19 @@ The following is an example configuration for ZeRO stage 3: "pin_memory": true }, "offload_param": { - "device": "cpu" + "device": "cpu", "pin_memory": true }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, - "reduce_bucket_size": 1e6, - "stage3_prefetch_bucket_size": 0.94e6, - "stage3_param_persistence_threshold": 1e4, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true - } + }, } If you are getting OOMs, because your model or activations don't fit into the GPU memory and you have unutilized CPU @@ -957,9 +983,6 @@ and its typically accessed much faster than normal CPU memory. **Performance tuning:** - ``sub_group_size``: ``1e14`` -- ``reduce_bucket_size``: ``hidden_size*hidden_size`` -- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size`` -- ``stage3_param_persistence_threshold``: ``10 * hidden_size`` - ``stage3_max_live_parameters``: ``1e9`` - ``stage3_max_reuse_distance``: ``1e9`` @@ -974,46 +997,23 @@ going to be used again in near future (less than ``stage3_max_reuse_distance``) overhead. This is super helpful when you have activation checkpointing enabled, where we do a forward recompute and backward passes a a single layer granularity and want to keep the parameter in the forward recompute till the backward -If you set ``reduce_bucket_size``, ``stage3_prefetch_bucket_size`` and ``stage3_param_persistence_threshold`` as -recommended above, they will already be fairly small so you won't have to tune those much. +The following configuration values depend on the model's hidden size: -Since ``hidden_size`` varies from model to model, the ``Trainer`` will automatically set the needed value for the 3 -config parameters that contain that variable (using ``model.config.hidden_size``). Just set these values to ``0`` as -shown below and the right configuration will be passed to DeepSpeed: - -.. code-block:: json +- ``reduce_bucket_size``: ``hidden_size*hidden_size`` +- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size`` +- ``stage3_param_persistence_threshold``: ``10 * hidden_size`` - { - "zero_optimization": { - "stage": 3, - "offload_optimizer": { - "device": "cpu", - "pin_memory": true - }, - "offload_param": { - "device": "cpu" - "pin_memory": true - }, - "overlap_comm": true, - "contiguous_gradients": true, - "sub_group_size": 1e14, - "reduce_bucket_size": 0, - "stage3_prefetch_bucket_size": 0, - "stage3_param_persistence_threshold": 0, - "stage3_max_live_parameters": 1e9, - "stage3_max_reuse_distance": 1e9, - "stage3_gather_fp16_weights_on_model_save": true - } - } +therefore set these values to ``auto`` and the :class:`~transformers.Trainer` will automatically assign the recommended +values. But, of course, feel free to set these explicitly as well. ``stage3_gather_fp16_weights_on_model_save`` enables model fp16 weights consolidation when model gets saved. With large models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if you plan to resume the training. 
Watch out for future updates that will remove this limitation and make things more flexible. -Note: if you're migrating from ZeRO-2 configuration that: ``allgather_partitions``, ``allgather_bucket_size`` and +If you're migrating from ZeRO-2 configuration note that ``allgather_partitions``, ``allgather_bucket_size`` and ``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these in the config file they will just -be ignored. +be ignored. Make sure to remove ``cpu_offload`` though, since it has been deprecated in ZeRO-3. @@ -1058,9 +1058,9 @@ The following configuration example enables NVMe to offload both optimizer state "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, - "reduce_bucket_size": 0, - "stage3_prefetch_bucket_size": 0, - "stage3_param_persistence_threshold": 0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true @@ -1107,13 +1107,13 @@ these help you to trade scalability for speed depending on your needs. ZeRO-2 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: +Here is a full ZeRO-2 auto-configuration file ``ds_config_zero2.json``: .. code-block:: json { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, @@ -1121,6 +1121,25 @@ Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: "min_loss_scale": 1 }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { "stage": 2, "allgather_partitions": true, @@ -1132,6 +1151,30 @@ Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: "cpu_offload": true }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + } + + +Here is a full ZeRO-2 all-enabled manually set configuration file. It is here mainly for you to see what the typical +values look like, but we highly recommend using the one with multiple ``auto`` settings in it. + +.. 
code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { "type": "AdamW", "params": { @@ -1151,6 +1194,17 @@ Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: } }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + "steps_per_print": 2000, "wall_clock_breakdown": false } @@ -1160,13 +1214,14 @@ Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``: ZeRO-3 Example +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: +Here is a full ZeRO-3 auto-configuration file ``ds_config_zero3.json``: + .. code-block:: json { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, @@ -1174,6 +1229,25 @@ Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: "min_loss_scale": 1 }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { "stage": 3, "offload_optimizer": { @@ -1181,20 +1255,43 @@ Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: "pin_memory": true }, "offload_param": { - "device": "cpu" + "device": "cpu", "pin_memory": true }, "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, - "reduce_bucket_size": 1e6, - "stage3_prefetch_bucket_size": 0.94e6, - "stage3_param_persistence_threshold": 1e4, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + } + +Here is a full ZeRO-3 all-enabled manually set configuration file. It is here mainly for you to see what the typical +values look like, but we highly recommend using the one with multiple ``auto`` settings in it. + +.. 
code-block:: json + + { + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { "type": "AdamW", "params": { @@ -1214,6 +1311,27 @@ Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``: } }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu" + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e14, + "reduce_bucket_size": 1e6, + "stage3_prefetch_bucket_size": 0.94e6, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "steps_per_print": 2000, "wall_clock_breakdown": false } @@ -1249,7 +1367,7 @@ If you don't configure the ``optimizer`` entry in the configuration file, the :c automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``. -Here is an example of the pre-configured ``optimizer`` entry for ``AdamW``: +Here is an example of the auto-configured ``optimizer`` entry for ``AdamW``: .. code-block:: json @@ -1257,15 +1375,16 @@ Here is an example of the pre-configured ``optimizer`` entry for ``AdamW``: "optimizer": { "type": "AdamW", "params": { - "lr": 0.001, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" } } } -Note that the command line arguments will override the values in the configuration file. This is so that there is one + +Note that the command line arguments will set the values in the configuration file. This is so that there is one definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to different values in different places. Command line rules. The values that get overridden are: @@ -1276,18 +1395,42 @@ different values in different places. Command line rules. The values that get ov Therefore please remember to tune the shared hyperparameters on the command line. -If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer": -true`` to the top level configuration. +You can also set the values explicitly: -If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and -make sure to adjust the values. e.g. if use Adam you will want ``weight_decay`` around ``0.01``. +.. code-block:: json + + { + "optimizer": { + "type": "AdamW", + "params": { + "lr": 0.001, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + } + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + +If you want to use another optimizer which is not listed above, you will have to add to the top level configuration. + +.. code-block:: json + + { + "zero_allow_untested_optimizer": true + } + +Similarly to ``AdamW``, you can configure other officially supported optimizers. Just remember that may have different +config values. e.g. for Adam you will want ``weight_decay`` around ``0.01``. 
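+
+For illustration only, a sketch of such an explicit entry for DeepSpeed's ``Adam`` optimizer might look like the
+following (the hyperparameter values are placeholders, not recommendations):
+
+.. code-block:: json
+
+    {
+        "optimizer": {
+            "type": "Adam",
+            "params": {
+                "lr": 0.001,
+                "betas": [0.9, 0.999],
+                "eps": 1e-8,
+                "weight_decay": 0.01
+            }
+        }
+    }
+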
Scheduler +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here -`__. +DeepSpeed supports ``LRRangeTest``, ``OneCycle``, ``WarmupLR`` and ``WarmupDecayLR`` learning rate schedulers. The full +documentation is `here `__. Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed: @@ -1295,12 +1438,11 @@ Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed: * ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``, therefore, if you don't configure the scheduler this is scheduler that will get configured by default. - If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version of it. -Here is an example of the pre-configured ``scheduler`` entry for ``WarmupLR``: +Here is an example of the auto-configured ``scheduler`` entry for ``WarmupLR``: .. code-block:: json @@ -1308,24 +1450,41 @@ Here is an example of the pre-configured ``scheduler`` entry for ``WarmupLR``: "scheduler": { "type": "WarmupLR", "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" } } } -Note that the command line arguments will override the values in the configuration file. This is so that there is one -definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to -different values in different places. Command line rules. The values that get overridden are: +Since `"auto"` is used the :class:`~transformers.Trainer` arguments will set the correct values in the configuration +file. This is so that there is one definitive source of the values and to avoid hard to find errors when, for example, +the learning rate is set to different values in different places. Command line rules. The values that get set are: +- ``warmup_min_lr`` with the value of ``0`` - ``warmup_max_lr`` with the value of ``--learning_rate`` - ``warmup_num_steps`` with the value of ``--warmup_steps`` - ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run time based on the environment and the size of the dataset and other command line arguments (needed for ``WarmupDecayLR``). -Therefore please remember to tune the shared hyperparameters on the command line. +You can, of course, take over any or all of the configuration values and set those yourself: + +.. code-block:: json + + { + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + } + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. 
For example, for ``WarmupDecayLR``, you can use the following entry: @@ -1335,16 +1494,16 @@ For example, for ``WarmupDecayLR``, you can use the following entry: "scheduler": { "type": "WarmupDecayLR", "params": { - "total_num_steps": 10, + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" "last_batch_iteration": -1, - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 } } } -and ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be corrected at loading time. +and ``total_num_steps`, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be set at loading time. @@ -1353,10 +1512,30 @@ Automatic Mixed Precision You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way: -If you want to use an equivalent of the Pytorch native amp, you can either configure the ``fp16`` entry in the -configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``. +To configure pytorch AMP-like mode set: + +.. code-block:: json + + { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + } + +and the :class:`~transformers.Trainer` will automatically enable or disable it based on the value of +``args.fp16_backend``. The rest of config values are up to you. + +This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed. -Here is an example of the ``fp16`` configuration: +XXX: However, at the moment DeepSpeed doesn't supported fp32 mode, though it will become available soon. Until then it +will be always set to ``true``. + +You can also enable/disable this mode explicitly: .. code-block:: json @@ -1365,17 +1544,32 @@ Here is an example of the ``fp16`` configuration: "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, + "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, } +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + Here is the `documentation `__. -If you want to use NVIDIA's apex instead, you can can either configure the ``amp`` entry in the configuration file, or -use the following command line arguments: ``--fp16 --fp16_backend apex --fp16_opt_level 01``. +To configure apex AMP-like mode set: + +.. code-block:: json + + "amp": { + "enabled": "auto", + "opt_level": "auto", + } -Here is an example of the ``amp`` configuration: +and the :class:`~transformers.Trainer` will automatically configure it based on the values of ``args.fp16_backend`` and +``args.fp16_opt_level``. + +This mode gets enabled when ``--fp16 --fp16_backend apex --fp16_opt_level 01`` command line args are passed. + +You can also configure this mode explicitly: .. code-block:: json @@ -1386,6 +1580,9 @@ Here is an example of the ``amp`` configuration: } } +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + Here is the `documentation `__. @@ -1393,30 +1590,42 @@ Here is the `documentation Gradient Accumulation ======================================================================================================================= -While normally DeepSpeed gets gradient accumulation configured with: +To configure gradient accumulation set: .. 
code-block:: json { - "gradient_accumulation_steps": 3, + "gradient_accumulation_steps": "auto" } -in this case, to enable gradient accumulation, pass the command line ``--gradient_accumulation_steps 3`` argument as -normal and it will get injected into the DeepSpeed configuration. +and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.gradient_accumulation_steps``. -If you try to add it directly to the configuration file, you will receive an error from the ``Trainer`` - this is -because this setting is needed by the ``Trainer`` too, and so this approach ensures that there is a single way of -setting this value and thus avoid potential subtle errors. +You can also set the value explicitly: +.. code-block:: json + + { + "gradient_accumulation_steps": 3, + } + +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. Gradient Clipping ======================================================================================================================= -If you don't configure the ``gradient_clipping`` entry in the configuration file, the :class:`~transformers.Trainer` -will use the value of the ``--max_grad_norm`` command line argument to set it. +To configure gradient gradient clipping set: + +.. code-block:: json -Here is an example of the ``gradient_clipping`` configuration: + { + "gradient_clipping": "auto" + } + +and the :class:`~transformers.Trainer` will automatically set it to the value of ``args.max_grad_norm``. + +You can also set the value explicitly: .. code-block:: json @@ -1424,6 +1633,9 @@ Here is an example of the ``gradient_clipping`` configuration: "gradient_clipping": 1.0, } +But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed +configuration. + Getting The Model Weights Out @@ -1444,6 +1656,16 @@ version of the weights. If this setting is ``False`` ``pytorch_model.bin`` won't DeepSpeed's ``state_dict`` contains a placeholder and not the real weights. If we were to save this ``state_dict`` it won't be possible to load it back. + +.. 
code-block:: json + + { + "zero_optimization": { + "stage3_gather_fp16_weights_on_model_save": true + } + } + + **FP32 Weights:** While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 7e4ab0f5c7a1..2a5f5b6e3dd5 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -360,36 +360,54 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): config = deepspeed_parse_config(args.deepspeed) - # The following code translates relevant trainer's cl args into the DS config + def is_true(config, key): + if config is None: + return False + return bool(config.get(key)) + + def is_auto(config, key): + if config is None: + return False + return config.get(key) == "auto" + + def set_if_auto(config, key, val): + if config is None: + return + if config.get(key) == "auto": + config[key] = val - # First to ensure that there is no mismatch between cl args values and presets in the config - # file, ask to not set in ds config file: - # - "train_batch_size", - # - "train_micro_batch_size_per_gpu", - # - "gradient_accumulation_steps" - bs_keys = ["train_batch_size", "train_micro_batch_size_per_gpu"] - if len([x for x in bs_keys if x in config.keys()]): - raise ValueError( - f"Do not include {bs_keys} entries in the ds config file, as they will be set via --per_device_train_batch_size or its default" - ) - if "gradient_accumulation_steps" in config.keys(): - raise ValueError( - "Do not include gradient_accumulation_steps entries in the ds config file, as they will be set via --gradient_accumulation_steps or its default" - ) + # The following code translates relevant trainer's cl args into the DS config # DeepSpeed does: - # train_batch_size = n_gpus * train_micro_batch_size_per_gpu * gradient_accumulation_steps - # therefore we just need to set: - config["train_micro_batch_size_per_gpu"] = args.per_device_train_batch_size - config["gradient_accumulation_steps"] = args.gradient_accumulation_steps + # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps + # therefore we just need to set + train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps + set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) + set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) + set_if_auto(config, "train_batch_size", train_batch_size) + set_if_auto(config, "gradient_clipping", args.max_grad_norm) - if "gradient_clipping" in config: - logger.info("Keeping the `gradient_clipping` config intact, ignoring any gradient clipping-specific cl args") - else: # override only if the ds config doesn't already have this section - config["gradient_clipping"] = args.max_grad_norm + # zero + is_zero2 = False + is_zero3 = False + config_zero = config.get("zero_optimization", {}) + if config_zero != {}: + if config_zero.get("stage") == 2: + is_zero2 = True + if config_zero.get("stage") == 3: + is_zero3 = True + + # now we know for sure if zero3 is enabled + deepspeed_zero3_enable(is_zero3) + + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) + set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + set_if_auto(config_zero, 
"stage3_param_persistence_threshold", 10 * hidden_size) # Optimizer + Scheduler - # Currently support combos: + # Currently supported combos: # 1. DS scheduler + DS optimizer: Yes # 2. HF scheduler + HF optimizer: Yes # 3. DS scheduler + HF optimizer: Yes @@ -402,36 +420,37 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # 4. HF scheduler + DS optimizer: No optimizer = None - if "optimizer" in config: - logger.info("Updating the `scheduler` config with other command line arguments") - - # to avoid inconsistent values of lr and warm up steps the command line args override config - params = dict( - lr=args.learning_rate, - betas=[args.adam_beta1, args.adam_beta2], - eps=args.adam_epsilon, - weight_decay=args.weight_decay, - ) - for k, v in params.items(): - if k in config["optimizer"]["params"]: - logger.info(f"setting optimizer.params.{k} to {v}") - config["optimizer"]["params"][k] = v + config_optim = config.get("optimizer", {}) + if config_optim != {}: + config_optim_params = config_optim.get("params") + set_if_auto(config_optim_params, "lr", args.learning_rate) + set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) + set_if_auto(config_optim_params, "eps", args.adam_epsilon) + set_if_auto(config_optim_params, "weight_decay", args.weight_decay) else: # override only if the ds config doesn't already have this section - if ( - "zero_optimization" in config - and "cpu_offload" in config["zero_optimization"] - and config["zero_optimization"]["cpu_offload"] is True - ): + offload = False + if is_zero2: + offload = is_true(config_zero, "cpu_offload") + elif is_zero3: + config_offload_optimizer = config_zero.get("offload_optimizer", {}) + config_offload_param = config_zero.get("offload_param", {}) + offload_devices = ["cpu", "nvme"] + if ( + config_offload_optimizer.get("device") in offload_devices + or config_offload_param.get("device") in offload_devices + ): + offload = True + if offload: raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers") - else: - # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. - # But trainer uses AdamW by default. - # To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer` - trainer.create_optimizer() - optimizer = trainer.optimizer - # flag that this is non-native optimizer - config["zero_allow_untested_optimizer"] = True + + # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. + # But trainer uses AdamW by default. 
+ # To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer` + trainer.create_optimizer() + optimizer = trainer.optimizer + # flag that this is non-native optimizer + config["zero_allow_untested_optimizer"] = True # DS schedulers (deepspeed/runtime/lr_schedules.py): # @@ -443,22 +462,12 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # WarmupDecayLR| linear | get_linear_schedule_with_warmup | lr_scheduler = None if "scheduler" in config: - logger.info("Updating the `scheduler` config with other command line arguments") - # the user won't easily know the correct num_training_steps should they use WarmupDecayLR, - # so let's set it to the correct value - if config["scheduler"]["type"] == "WarmupDecayLR": - logger.info(f"setting scheduler.params.total_num_steps to {num_training_steps}") - config["scheduler"]["params"]["total_num_steps"] = num_training_steps - - # to avoid inconsistent values of lr and warmup steps the command line args override config - params = dict( - warmup_max_lr=args.learning_rate, - warmup_num_steps=args.warmup_steps, - ) - for k, v in params.items(): - if k in config["scheduler"]["params"]: - logger.info(f"setting scheduler.params.{k} to {v}") - config["scheduler"]["params"][k] = v + config_sched = config.get("scheduler", {}) + config_sched_params = config_sched.get("params") + set_if_auto(config_sched_params, "warmup_min_lr", 0) + set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) + set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) + set_if_auto(config_sched_params, "total_num_steps", num_training_steps) else: # override only if the ds config doesn't already have this section if "optimizer" in config: @@ -469,42 +478,18 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): trainer.create_scheduler(num_training_steps=num_training_steps) lr_scheduler = trainer.lr_scheduler - # fp16 - if trainer.fp16_backend is not None: - # Deepspeed has 2 possible fp16 config entries: - # - `fp16`: for the native amp - it has a bunch of optional params but we won't set any here unless the user did the work - # - `amp`: which delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided. 
- if trainer.fp16_backend == "apex": - if "amp" in config: - logger.info("Keeping the `amp` config intact, ignoring any amp-specific cl args") - else: - config["amp"] = { - "enabled": True, - "opt_level": args.fp16_opt_level, - } - elif trainer.fp16_backend == "amp": - if "fp16" in config: - logger.info("Keeping the `fp16` config intact, ignoring any fp16-specific cl args") - else: - config["fp16"] = { - "enabled": True, - } - - # zero - if "zero_optimization" in config: - zero = config["zero_optimization"] - - # now we know for sure if zero3 is enabled - deepspeed_zero3_enable(zero.get("stage") == 3) - - # automatically assign the optimal config values based on model config - hidden_size = model.config.hidden_size - if zero.get("reduce_bucket_size") == 0: - zero["reduce_bucket_size"] = hidden_size * hidden_size - if zero.get("stage3_prefetch_bucket_size") == 0: - zero["stage3_prefetch_bucket_size"] = 0.9 * hidden_size * hidden_size - if zero.get("stage3_param_persistence_threshold") == 0: - zero["stage3_param_persistence_threshold"] = 10 * hidden_size + # fp16 / amp + # similar to the pytorch native amp - it has a bunch of optional params but we won't set any here unless the user did the work + config_fp16 = config.get("fp16") + # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and + # merged and a new release is made, delete the next line and uncomment the one after it + set_if_auto(config_fp16, "enabled", True) + # set_if_auto(config_fp16, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "amp") + # fp16 / apex + # delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided. + config_amp = config.get("amp") + set_if_auto(config_amp, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "apex") + set_if_auto(config_amp, "opt_level", args.fp16_opt_level) # keep for quick debug: # from pprint import pprint; pprint(config) diff --git a/tests/deepspeed/ds_config_zero2.json b/tests/deepspeed/ds_config_zero2.json index a516f33125ef..ef180edd1e5b 100644 --- a/tests/deepspeed/ds_config_zero2.json +++ b/tests/deepspeed/ds_config_zero2.json @@ -1,6 +1,6 @@ { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, @@ -8,36 +8,40 @@ "min_loss_scale": 1 }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 2e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 2e8, - "contiguous_gradients": true, - "cpu_offload": true - }, - "optimizer": { "type": "AdamW", "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" } }, "scheduler": { "type": "WarmupLR", "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" } }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } diff --git 
a/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3.json index cbee8a642724..6f7a80e9e455 100644 --- a/tests/deepspeed/ds_config_zero3.json +++ b/tests/deepspeed/ds_config_zero3.json @@ -1,6 +1,6 @@ { "fp16": { - "enabled": true, + "enabled": "auto", "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, @@ -8,6 +8,25 @@ "min_loss_scale": 1 }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { "stage": 3, "offload_optimizer": { @@ -21,33 +40,18 @@ "overlap_comm": true, "contiguous_gradients": true, "sub_group_size": 1e14, - "reduce_bucket_size": 0, - "stage3_prefetch_bucket_size": 0, - "stage3_param_persistence_threshold": 0, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 3e-5, - "betas": [0.8, 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 3e-5, - "warmup_num_steps": 500 - } - }, - + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", "wall_clock_breakdown": false } diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 3691fab70885..e00fb59e7128 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -176,23 +176,10 @@ def test_hf_scheduler_ds_optimizer(self): trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict) with self.assertRaises(Exception) as context: trainer.train() - self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception)) - - def test_hf_optimizer_with_offload(self): - # must not allow non-DS optimizer when using ZERO-offload - with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero2_dict = self.get_config_dict(ZERO2) - del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer - ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = True - # sanity check - should the default config change - assert ( - "cpu_offload" in ds_config_zero2_dict["zero_optimization"] - and ds_config_zero2_dict["zero_optimization"]["cpu_offload"] is True - ), "ensure the config is set up correctly" - trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict) - with self.assertRaises(Exception) as context: - trainer.train() - self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception)) + self.assertTrue( + "HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception), + f"got exception: {context.exception}", + ) def test_stage3_nvme_offload(self): @@ -210,6 +197,23 @@ def test_stage3_nvme_offload(self): assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none" # --- These tests need to run on both zero stages --- # + + @parameterized.expand(stages) + def test_hf_optimizer_with_offload(self, stage): + # must not allow non-DS optimizer when using ZERO-offload + with 
mockenv_context(**self.dist_env_1_gpu): + ds_config_dict = self.get_config_dict(stage) + del ds_config_dict["optimizer"] # force default HF Trainer optimizer + # force cpu offload + if stage == "stage2": + ds_config_dict["zero_optimization"]["cpu_offload"] = True + elif stage == "stage3": + ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception)) + @parameterized.expand(stages) def test_fake_notebook_no_launcher(self, stage): # this setup emulates a notebook where a launcher needs to be emulated by hand From 43bde0e73c29311971d4635a9df0e1561917949c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 24 Apr 2021 14:03:44 -0700 Subject: [PATCH 03/12] up version requirement --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index cebcb65d096b..a1ad2880f18b 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.3.14", + "deepspeed>=0.3.15", "docutils==0.16.0", "fairscale>0.3", "faiss-cpu", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 3b7d158d1e23..f1695e2d3e14 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -7,7 +7,7 @@ "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.3.14", + "deepspeed": "deepspeed>=0.3.15", "docutils": "docutils==0.16.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", From 2e41d63e45239b74b4ec88b19135caa2cd70dc98 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 24 Apr 2021 19:37:15 -0700 Subject: [PATCH 04/12] wip --- src/transformers/integrations.py | 217 +++++++++++++++++++---------- src/transformers/modeling_utils.py | 10 +- src/transformers/trainer.py | 61 +++++--- tests/deepspeed/test_deepspeed.py | 101 +++++++++----- tests/test_trainer.py | 15 +- 5 files changed, 264 insertions(+), 140 deletions(-) diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 2a5f5b6e3dd5..4cd8c6fb6715 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -270,36 +270,73 @@ def rewrite_logs(d): _is_deepspeed_zero3_enabled = None +_deepspeed_config = None + + +def deepspeed_config_get(): + """ + Returns a global deepspeed config or ``None`` if one is not set + """ + return _deepspeed_config + + +def deepspeed_config_set(ds_config=None): + """ + Try to auto-discover if we are about to use DeepSpeed. This will only work for scripts using cli to + pass``--deepspeed ds_config.json``. + + All other scripts should pass ``ds_config`` (path or dict) explicitly. + + Returns: a config as a dict if ``ds_config`` was passed or it was auto-discovered, ``None`` otherwise. 
+ """ + global _deepspeed_config + + # auto-discovery attempt + if ds_config is None: + if "--deepspeed" in sys.argv: + idx = sys.argv.index("--deepspeed") + ds_config = sys.argv[idx + 1] + if not os.path.exists(ds_config): + raise ValueError("--deepspeed requires a valid path to a config file") + else: + return None + + _deepspeed_config = deepspeed_parse_config(ds_config) + + return _deepspeed_config def is_deepspeed_zero3_enabled(): """ - This function answers to the question of whether DeepSpeed is going to be used and run using ZeRO Stage 3. + This function answers to the question of whether DeepSpeed is going to be used and run using ZeRO Stage 3. It can + be called before the Trainer was instantiated. It includes an auto-discovery method, see comments in the code for details. + If you aren't using a pre-made example script and writing your own, best to explicitly set the config via + ``deepspeed_config_set(ds_config=ds_config)`` **before** instantiating a model object in order to get the model + efficiently loaded across multiple-gpus. + Returns: ``True`` if either it was explicitly enabled via ``deepspeed_zero3_enable(True)`` or the auto-detector was able to derive that the ``Trainer`` will be running via DeepSpeed ZeRO stage 3. + """ + global _deepspeed_config global _is_deepspeed_zero3_enabled if _is_deepspeed_zero3_enabled is None: _is_deepspeed_zero3_enabled = False - # Try to auto-discover if we are about to use DeepSpeed with ZeRO3 enabled. This will only - # work for scripts using cli to pass --deepspeed ds_config.json. If cmd args aren't used, - # then to get the model efficiently loaded across multiple-gpus one has to explicitly call - # is_deepspeed_zero3_enabled(True) **before** instantiating a model object - if "--deepspeed" in sys.argv: - idx = sys.argv.index("--deepspeed") - ds_config = sys.argv[idx + 1] - if not os.path.exists(ds_config): - raise ValueError("--deepspeed requires a valid path to a config file") - config = deepspeed_parse_config(ds_config) - if ( - "zero_optimization" in config - and "stage" in config["zero_optimization"] - and config["zero_optimization"]["stage"] == 3 - ): - _is_deepspeed_zero3_enabled = True + + if _deepspeed_config is None: + # try auto-discovery + _deepspeed_config = deepspeed_config_set() + + if ( + _deepspeed_config is not None + and "zero_optimization" in _deepspeed_config + and "stage" in _deepspeed_config["zero_optimization"] + and _deepspeed_config["zero_optimization"]["stage"] == 3 + ): + _is_deepspeed_zero3_enabled = True return _is_deepspeed_zero3_enabled @@ -339,43 +376,29 @@ def deepspeed_parse_config(ds_config): return config -def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): - """ - Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. +def is_true(config, key): + if config is None: + return False + return bool(config.get(key)) - If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. 
- Args: - trainer: Trainer object - num_training_steps: per single gpu - resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load +def is_auto(config, key): + if config is None: + return False + return config.get(key) == "auto" - Returns: model, optimizer, lr_scheduler - """ - import deepspeed +def set_if_auto(config, key, val): + if config is None: + return + if config.get(key) == "auto": + config[key] = val - args = trainer.args - model = trainer.model +def deepspeed_config_setup(trainer): + args = trainer.args config = deepspeed_parse_config(args.deepspeed) - def is_true(config, key): - if config is None: - return False - return bool(config.get(key)) - - def is_auto(config, key): - if config is None: - return False - return config.get(key) == "auto" - - def set_if_auto(config, key, val): - if config is None: - return - if config.get(key) == "auto": - config[key] = val - # The following code translates relevant trainer's cl args into the DS config # DeepSpeed does: @@ -400,11 +423,59 @@ def set_if_auto(config, key, val): # now we know for sure if zero3 is enabled deepspeed_zero3_enable(is_zero3) - # automatically assign the optimal config values based on model config - hidden_size = model.config.hidden_size - set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) - set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) - set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) + config_optim = config.get("optimizer", {}) + if config_optim != {}: + config_optim_params = config_optim.get("params") + set_if_auto(config_optim_params, "lr", args.learning_rate) + set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) + set_if_auto(config_optim_params, "eps", args.adam_epsilon) + set_if_auto(config_optim_params, "weight_decay", args.weight_decay) + + if "scheduler" in config: + config_sched = config.get("scheduler", {}) + config_sched_params = config_sched.get("params") + set_if_auto(config_sched_params, "warmup_min_lr", 0) + set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) + set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) + # total_num_steps - will get set in deepspeed_init + + # fp16 / amp + # similar to the pytorch native amp - it has a bunch of optional params but we won't set any here unless the user did the work + config_fp16 = config.get("fp16") + # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and + # merged and a new release is made, delete the next line and uncomment the one after it + set_if_auto(config_fp16, "enabled", True) + # set_if_auto(config_fp16, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "amp") + + # fp16 / apex + # delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided. + config_amp = config.get("amp") + set_if_auto(config_amp, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "apex") + set_if_auto(config_amp, "opt_level", args.fp16_opt_level) + + # XXX: this can be DeepSpeedConfig object w/ various pre-calculated attributes and self-destruct + trainer.deepspeed_config = config + + +def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): + """ + Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. 
+ + If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. + + Args: + trainer: Trainer object + num_training_steps: per single gpu + resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load + + Returns: model, optimizer, lr_scheduler + + """ + import deepspeed + + # resume config update - some bits like `model` and `num_training_steps` only become available during train + config = trainer.deepspeed_config + model = trainer.model # Optimizer + Scheduler # Currently supported combos: @@ -419,16 +490,25 @@ def set_if_auto(config, key, val): # 3. DS scheduler + HF optimizer: No # 4. HF scheduler + DS optimizer: No + # XXX: duplicated code + is_zero2 = False + is_zero3 = False + config_zero = config.get("zero_optimization", {}) + if config_zero != {}: + if config_zero.get("stage") == 2: + is_zero2 = True + if config_zero.get("stage") == 3: + is_zero3 = True + + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) + set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) + optimizer = None config_optim = config.get("optimizer", {}) - if config_optim != {}: - config_optim_params = config_optim.get("params") - set_if_auto(config_optim_params, "lr", args.learning_rate) - set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) - set_if_auto(config_optim_params, "eps", args.adam_epsilon) - set_if_auto(config_optim_params, "weight_decay", args.weight_decay) - - else: # override only if the ds config doesn't already have this section + if config_optim == {}: offload = False if is_zero2: offload = is_true(config_zero, "cpu_offload") @@ -464,9 +544,6 @@ def set_if_auto(config, key, val): if "scheduler" in config: config_sched = config.get("scheduler", {}) config_sched_params = config_sched.get("params") - set_if_auto(config_sched_params, "warmup_min_lr", 0) - set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) - set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) set_if_auto(config_sched_params, "total_num_steps", num_training_steps) else: # override only if the ds config doesn't already have this section @@ -478,22 +555,12 @@ def set_if_auto(config, key, val): trainer.create_scheduler(num_training_steps=num_training_steps) lr_scheduler = trainer.lr_scheduler - # fp16 / amp - # similar to the pytorch native amp - it has a bunch of optional params but we won't set any here unless the user did the work - config_fp16 = config.get("fp16") - # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and - # merged and a new release is made, delete the next line and uncomment the one after it - set_if_auto(config_fp16, "enabled", True) - # set_if_auto(config_fp16, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "amp") - # fp16 / apex - # delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided. 
- config_amp = config.get("amp") - set_if_auto(config_amp, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "apex") - set_if_auto(config_amp, "opt_level", args.fp16_opt_level) - # keep for quick debug: # from pprint import pprint; pprint(config) + # update the global config object + deepspeed_config_set(ds_config=config) + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) model, optimizer, _, lr_scheduler = deepspeed.initialize( diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 82a0a99179bc..bd4d5a8eeded 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -41,7 +41,7 @@ replace_return_docstrings, ) from .generation_utils import GenerationMixin -from .integrations import is_deepspeed_zero3_enabled +from .integrations import deepspeed_config_get, is_deepspeed_zero3_enabled from .utils import logging @@ -1083,9 +1083,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P if is_deepspeed_zero3_enabled(): import deepspeed + ds_config = deepspeed_config_get() logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") - # this immediately partitions the model to avoid the overhead in time and memory copying it on CPU or each GPU first - with deepspeed.zero.Init(): + # this immediately partitions the model across all gpus, to avoid the overhead in time + # and memory copying it on CPU or each GPU first + + # XXX: param_dict will be shortly replaced by deepspeed_config + with deepspeed.zero.Init(param_dict=ds_config): model = cls(config, *model_args, **model_kwargs) else: model = cls(config, *model_args, **model_kwargs) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 9635dc40a3f5..69fa67163e61 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -43,6 +43,7 @@ run_hp_search_optuna, run_hp_search_ray, deepspeed_init, + deepspeed_config_setup, is_deepspeed_zero3_enabled, ) @@ -55,6 +56,8 @@ from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, SequentialSampler +import transformers + from . import __version__ from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator @@ -287,6 +290,32 @@ def __init__( # force device and distributed setup init explicitly args._setup_devices + # Mixed precision setup + self.use_apex = False + self.use_amp = False + self.fp16_backend = None + + if args.fp16: + if args.fp16_backend == "auto": + self.fp16_backend = "amp" if _is_native_amp_available else "apex" + else: + self.fp16_backend = args.fp16_backend + logger.info(f"Using {self.fp16_backend} fp16 backend") + + if args.fp16 and not args.deepspeed: # deepspeed manages its own fp16 + if self.fp16_backend == "amp": + self.use_amp = True + self.scaler = ShardedGradScaler() if self.sharded_ddp is not None else torch.cuda.amp.GradScaler() + else: + if not is_apex_available(): + raise ImportError( + "Using FP16 with APEX but APEX is not installed, please refer to https://www.github.com/nvidia/apex." 
+ ) + self.use_apex = True + + if args.deepspeed: + deepspeed_config_setup(self) + if model is None: if model_init is not None: self.model_init = model_init @@ -395,29 +424,6 @@ def __init__( self._signature_columns = None - # Mixed precision setup - self.use_apex = False - self.use_amp = False - self.fp16_backend = None - - if args.fp16: - if args.fp16_backend == "auto": - self.fp16_backend = "amp" if _is_native_amp_available else "apex" - else: - self.fp16_backend = args.fp16_backend - logger.info(f"Using {self.fp16_backend} fp16 backend") - - if args.fp16 and not args.deepspeed: # deepspeed manages its own fp16 - if self.fp16_backend == "amp": - self.use_amp = True - self.scaler = ShardedGradScaler() if self.sharded_ddp is not None else torch.cuda.amp.GradScaler() - else: - if not is_apex_available(): - raise ImportError( - "Using FP16 with APEX but APEX is not installed, please refer to https://www.github.com/nvidia/apex." - ) - self.use_apex = True - # Label smoothing if self.args.label_smoothing_factor != 0: self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) @@ -442,6 +448,15 @@ def __init__( # very last self._memory_tracker.stop_and_update_metrics() + def __del__(self): + # in order to get various deepspeed components into the transformers framework w/o creating + # too many changes to the API, we use a few global variables, which aren't a problem as long + # as there is one Trainer per program execution. But if multiple Trainers are used we have + # to take care to reset these globals when Trainer is destroyed + if self.deepspeed: + transformers.integrations._is_deepspeed_zero3_enabled = None + transformers.integrations._deepspeed_config = None + def add_callback(self, callback): """ Add a callback to the current list of :class:`~transformer.TrainerCallback`. 
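The ``auto`` handling used throughout the ``integrations.py`` changes above reduces to the small ``set_if_auto`` helper: a value is filled in only when the user left the literal string ``"auto"`` in the config, so explicit user settings are never overridden. A self-contained sketch of that behavior, with made-up sample values (not taken from the diffs above):

.. code-block:: python

    # Standalone sketch mirroring the `set_if_auto` helper added in integrations.py;
    # the sample values below are invented purely for illustration.
    def set_if_auto(config, key, val):
        if config is None:
            return
        if config.get(key) == "auto":
            config[key] = val


    scheduler_params = {"warmup_max_lr": "auto", "warmup_num_steps": 500}
    set_if_auto(scheduler_params, "warmup_max_lr", 3e-5)  # "auto" -> replaced with the Trainer value
    set_if_auto(scheduler_params, "warmup_num_steps", 8)  # explicit user value -> left untouched
    assert scheduler_params == {"warmup_max_lr": 3e-5, "warmup_num_steps": 500}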
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index e00fb59e7128..725297ff085d 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -42,7 +42,7 @@ from test_trainer import TrainerIntegrationCommon # noqa if is_torch_available(): - from test_trainer import get_regression_trainer # noqa + from test_trainer import RegressionModelConfig, RegressionPreTrainedModel, get_regression_trainer # noqa set_seed(42) @@ -67,7 +67,7 @@ def require_deepspeed(test_case): if is_deepspeed_available(): - from deepspeed.utils import logger # noqa + from deepspeed.utils import logger as deepspeed_logger # noqa ZERO2 = "zero2" ZERO3 = "zero3" @@ -118,12 +118,6 @@ def setUp(self): with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f: self.ds_config_dict[ZERO3] = json.load(f) - def tearDown(self): - # XXX: Fixme - this is a temporary band-aid since this global variable impacts other tests - import transformers - - transformers.integrations._is_deepspeed_zero3_enabled = None - def get_config_dict(self, stage): """ As the tests modify the dict, always make a copy """ config = deepcopy(self.ds_config_dict[stage]) @@ -182,37 +176,77 @@ def test_hf_scheduler_ds_optimizer(self): ) def test_stage3_nvme_offload(self): - - with CaptureLogger(logger) as cs: - with mockenv_context(**self.dist_env_1_gpu): - # this actually doesn't have to be on NVMe, any storage will do since this test only - # runs a simple check that we can use some directory as if it were NVMe - nvme_path = self.get_auto_remove_tmp_dir() - nvme_config = dict(device="nvme", nvme_path=nvme_path) - ds_config_zero3_dict = self.get_config_dict(ZERO3) - ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config - ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config - trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero3_dict) + with mockenv_context(**self.dist_env_1_gpu): + # this actually doesn't have to be on NVMe, any storage will do since this test only + # runs a simple check that we can use some directory as if it were NVMe + nvme_path = self.get_auto_remove_tmp_dir() + nvme_config = dict(device="nvme", nvme_path=nvme_path) + ds_config_zero3_dict = self.get_config_dict(ZERO3) + ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config + ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero3_dict) + with CaptureLogger(deepspeed_logger) as cs: trainer.train() - assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none" + self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none") # --- These tests need to run on both zero stages --- # + @parameterized.expand(stages) + def test_fp32(self, stage): + ds_config_dict = self.get_config_dict(stage) + ds_config_dict["fp16"]["enabled"] = False # force non-fp16 mode + + # XXX: do we go via from_pretrained in zero 3 here? 
need to test zero.Init(dtype=torch.float) + + # XXX: rewrite this test once fp32 is supported by DeepSpeed + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) + with self.assertRaises(Exception) as context: + trainer.train() + self.assertIn( + "ZeRO is only supported if fp16 is enabled", + str(context.exception), + f"got exception: {context.exception}", + ) + @parameterized.expand(stages) def test_hf_optimizer_with_offload(self, stage): # must not allow non-DS optimizer when using ZERO-offload + ds_config_dict = self.get_config_dict(stage) + del ds_config_dict["optimizer"] # force default HF Trainer optimizer + # force cpu offload + if stage == "stage2": + ds_config_dict["zero_optimization"]["cpu_offload"] = True + elif stage == "stage3": + ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" with mockenv_context(**self.dist_env_1_gpu): - ds_config_dict = self.get_config_dict(stage) - del ds_config_dict["optimizer"] # force default HF Trainer optimizer - # force cpu offload - if stage == "stage2": - ds_config_dict["zero_optimization"]["cpu_offload"] = True - elif stage == "stage3": - ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) with self.assertRaises(Exception) as context: trainer.train() - self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception)) + self.assertIn( + "ZeRO Offload can only work with DeepSpeed optimizers", + str(context.exception), + f"got exception: {context.exception}", + ) + + @parameterized.expand(stages) + def test_model_init(self, stage): + ds_config_dict = self.get_config_dict(stage) + + # XXX: not really testing anything that we actually need here + # need real model - move to launcher + def model_init(self): + # setup Deepspeed + a = 0 + b = 0 + double_output = False + config = RegressionModelConfig(a=a, b=b, double_output=double_output) + model = RegressionPreTrainedModel(config) + return model + + with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict, model_init=model_init) + trainer.train() @parameterized.expand(stages) def test_fake_notebook_no_launcher(self, stage): @@ -221,13 +255,12 @@ def test_fake_notebook_no_launcher(self, stage): # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture # DeepSpeed log if this test happens to run first in this pytest worker. But it will fail if # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have - # to reset `logger.handlers[0].setStream(sys.stdout)` or directly capture from the logger. - - with CaptureLogger(logger) as cs: - with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage]) + # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger. 
+ with mockenv_context(**self.dist_env_1_gpu): + trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage]) + with CaptureLogger(deepspeed_logger) as cs: trainer.train() - assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none" + self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none") @parameterized.expand(stages) def test_early_get_last_lr(self, stage): diff --git a/tests/test_trainer.py b/tests/test_trainer.py index b5071783f2bd..ca495cbf14cb 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -206,16 +206,21 @@ def get_regression_trainer(a=0, b=0, double_output=False, train_len=64, eval_len label_names = kwargs.get("label_names", None) train_dataset = RegressionDataset(length=train_len, label_names=label_names) eval_dataset = RegressionDataset(length=eval_len, label_names=label_names) - if pretrained: - config = RegressionModelConfig(a=a, b=b, double_output=double_output) - model = RegressionPreTrainedModel(config) + + model_init = kwargs.pop("model_init", None) + if model_init is not None: + model = None else: - model = RegressionModel(a=a, b=b, double_output=double_output) + if pretrained: + config = RegressionModelConfig(a=a, b=b, double_output=double_output) + model = RegressionPreTrainedModel(config) + else: + model = RegressionModel(a=a, b=b, double_output=double_output) + compute_metrics = kwargs.pop("compute_metrics", None) data_collator = kwargs.pop("data_collator", None) optimizers = kwargs.pop("optimizers", (None, None)) output_dir = kwargs.pop("output_dir", "./regression") - model_init = kwargs.pop("model_init", None) args = RegressionTrainingArguments(output_dir, a=a, b=b, **kwargs) return Trainer( From 3ee1a0772669ce7b8932b82fea9a252d4824d3ff Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 25 Apr 2021 21:42:25 -0700 Subject: [PATCH 05/12] massive rewrite --- docs/source/main_classes/trainer.rst | 18 +- src/transformers/integrations.py | 352 ++++++++++++--------------- src/transformers/modeling_utils.py | 5 +- src/transformers/trainer.py | 68 +++--- src/transformers/training_args.py | 15 +- tests/deepspeed/test_deepspeed.py | 54 ++-- 6 files changed, 243 insertions(+), 269 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index 523351f5ab22..f773177e9cf3 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -1743,18 +1743,20 @@ context manager (which is also a function decorator), like so: As you can see this gives you a randomly initialized model. If you want to use a pretrained model, ``model_class.from_pretrained`` will activate this feature as long as -``is_deepspeed_zero3_enabled()`` returns ``True``, which can be set manually via ``deepspeed_zero3_enable(True)``. -Therefore to enable this feature here is the required sequence: +``is_deepspeed_zero3_enabled()`` returns ``True``, which currently is setup by the +class:`~transformers.TrainingArguments` object if the passed DeepSpeed configuration file contains ZeRO-3 config +section. Thus you must create the class:`~transformers.TrainingArguments` object **before** calling +``from_pretrained``. Here is an example of a possible sequence: .. 
code-block:: python - from transformers.integrations import deepspeed_zero3_enable - deepspeed_zero3_enable(True) - model = T5ForConditionalGeneration.from_pretrained("t5-small") + from transformers import AutoModel, Trainer, TrainingArguments + training_args = TrainingArguments(..., deepspeed=ds_config) + model = AutoModel.from_pretrained("t5-small") + trainer = Trainer(model=model, args=training_args, ...) -If you're using ``Trainer`` command line arguments which include ``--deepspeed ds_config.json`` with ZeRO-3 config -enabled, then you can skip ``deepspeed_zero3_enable(True)`` as it will try to discover whether it'll be run under -ZeRO-3 and ``from_pretrained`` will automatically activate this feature. +If you're using the official example scripts and your command line arguments include ``--deepspeed ds_config.json`` +with ZeRO-3 config enabled, then everything is already done for you, since this is how example scripts are written. Note: If the fp16 weights of the model can't fit onto the memory of a single GPU this feature must be used. diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 4cd8c6fb6715..8ca841be7dc0 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -19,8 +19,8 @@ import json import numbers import os -import sys import tempfile +import weakref from copy import deepcopy from pathlib import Path @@ -269,192 +269,180 @@ def rewrite_logs(d): return new_d -_is_deepspeed_zero3_enabled = None -_deepspeed_config = None +def is_true(config, key): + if config is None: + return False + return bool(config.get(key)) -def deepspeed_config_get(): - """ - Returns a global deepspeed config or ``None`` if one is not set - """ - return _deepspeed_config +def is_auto(config, key): + if config is None: + return False + return config.get(key) == "auto" -def deepspeed_config_set(ds_config=None): - """ - Try to auto-discover if we are about to use DeepSpeed. This will only work for scripts using cli to - pass``--deepspeed ds_config.json``. +def set_if_auto(config, key, val): + if config is None: + return + if config.get(key) == "auto": + config[key] = val - All other scripts should pass ``ds_config`` (path or dict) explicitly. - Returns: a config as a dict if ``ds_config`` was passed or it was auto-discovered, ``None`` otherwise. +class DeepSpeedConfigHF: """ - global _deepspeed_config - - # auto-discovery attempt - if ds_config is None: - if "--deepspeed" in sys.argv: - idx = sys.argv.index("--deepspeed") - ds_config = sys.argv[idx + 1] - if not os.path.exists(ds_config): - raise ValueError("--deepspeed requires a valid path to a config file") - else: - return None - - _deepspeed_config = deepspeed_parse_config(ds_config) + This object contains Deepspeed configuration and can be quickly queried for things like zero stage. - return _deepspeed_config + We store a ``weakref`` proxy of this object in the module's global to be able to access the config from areas where + Trainer is not available. - -def is_deepspeed_zero3_enabled(): + The ``DeepSpeedConfigHF`` object is meant to be created during ``TrainingArguments`` object creation and has the + same lifespan as the latter. """ - This function answers to the question of whether DeepSpeed is going to be used and run using ZeRO Stage 3. It can - be called before the Trainer was instantiated. - It includes an auto-discovery method, see comments in the code for details. 
+ def __init__(self, args): + self.config = None + self.stage = 0 + self.offload = False - If you aren't using a pre-made example script and writing your own, best to explicitly set the config via - ``deepspeed_config_set(ds_config=ds_config)`` **before** instantiating a model object in order to get the model - efficiently loaded across multiple-gpus. + dep_version_check("deepspeed") - Returns: ``True`` if either it was explicitly enabled via ``deepspeed_zero3_enable(True)`` or the auto-detector was - able to derive that the ``Trainer`` will be running via DeepSpeed ZeRO stage 3. + self.config_process(args) - """ - global _deepspeed_config - global _is_deepspeed_zero3_enabled - if _is_deepspeed_zero3_enabled is None: - _is_deepspeed_zero3_enabled = False - - if _deepspeed_config is None: - # try auto-discovery - _deepspeed_config = deepspeed_config_set() + # set global weakref object + deepspeed_config_hf_set(self) - if ( - _deepspeed_config is not None - and "zero_optimization" in _deepspeed_config - and "stage" in _deepspeed_config["zero_optimization"] - and _deepspeed_config["zero_optimization"]["stage"] == 3 - ): - _is_deepspeed_zero3_enabled = True + def is_zero2(self): + return self.stage == 2 - return _is_deepspeed_zero3_enabled + def is_zero3(self): + return self.stage == 3 + def is_offload(self): + return self.offload -def deepspeed_zero3_enable(enable=True): - """ - ``is_deepspeed_zero3_enabled()`` tries to derive automatically if DeepSpeed ZeRO 3 is going to be used by looking - at ``sys.argv`` which may or may contain information about where to find the DeepSpeed config if any. - - This function allows for explicit enabling/disabling of this global flag. + def config_process(self, args): + """ + 1. load json if the ``args.deepspeed`` is a path + 2. replace any ``auto`` values in the config with the correct or recommended value - Args: - enable: if set to ``True`` will make ``is_deepspeed_zero3_enabled()`` return ``True`` - """ - global _is_deepspeed_zero3_enabled - _is_deepspeed_zero3_enabled = enable + This is done as early as possible, before model is created, to allow ``is_deepspeed_zero3_enabled`` query and + getting to the early deepspeed config object during ``zero.Init()`` which needs whether fp16 is enabled, dtype, + etc. + """ + config_file_or_dict = args.deepspeed + if isinstance(config_file_or_dict, dict): + # Don't modify user's data should they want to reuse it (e.g. 
in tests), because once we + # modified it, it will not be accepted here again, since `auto` values would have been overriden + config = deepcopy(config_file_or_dict) + elif isinstance(config_file_or_dict, str): + with io.open(config_file_or_dict, "r", encoding="utf-8") as f: + config = json.load(f) + else: + raise ValueError("expecting either a path to a config file or a pre-populated dict") + + self.config = config + + # DeepSpeed does: + # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps + train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps + set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) + set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) + set_if_auto(config, "train_batch_size", train_batch_size) + set_if_auto(config, "gradient_clipping", args.max_grad_norm) + + # zero + config_zero = config.get("zero_optimization", {}) + self.stage = config_zero.get("stage", 0) + + config_optim = config.get("optimizer", {}) + if config_optim != {}: + config_optim_params = config_optim.get("params") + set_if_auto(config_optim_params, "lr", args.learning_rate) + set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) + set_if_auto(config_optim_params, "eps", args.adam_epsilon) + set_if_auto(config_optim_params, "weight_decay", args.weight_decay) -def deepspeed_parse_config(ds_config): - """ - If ``ds_config`` isn't already a dict, read it from the config file. + config_sched = config.get("scheduler", {}) + if config_sched != {}: + config_sched_params = config_sched.get("params") + set_if_auto(config_sched_params, "warmup_min_lr", 0) + set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) + set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) + # total_num_steps - will get set in deepspeed_init + + # fp16 + if args.fp16: + fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" + else: + fp16_backend = None + + # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set + # any here unless the user did the work + config_fp16 = config.get("fp16") + # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and + # merged and a new release is made, delete the next line and uncomment the one after it + set_if_auto(config_fp16, "enabled", True) + # set_if_auto(config_fp16, "enabled", fp16_backend == "amp") + + # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any + # ZeRO features, so probably best to be avoided. + config_amp = config.get("amp") + set_if_auto(config_amp, "enabled", fp16_backend == "apex") + set_if_auto(config_amp, "opt_level", args.fp16_opt_level) + + config_zero = config.get("zero_optimization", {}) + if self.is_zero2(): + self.offload = is_true(config_zero, "cpu_offload") + elif self.is_zero3(): + offload_devices = ["cpu", "nvme"] + if config_zero.get("offload_optimizer", {}).get("device") in offload_devices: + self.offload = True + if config_zero.get("offload_param", {}).get("device") in offload_devices: + self.offload = True - If it's already a dict, return a copy of it, so that we can freely modify it. - """ - dep_version_check("deepspeed") + def config_finalize(self, args, model, num_training_steps): + """ + This stage is run after we have the model and know num_training_steps. 
- if isinstance(ds_config, dict): - # Don't modify user's data should they want to reuse it (e.g. in tests), because once we - # modified it, it will not be accepted here again, since some config params must be not set by users - config = deepcopy(ds_config) - elif isinstance(ds_config, str): - with io.open(ds_config, "r", encoding="utf-8") as f: - config = json.load(f) - else: - raise ValueError("expecting either a path to a config file or a pre-populated dict") + Now we we can complete the configuration process. - return config + """ + config = self.config + + # zero + config_zero = config.get("zero_optimization", {}) + if self.is_zero3(): + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) + set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) + + # scheduler + config_sched = config.get("scheduler", {}) + config_sched_params = config_sched.get("params", {}) + set_if_auto(config_sched_params, "total_num_steps", num_training_steps) -def is_true(config, key): - if config is None: - return False - return bool(config.get(key)) +# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle +_deepspeed_config_hf_weak_ref = None -def is_auto(config, key): - if config is None: - return False - return config.get(key) == "auto" +def deepspeed_config_hf_set(deepspeed_config_hf_obj): + # this is a special weakref global object to allow us to get to Deepspeed config from APIs + # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. + global _deepspeed_config_hf_weak_ref + # will go away automatically when DeepSpeedConfigHF is destroyed (when TrainingArguments is destroyed) + _deepspeed_config_hf_weak_ref = weakref.ref(deepspeed_config_hf_obj) -def set_if_auto(config, key, val): - if config is None: - return - if config.get(key) == "auto": - config[key] = val +def is_deepspeed_zero3_enabled(): + return _deepspeed_config_hf_weak_ref().is_zero3() if _deepspeed_config_hf_weak_ref() is not None else False -def deepspeed_config_setup(trainer): - args = trainer.args - config = deepspeed_parse_config(args.deepspeed) - - # The following code translates relevant trainer's cl args into the DS config - - # DeepSpeed does: - # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps - # therefore we just need to set - train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps - set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) - set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) - set_if_auto(config, "train_batch_size", train_batch_size) - set_if_auto(config, "gradient_clipping", args.max_grad_norm) - - # zero - is_zero2 = False - is_zero3 = False - config_zero = config.get("zero_optimization", {}) - if config_zero != {}: - if config_zero.get("stage") == 2: - is_zero2 = True - if config_zero.get("stage") == 3: - is_zero3 = True - - # now we know for sure if zero3 is enabled - deepspeed_zero3_enable(is_zero3) - - config_optim = config.get("optimizer", {}) - if config_optim != {}: - config_optim_params = config_optim.get("params") - set_if_auto(config_optim_params, "lr", args.learning_rate) - set_if_auto(config_optim_params, "betas", 
[args.adam_beta1, args.adam_beta2]) - set_if_auto(config_optim_params, "eps", args.adam_epsilon) - set_if_auto(config_optim_params, "weight_decay", args.weight_decay) - - if "scheduler" in config: - config_sched = config.get("scheduler", {}) - config_sched_params = config_sched.get("params") - set_if_auto(config_sched_params, "warmup_min_lr", 0) - set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) - set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) - # total_num_steps - will get set in deepspeed_init - - # fp16 / amp - # similar to the pytorch native amp - it has a bunch of optional params but we won't set any here unless the user did the work - config_fp16 = config.get("fp16") - # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and - # merged and a new release is made, delete the next line and uncomment the one after it - set_if_auto(config_fp16, "enabled", True) - # set_if_auto(config_fp16, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "amp") - - # fp16 / apex - # delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided. - config_amp = config.get("amp") - set_if_auto(config_amp, "enabled", trainer.fp16_backend is not None and trainer.fp16_backend == "apex") - set_if_auto(config_amp, "opt_level", args.fp16_opt_level) - - # XXX: this can be DeepSpeedConfig object w/ various pre-calculated attributes and self-destruct - trainer.deepspeed_config = config +def deepspeed_config(): + return _deepspeed_config_hf_weak_ref().config if _deepspeed_config_hf_weak_ref() is not None else None def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): @@ -473,10 +461,14 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): """ import deepspeed - # resume config update - some bits like `model` and `num_training_steps` only become available during train - config = trainer.deepspeed_config model = trainer.model + deepspeed_config_hf = trainer.args.deepspeed_config_hf + deepspeed_config_hf.config_finalize(trainer.args, model, num_training_steps) + + # resume config update - some bits like `model` and `num_training_steps` only become available during train + config = deepspeed_config_hf.config + # Optimizer + Scheduler # Currently supported combos: # 1. DS scheduler + DS optimizer: Yes @@ -490,46 +482,16 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # 3. DS scheduler + HF optimizer: No # 4. 
HF scheduler + DS optimizer: No - # XXX: duplicated code - is_zero2 = False - is_zero3 = False - config_zero = config.get("zero_optimization", {}) - if config_zero != {}: - if config_zero.get("stage") == 2: - is_zero2 = True - if config_zero.get("stage") == 3: - is_zero3 = True - - # automatically assign the optimal config values based on model config - hidden_size = model.config.hidden_size - set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) - set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) - set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) - optimizer = None - config_optim = config.get("optimizer", {}) - if config_optim == {}: - offload = False - if is_zero2: - offload = is_true(config_zero, "cpu_offload") - elif is_zero3: - config_offload_optimizer = config_zero.get("offload_optimizer", {}) - config_offload_param = config_zero.get("offload_param", {}) - offload_devices = ["cpu", "nvme"] - if ( - config_offload_optimizer.get("device") in offload_devices - or config_offload_param.get("device") in offload_devices - ): - offload = True - if offload: + if "optimizer" not in config: + if deepspeed_config_hf.is_offload(): raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers") # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. # But trainer uses AdamW by default. - # To use other optimizers so using a different scheduler requires voiding warranty with: `zero_allow_untested_optimizer` trainer.create_optimizer() optimizer = trainer.optimizer - # flag that this is non-native optimizer + # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` config["zero_allow_untested_optimizer"] = True # DS schedulers (deepspeed/runtime/lr_schedules.py): @@ -541,12 +503,7 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # WarmupLR | constant_with_warmup | get_constant_schedule_with_warmup | w/ warmup_min_lr=0 # WarmupDecayLR| linear | get_linear_schedule_with_warmup | lr_scheduler = None - if "scheduler" in config: - config_sched = config.get("scheduler", {}) - config_sched_params = config_sched.get("params") - set_if_auto(config_sched_params, "total_num_steps", num_training_steps) - - else: # override only if the ds config doesn't already have this section + if "scheduler" not in config: if "optimizer" in config: # to make this option work, we need to init DS optimizer first, then init HS scheduler, # then pass the HS scheduler to DS init, which is not possible at the moment @@ -558,9 +515,6 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # keep for quick debug: # from pprint import pprint; pprint(config) - # update the global config object - deepspeed_config_set(ds_config=config) - model_parameters = filter(lambda p: p.requires_grad, model.parameters()) model, optimizer, _, lr_scheduler = deepspeed.initialize( diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index bd4d5a8eeded..d25341dea6d1 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -41,7 +41,7 @@ replace_return_docstrings, ) from .generation_utils import GenerationMixin -from .integrations import deepspeed_config_get, is_deepspeed_zero3_enabled +from .integrations import deepspeed_config, is_deepspeed_zero3_enabled from .utils import logging @@ -1083,13 +1083,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: 
Optional[Union[str, os.P if is_deepspeed_zero3_enabled(): import deepspeed - ds_config = deepspeed_config_get() logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") # this immediately partitions the model across all gpus, to avoid the overhead in time # and memory copying it on CPU or each GPU first # XXX: param_dict will be shortly replaced by deepspeed_config - with deepspeed.zero.Init(param_dict=ds_config): + with deepspeed.zero.Init(param_dict=deepspeed_config()): model = cls(config, *model_args, **model_kwargs) else: model = cls(config, *model_args, **model_kwargs) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 69fa67163e61..4eae8eaef70e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -43,7 +43,6 @@ run_hp_search_optuna, run_hp_search_ray, deepspeed_init, - deepspeed_config_setup, is_deepspeed_zero3_enabled, ) @@ -56,8 +55,6 @@ from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, SequentialSampler -import transformers - from . import __version__ from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator @@ -290,32 +287,6 @@ def __init__( # force device and distributed setup init explicitly args._setup_devices - # Mixed precision setup - self.use_apex = False - self.use_amp = False - self.fp16_backend = None - - if args.fp16: - if args.fp16_backend == "auto": - self.fp16_backend = "amp" if _is_native_amp_available else "apex" - else: - self.fp16_backend = args.fp16_backend - logger.info(f"Using {self.fp16_backend} fp16 backend") - - if args.fp16 and not args.deepspeed: # deepspeed manages its own fp16 - if self.fp16_backend == "amp": - self.use_amp = True - self.scaler = ShardedGradScaler() if self.sharded_ddp is not None else torch.cuda.amp.GradScaler() - else: - if not is_apex_available(): - raise ImportError( - "Using FP16 with APEX but APEX is not installed, please refer to https://www.github.com/nvidia/apex." - ) - self.use_apex = True - - if args.deepspeed: - deepspeed_config_setup(self) - if model is None: if model_init is not None: self.model_init = model_init @@ -424,6 +395,30 @@ def __init__( self._signature_columns = None + # XXX: can move this back to where it was + # Mixed precision setup + self.use_apex = False + self.use_amp = False + self.fp16_backend = None + + if args.fp16: + if args.fp16_backend == "auto": + self.fp16_backend = "amp" if _is_native_amp_available else "apex" + else: + self.fp16_backend = args.fp16_backend + logger.info(f"Using {self.fp16_backend} fp16 backend") + + if args.fp16 and not args.deepspeed: # deepspeed manages its own fp16 + if self.fp16_backend == "amp": + self.use_amp = True + self.scaler = ShardedGradScaler() if self.sharded_ddp is not None else torch.cuda.amp.GradScaler() + else: + if not is_apex_available(): + raise ImportError( + "Using FP16 with APEX but APEX is not installed, please refer to https://www.github.com/nvidia/apex." 
+ ) + self.use_apex = True + # Label smoothing if self.args.label_smoothing_factor != 0: self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) @@ -448,14 +443,13 @@ def __init__( # very last self._memory_tracker.stop_and_update_metrics() - def __del__(self): - # in order to get various deepspeed components into the transformers framework w/o creating - # too many changes to the API, we use a few global variables, which aren't a problem as long - # as there is one Trainer per program execution. But if multiple Trainers are used we have - # to take care to reset these globals when Trainer is destroyed - if self.deepspeed: - transformers.integrations._is_deepspeed_zero3_enabled = None - transformers.integrations._deepspeed_config = None + # def __del__(self): + # # in order to get various deepspeed components into the transformers framework w/o creating + # # too many changes to the API, we use a few global variables, which aren't a problem as long + # # as there is one Trainer per program execution. But if multiple Trainers are used we have + # # to take care to reset these globals when Trainer is destroyed + # if self.deepspeed: + # transformers.integrations._deepspeed_config = None def add_callback(self, callback): """ diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index be98825e2282..4f746d622519 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -70,9 +70,6 @@ class TrainingArguments: `__ arguments that can be specified on the command line. - - - Parameters: output_dir (:obj:`str`): The output directory where the model predictions and checkpoints will be written. @@ -618,6 +615,18 @@ def __post_init__(self): elif ShardedDDPOption.ZERO_DP_2 in self.sharded_ddp and ShardedDDPOption.ZERO_DP_3 in self.sharded_ddp: raise ValueError("`--sharded_ddp zero_dp_2` is not compatible with `--sharded_ddp zero_dp_3`.") + if self.deepspeed: + # - must be run very last in arg parsing, since it will use a lot of these settings. + # - must be run before the model is created. + from transformers.integrations import DeepSpeedConfigHF + + # will be used later by trainer + self.deepspeed_config_hf = DeepSpeedConfigHF(self) + # else: + # # reset the previous global state if any - mainly for tests inside the same process - + # import transformers.integrations + # transformers.integrations.deepspeed_config_reset() + def __repr__(self): # We override the default repr to remove deprecated arguments from the repr. This method should be removed once # those deprecated arguments are removed form TrainingArguments. 
(TODO: v5) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 725297ff085d..cf455172f1f8 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -68,6 +68,7 @@ def require_deepspeed(test_case): if is_deepspeed_available(): from deepspeed.utils import logger as deepspeed_logger # noqa + from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled # noqa ZERO2 = "zero2" ZERO3 = "zero3" @@ -229,25 +230,6 @@ def test_hf_optimizer_with_offload(self, stage): f"got exception: {context.exception}", ) - @parameterized.expand(stages) - def test_model_init(self, stage): - ds_config_dict = self.get_config_dict(stage) - - # XXX: not really testing anything that we actually need here - # need real model - move to launcher - def model_init(self): - # setup Deepspeed - a = 0 - b = 0 - double_output = False - config = RegressionModelConfig(a=a, b=b, double_output=double_output) - model = RegressionPreTrainedModel(config) - return model - - with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict, model_init=model_init) - trainer.train() - @parameterized.expand(stages) def test_fake_notebook_no_launcher(self, stage): # this setup emulates a notebook where a launcher needs to be emulated by hand @@ -479,6 +461,38 @@ def test_can_resume_training_normal(self, stage): self.assertEqual(b, b1) self.check_trainer_state_are_the_same(state, state1) + def test_config_object(self): + # test that we can switch from zero2 to zero3 in the same process for example + # test is_zero, etc. + output_dir = self.get_auto_remove_tmp_dir() + kwargs = dict(output_dir=output_dir, train_len=8) + + with mockenv_context(**self.dist_env_1_gpu): + ds_config_zero3_dict = self.get_config_dict("zero3") + ds_config_zero2_dict = self.get_config_dict("zero2") + + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + self.assertTrue(is_deepspeed_zero3_enabled()) + + # test we can repeat that and with train this time + trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs) + trainer.train() + self.assertTrue(is_deepspeed_zero3_enabled()) + + # test zero3 is disabled + trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs) + self.assertFalse(is_deepspeed_zero3_enabled()) + + # check config obj + config = deepspeed_config() + self.assertTrue(bool(config), "Deepspeed config should be accessible") + + del trainer + # now weakref should gc the global and we shouldn't get anything here + config = deepspeed_config() + self.assertFalse(is_deepspeed_zero3_enabled()) + self.assertFalse(bool(config), "Deepspeed config should not be accessible") + @slow @require_deepspeed @@ -611,6 +625,7 @@ def run_trainer( --adafactor --source_lang en --target_lang ro + --report_to none """.split() args.extend(["--source_prefix", '"translate English to Romanian: "']) @@ -680,6 +695,7 @@ def test_clm(self, stage): --num_train_epochs 1 --warmup_steps 8 --block_size 128 + --report_to none """.split() ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split() From 0f221d2cce751182c455295ef2c03a2c1bd3d66b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 25 Apr 2021 22:15:24 -0700 Subject: [PATCH 06/12] cleanup --- src/transformers/integrations.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 8ca841be7dc0..8513d54dab98 100644 --- 
a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -438,11 +438,17 @@ def deepspeed_config_hf_set(deepspeed_config_hf_obj): def is_deepspeed_zero3_enabled(): - return _deepspeed_config_hf_weak_ref().is_zero3() if _deepspeed_config_hf_weak_ref() is not None else False + if _deepspeed_config_hf_weak_ref is not None and _deepspeed_config_hf_weak_ref() is not None: + return _deepspeed_config_hf_weak_ref().is_zero3() + else: + return False def deepspeed_config(): - return _deepspeed_config_hf_weak_ref().config if _deepspeed_config_hf_weak_ref() is not None else None + if _deepspeed_config_hf_weak_ref is not None and _deepspeed_config_hf_weak_ref() is not None: + return _deepspeed_config_hf_weak_ref().config + else: + return None def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): From 645d624100878628938c612adcab4acb60b0fe59 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 25 Apr 2021 22:42:09 -0700 Subject: [PATCH 07/12] cleanup --- src/transformers/integrations.py | 4 ++-- src/transformers/trainer.py | 8 -------- src/transformers/training_args.py | 6 +----- 3 files changed, 3 insertions(+), 15 deletions(-) diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 8513d54dab98..28e007094e44 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -292,8 +292,8 @@ class DeepSpeedConfigHF: """ This object contains Deepspeed configuration and can be quickly queried for things like zero stage. - We store a ``weakref`` proxy of this object in the module's global to be able to access the config from areas where - Trainer is not available. + We store a ``weakref`` of this object in the module's global to be able to access the config from areas where the + Trainer is not available (e.g. `from_pretrained` and `_get_resized_embeddings`). The ``DeepSpeedConfigHF`` object is meant to be created during ``TrainingArguments`` object creation and has the same lifespan as the latter. diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4eae8eaef70e..61c6f81a4856 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -443,14 +443,6 @@ def __init__( # very last self._memory_tracker.stop_and_update_metrics() - # def __del__(self): - # # in order to get various deepspeed components into the transformers framework w/o creating - # # too many changes to the API, we use a few global variables, which aren't a problem as long - # # as there is one Trainer per program execution. But if multiple Trainers are used we have - # # to take care to reset these globals when Trainer is destroyed - # if self.deepspeed: - # transformers.integrations._deepspeed_config = None - def add_callback(self, callback): """ Add a callback to the current list of :class:`~transformer.TrainerCallback`. diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 4f746d622519..913ed68dec0f 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -620,12 +620,8 @@ def __post_init__(self): # - must be run before the model is created. 
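Here is a minimal, self-contained sketch of the weak-reference pattern behind ``is_deepspeed_zero3_enabled()`` and ``deepspeed_config()``. The names used below (``_config_weak_ref``, ``DeepSpeedConfigSketch``, ``is_zero3_enabled``) are simplified stand-ins for the real ``_deepspeed_config_hf_weak_ref``, ``DeepSpeedConfigHF`` and ``is_deepspeed_zero3_enabled``, and the config dict is made up; the point is only to show why both ``None`` checks are needed and what happens once the owning object is garbage collected:

.. code-block:: python

    import weakref

    _config_weak_ref = None  # module-level global, populated when a config object is created


    class DeepSpeedConfigSketch:
        def __init__(self, config):
            global _config_weak_ref
            self.config = config
            # store only a weak reference, so this global never keeps the object alive
            _config_weak_ref = weakref.ref(self)

        def is_zero3(self):
            return self.config.get("zero_optimization", {}).get("stage") == 3


    def is_zero3_enabled():
        # guard both failure modes: the global was never set, or the referent was gc'ed
        if _config_weak_ref is not None and _config_weak_ref() is not None:
            return _config_weak_ref().is_zero3()
        return False


    holder = DeepSpeedConfigSketch({"zero_optimization": {"stage": 3}})
    assert is_zero3_enabled()

    del holder  # once the last strong reference is gone, the weakref resolves to None
    assert not is_zero3_enabled()

This is the behavior ``test_config_object`` relies on: after ``del trainer`` the owning ``TrainingArguments`` object goes away, so ``deepspeed_config()`` falls back to ``None`` and ``is_deepspeed_zero3_enabled()`` to ``False``.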
from transformers.integrations import DeepSpeedConfigHF - # will be used later by trainer + # will be used later by the Trainer (leave self.deepspeed unmodified in case a user relies on it not to be modified) self.deepspeed_config_hf = DeepSpeedConfigHF(self) - # else: - # # reset the previous global state if any - mainly for tests inside the same process - - # import transformers.integrations - # transformers.integrations.deepspeed_config_reset() def __repr__(self): # We override the default repr to remove deprecated arguments from the repr. This method should be removed once From 84748d5a6541d1abb8fea05fe834c5b4666f072d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 26 Apr 2021 10:08:25 -0700 Subject: [PATCH 08/12] Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/main_classes/trainer.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index f773177e9cf3..651e78028cfb 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -1318,7 +1318,7 @@ values look like, but we highly recommend using the one with multiple ``auto`` s "pin_memory": true }, "offload_param": { - "device": "cpu" + "device": "cpu", "pin_memory": true }, "overlap_comm": true, @@ -1497,7 +1497,7 @@ For example, for ``WarmupDecayLR``, you can use the following entry: "total_num_steps": "auto", "warmup_min_lr": "auto", "warmup_max_lr": "auto", - "warmup_num_steps": "auto" + "warmup_num_steps": "auto", "last_batch_iteration": -1, } } @@ -1745,7 +1745,7 @@ As you can see this gives you a randomly initialized model. If you want to use a pretrained model, ``model_class.from_pretrained`` will activate this feature as long as ``is_deepspeed_zero3_enabled()`` returns ``True``, which currently is setup by the class:`~transformers.TrainingArguments` object if the passed DeepSpeed configuration file contains ZeRO-3 config -section. Thus you must create the class:`~transformers.TrainingArguments` object **before** calling +section. Thus you must create the :class:`~transformers.TrainingArguments` object **before** calling ``from_pretrained``. Here is an example of a possible sequence: .. code-block:: python From fb195e7ba65a93d59bb21ec1193b7e84c7009a4a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 26 Apr 2021 10:12:24 -0700 Subject: [PATCH 09/12] consistent json commas --- docs/source/main_classes/trainer.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index 651e78028cfb..4448d02d0de0 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -601,7 +601,7 @@ with DeepSpeed is to have at least the following configuration in the configurat "overlap_comm": true, "contiguous_gradients": true, "cpu_offload": true - }, + } } which enables ``cpu_offload`` and some other important features. 
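Since ``from_pretrained`` activates the ZeRO-3 handling only when ``is_deepspeed_zero3_enabled()`` already returns ``True``, the :class:`~transformers.TrainingArguments` object has to exist first. Here is a short sketch of that sequence; the model name and config path are illustrative, any ZeRO-3 configuration file would do:

.. code-block:: python

    from transformers import AutoModelForSeq2SeqLM, TrainingArguments

    # 1. create the TrainingArguments first: its __post_init__ parses the DeepSpeed
    #    config and makes is_deepspeed_zero3_enabled() report the right stage
    training_args = TrainingArguments(
        output_dir="output_dir",
        deepspeed="ds_config_zero3.json",  # illustrative path to a ZeRO-3 config
    )

    # 2. only now load the pretrained weights, so that ZeRO-3 is detected and the
    #    model can be partitioned across GPUs while it is being instantiated
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")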
You may experiment with the buffer sizes, you will @@ -968,7 +968,7 @@ The following is an example configuration for ZeRO stage 3: "stage3_max_live_parameters": 1e9, "stage3_max_reuse_distance": 1e9, "stage3_gather_fp16_weights_on_model_save": true - }, + } } If you are getting OOMs, because your model or activations don't fit into the GPU memory and you have unutilized CPU @@ -1380,7 +1380,7 @@ Here is an example of the auto-configured ``optimizer`` entry for ``AdamW``: "eps": "auto", "weight_decay": "auto" } - } + } } @@ -1408,7 +1408,7 @@ You can also set the values explicitly: "eps": 1e-8, "weight_decay": 3e-7 } - } + } } But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed @@ -1494,11 +1494,11 @@ For example, for ``WarmupDecayLR``, you can use the following entry: "scheduler": { "type": "WarmupDecayLR", "params": { + "last_batch_iteration": -1, "total_num_steps": "auto", "warmup_min_lr": "auto", "warmup_max_lr": "auto", - "warmup_num_steps": "auto", - "last_batch_iteration": -1, + "warmup_num_steps": "auto" } } } @@ -1524,7 +1524,7 @@ To configure pytorch AMP-like mode set: "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 - }, + } } and the :class:`~transformers.Trainer` will automatically enable or disable it based on the value of @@ -1547,7 +1547,7 @@ You can also enable/disable this mode explicitly: "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 - }, + } } But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed @@ -1561,7 +1561,7 @@ To configure apex AMP-like mode set: "amp": { "enabled": "auto", - "opt_level": "auto", + "opt_level": "auto" } and the :class:`~transformers.Trainer` will automatically configure it based on the values of ``args.fp16_backend`` and @@ -1605,7 +1605,7 @@ You can also set the value explicitly: .. code-block:: json { - "gradient_accumulation_steps": 3, + "gradient_accumulation_steps": 3 } But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed @@ -1630,7 +1630,7 @@ You can also set the value explicitly: .. code-block:: json { - "gradient_clipping": 1.0, + "gradient_clipping": 1.0 } But then you're on your own synchronizing the :class:`~transformers.Trainer` command line arguments and the DeepSpeed From 21ac0813d8d3604368f03530518bcd291eaacdd7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 26 Apr 2021 10:21:53 -0700 Subject: [PATCH 10/12] act on suggestions --- docs/source/main_classes/trainer.rst | 10 +++--- src/transformers/integrations.py | 50 ++++++++++++---------------- src/transformers/trainer.py | 1 - 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index 4448d02d0de0..cdc796c017de 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -612,8 +612,8 @@ For a practical usage example of this type of deployment, please, see this `post You may also try the ZeRO-3 with CPU and NVMe offload as explained further in this document. -TODO: Benchmark whether we can get better performance out of ZeRO-3 vs. ZeRO-2 on a single GPU, and then recommend -ZeRO-3 config as starting one. + Notes: @@ -1532,8 +1532,10 @@ and the :class:`~transformers.Trainer` will automatically enable or disable it b This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed. 
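Strict JSON does not allow trailing commas, so the configuration snippets have to stay consistent or a file assembled from them will not parse at all. A quick way to catch such mistakes before handing the file to the launcher; the file name below is illustrative:

.. code-block:: python

    import json

    # a stray trailing comma, e.g. {"gradient_clipping": 1.0,}, or a missing comma
    # between entries makes json.load() raise instead of returning a dict
    try:
        with open("ds_config_zero3.json") as f:  # illustrative file name
            ds_config = json.load(f)
    except json.JSONDecodeError as e:
        raise SystemExit(f"invalid DeepSpeed config: {e}")

    print(ds_config.get("zero_optimization", {}).get("stage"))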
-XXX: However, at the moment DeepSpeed doesn't supported fp32 mode, though it will become available soon. Until then it -will be always set to ``true``. +.. note:: + + At the moment DeepSpeed doesn't supported fp32 mode, though it will become available soon. Until then it will be + always set to ``true``. You can also enable/disable this mode explicitly: diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 9ba58f2ba626..a2d6743a1e2a 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -269,19 +269,13 @@ def rewrite_logs(d): return new_d -def is_true(config, key): +def _is_true(config, key): if config is None: return False return bool(config.get(key)) -def is_auto(config, key): - if config is None: - return False - return config.get(key) == "auto" - - -def set_if_auto(config, key, val): +def _set_if_auto(config, key, val): if config is None: return if config.get(key) == "auto": @@ -346,10 +340,10 @@ def config_process(self, args): # DeepSpeed does: # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps - set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) - set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) - set_if_auto(config, "train_batch_size", train_batch_size) - set_if_auto(config, "gradient_clipping", args.max_grad_norm) + _set_if_auto(config, "train_micro_batch_size_per_gpu", args.per_device_train_batch_size) + _set_if_auto(config, "gradient_accumulation_steps", args.gradient_accumulation_steps) + _set_if_auto(config, "train_batch_size", train_batch_size) + _set_if_auto(config, "gradient_clipping", args.max_grad_norm) # zero config_zero = config.get("zero_optimization", {}) @@ -358,17 +352,17 @@ def config_process(self, args): config_optim = config.get("optimizer", {}) if config_optim != {}: config_optim_params = config_optim.get("params") - set_if_auto(config_optim_params, "lr", args.learning_rate) - set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) - set_if_auto(config_optim_params, "eps", args.adam_epsilon) - set_if_auto(config_optim_params, "weight_decay", args.weight_decay) + _set_if_auto(config_optim_params, "lr", args.learning_rate) + _set_if_auto(config_optim_params, "betas", [args.adam_beta1, args.adam_beta2]) + _set_if_auto(config_optim_params, "eps", args.adam_epsilon) + _set_if_auto(config_optim_params, "weight_decay", args.weight_decay) config_sched = config.get("scheduler", {}) if config_sched != {}: config_sched_params = config_sched.get("params") - set_if_auto(config_sched_params, "warmup_min_lr", 0) - set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) - set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) + _set_if_auto(config_sched_params, "warmup_min_lr", 0) + _set_if_auto(config_sched_params, "warmup_max_lr", args.learning_rate) + _set_if_auto(config_sched_params, "warmup_num_steps", args.warmup_steps) # total_num_steps - will get set in deepspeed_init # fp16 @@ -382,18 +376,18 @@ def config_process(self, args): config_fp16 = config.get("fp16") # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and # merged and a new release is made, delete the next line and uncomment the one after it - set_if_auto(config_fp16, "enabled", True) - # set_if_auto(config_fp16, "enabled", 
fp16_backend == "amp") + _set_if_auto(config_fp16, "enabled", True) + # _set_if_auto(config_fp16, "enabled", fp16_backend == "amp") # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any # ZeRO features, so probably best to be avoided. config_amp = config.get("amp") - set_if_auto(config_amp, "enabled", fp16_backend == "apex") - set_if_auto(config_amp, "opt_level", args.fp16_opt_level) + _set_if_auto(config_amp, "enabled", fp16_backend == "apex") + _set_if_auto(config_amp, "opt_level", args.fp16_opt_level) config_zero = config.get("zero_optimization", {}) if self.is_zero2(): - self.offload = is_true(config_zero, "cpu_offload") + self.offload = _is_true(config_zero, "cpu_offload") elif self.is_zero3(): offload_devices = ["cpu", "nvme"] if config_zero.get("offload_optimizer", {}).get("device") in offload_devices: @@ -415,14 +409,14 @@ def config_finalize(self, args, model, num_training_steps): if self.is_zero3(): # automatically assign the optimal config values based on model config hidden_size = model.config.hidden_size - set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) - set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) - set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) + _set_if_auto(config_zero, "reduce_bucket_size", hidden_size * hidden_size) + _set_if_auto(config_zero, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + _set_if_auto(config_zero, "stage3_param_persistence_threshold", 10 * hidden_size) # scheduler config_sched = config.get("scheduler", {}) config_sched_params = config_sched.get("params", {}) - set_if_auto(config_sched_params, "total_num_steps", num_training_steps) + _set_if_auto(config_sched_params, "total_num_steps", num_training_steps) # keep the config object global to be able to access it anywhere during TrainingArguments life-cycle diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 3416f8242fdc..5565bdb2eab4 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -397,7 +397,6 @@ def __init__( self._signature_columns = None - # XXX: can move this back to where it was # Mixed precision setup self.use_apex = False self.use_amp = False From 27bb4a4617b4fa25ceb515e685eeaf8fbf63d601 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 26 Apr 2021 10:22:44 -0700 Subject: [PATCH 11/12] leave this feature for 0.3.16 --- src/transformers/modeling_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index aa98a835efe7..ee0a6d4c489a 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1125,8 +1125,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # this immediately partitions the model across all gpus, to avoid the overhead in time # and memory copying it on CPU or each GPU first - # XXX: param_dict will be shortly replaced by deepspeed_config - with deepspeed.zero.Init(param_dict=deepspeed_config()): + # XXX: param_dict will be added in deepspeed==0.3.16 and probably replaced by deepspeed_config + # with deepspeed.zero.Init(param_dict=deepspeed_config()): + with deepspeed.zero.Init(): model = cls(config, *model_args, **model_kwargs) else: model = cls(config, *model_args, **model_kwargs) From a0657e58a1b58246a3a4aa0989101296971506ee Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 26 Apr 2021 10:23:35 
-0700 Subject: [PATCH 12/12] style --- src/transformers/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index ee0a6d4c489a..7b1f477af528 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -44,7 +44,7 @@ replace_return_docstrings, ) from .generation_utils import GenerationMixin -from .integrations import deepspeed_config, is_deepspeed_zero3_enabled +from .integrations import is_deepspeed_zero3_enabled from .utils import logging
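The renamed ``_set_if_auto`` helper is what drives the ``auto`` placeholders throughout the configuration: values the user leaves as ``"auto"`` are filled in from the ``TrainingArguments`` in ``config_process``, while the model- and schedule-dependent ones are resolved in ``config_finalize``. A standalone illustration with a made-up ZeRO-3 fragment and a hypothetical ``hidden_size`` of 1024; the helper body mirrors the one shown in the diff above:

.. code-block:: python

    # same body as the _set_if_auto helper above, repeated so the snippet runs standalone
    def _set_if_auto(config, key, val):
        if config is None:
            return
        if config.get(key) == "auto":
            config[key] = val


    hidden_size = 1024  # would normally be read from model.config.hidden_size

    zero_config = {
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": 1e6,  # an explicit value is left untouched
    }

    _set_if_auto(zero_config, "reduce_bucket_size", hidden_size * hidden_size)                  # -> 1048576
    _set_if_auto(zero_config, "stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size)   # -> 943718.4
    _set_if_auto(zero_config, "stage3_param_persistence_threshold", 10 * hidden_size)           # stays 1e6

    print(zero_config)

The three bucket/persistence values can only be computed once the model exists, and ``total_num_steps`` once the length of the schedule is known, which is why they are filled in by ``config_finalize`` rather than ``config_process``.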