From fb569cbedc05f44ce6f95c268821cb8930c10bca Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 26 Feb 2021 11:19:14 +0000 Subject: [PATCH 1/6] add optimizers and schedules to rtd --- deepspeed/out2 | 7 ------- docs/code-docs/source/index.rst | 13 +++++++++++++ docs/code-docs/source/schedules.rst | 24 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 7 deletions(-) delete mode 100644 deepspeed/out2 create mode 100755 docs/code-docs/source/schedules.rst diff --git a/deepspeed/out2 b/deepspeed/out2 deleted file mode 100644 index 15ca670dcb99..000000000000 --- a/deepspeed/out2 +++ /dev/null @@ -1,7 +0,0 @@ -============================= test session starts ============================== -platform linux -- Python 3.6.9, pytest-6.0.1, py-1.9.0, pluggy-0.13.1 -rootdir: /home/chengli1/projects/DeepSpeed -plugins: forked-1.3.0, hypothesis-5.41.3, xdist-2.1.0, cov-2.10.1 -collected 0 items - -============================ no tests ran in 0.01s ============================= diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index faf818c696b3..dabfebdc3291 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -41,6 +41,19 @@ Pipeline Parallelism pipeline +Optimizers +-------------------- +.. toctree:: + :maxdepth: 2 + + optimizers + +LR Schedules +-------------------- +.. toctree:: + :maxdepth: 2 + + schedules Indices and tables ------------------ diff --git a/docs/code-docs/source/schedules.rst b/docs/code-docs/source/schedules.rst new file mode 100755 index 000000000000..3665e76056cf --- /dev/null +++ b/docs/code-docs/source/schedules.rst @@ -0,0 +1,24 @@ +Learning Rate Schedulers +=================== + +DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedules. + + +LRRangeTest +--------------------------- +.. autoclass:: deepspeed.runtime.lr_schedules.LRRangeTest + + +OneCycle +--------------------------- +.. autoclass:: deepspeed.runtime.lr_schedules.OneCycle + + +WarmupLR +--------------------------- +.. autoclass:: deepspeed.runtime.lr_schedules.WarmupLR + + +WarmupDecayLR +--------------------------- +.. autoclass:: deepspeed.runtime.lr_schedules.WarmupDecayLR From 258d64a5121ec8d27c19d4d01e7b402189ba9f05 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 26 Feb 2021 11:39:35 +0000 Subject: [PATCH 2/6] update ds website and fix links --- docs/_pages/config-json.md | 20 +++++++++---------- docs/code-docs/source/index.rst | 4 ++-- docs/code-docs/source/optimizers.rst | 16 +++++++++++---- .../source/{schedules.rst => schedulers.rst} | 2 +- 4 files changed, 25 insertions(+), 17 deletions(-) rename docs/code-docs/source/{schedules.rst => schedulers.rst} (85%) diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 8d99627c03cd..1524a065d76f 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -32,10 +32,10 @@ title: "DeepSpeed Configuration JSON" ***optimizer***: [dictionary] -| Fields | Value | Example | -| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | -| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). 
| `"Adam"` | -| params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | +| Fields | Value | Example | +| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | +| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | +| params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | Example of ***optimizer*** with Adam @@ -83,10 +83,10 @@ The Adam optimizer also supports the following two params keys/values in additio ***scheduler***: [dictionary] -| Fields | Value | Example | -| ------ | ---------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | -| type | The scheduler name. See [here](https://deepspeed.readthedocs.io/en/latest/deepspeed.pt.html) for list of support schedulers. | `"WarmupLR"` | -| params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. | `{"warmup_min_lr": 0, "warmup_max_lr": 0.001}` | +| Fields | Value | Example | +| ------ | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | +| type | The scheduler name. See [here](https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. | `"WarmupLR"` | +| params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. | `{"warmup_min_lr": 0, "warmup_max_lr": 0.001}` | Example of ***scheduler*** @@ -163,8 +163,8 @@ Example of ***scheduler*** ***fp16:initial\_scale\_power***: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | ***initial\_scale\_power*** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2***initial\_scale\_power***. 
| `32` | ***fp16:loss\_scale\_window***: [integer] diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index dabfebdc3291..2740c7ea07ea 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -48,12 +48,12 @@ Optimizers optimizers -LR Schedules +Learning Rate Schedulers -------------------- .. toctree:: :maxdepth: 2 - schedules + schedulers Indices and tables ------------------ diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index 65f1ca2bf33f..89fc47ac547b 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -1,12 +1,20 @@ Optimizers =================== -DeepSpeed offers high-performance implementations of Adam and Lamb optimizers on CPU and GPU, respectively. +DeepSpeed offers high-performance implementations of ``Adam`` optimizer on CPU; ``FusedAdam``, ``FusedAdam``, ``OneBitAdam`` optimizers on GPU. -DeepSpeed CPU Adam +Adam (CPU) ---------------------------- .. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam -DeepSpeed Fused Lamb +FusedAdam (GPU) ---------------------------- -.. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam +.. autoclass:: deepspeed.ops.adam.FusedAdam + +FusedLamb (GPU) +---------------------------- +.. autoclass:: deepspeed.ops.lamb.FusedLamb + +OneBitAdam (GPU) +---------------------------- +.. autoclass:: deepspeed.runtime.fp16.OneBitAdam diff --git a/docs/code-docs/source/schedules.rst b/docs/code-docs/source/schedulers.rst similarity index 85% rename from docs/code-docs/source/schedules.rst rename to docs/code-docs/source/schedulers.rst index 3665e76056cf..6be3112164ef 100755 --- a/docs/code-docs/source/schedules.rst +++ b/docs/code-docs/source/schedulers.rst @@ -1,7 +1,7 @@ Learning Rate Schedulers =================== -DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedules. +DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedulers. LRRangeTest From d789bc118dd75735ccf1469129f268fc2bc654fe Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 26 Feb 2021 11:19:14 +0000 Subject: [PATCH 3/6] add optimizers and schedules to rtd --- docs/code-docs/source/index.rst | 13 +++++++++++++ docs/code-docs/source/schedules.rst | 24 ++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100755 docs/code-docs/source/schedules.rst diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index faf818c696b3..dabfebdc3291 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -41,6 +41,19 @@ Pipeline Parallelism pipeline +Optimizers +-------------------- +.. toctree:: + :maxdepth: 2 + + optimizers + +LR Schedules +-------------------- +.. toctree:: + :maxdepth: 2 + + schedules Indices and tables ------------------ diff --git a/docs/code-docs/source/schedules.rst b/docs/code-docs/source/schedules.rst new file mode 100755 index 000000000000..3665e76056cf --- /dev/null +++ b/docs/code-docs/source/schedules.rst @@ -0,0 +1,24 @@ +Learning Rate Schedulers +=================== + +DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedules. + + +LRRangeTest +--------------------------- +.. autoclass:: deepspeed.runtime.lr_schedules.LRRangeTest + + +OneCycle +--------------------------- +.. autoclass:: deepspeed.runtime.lr_schedules.OneCycle + + +WarmupLR +--------------------------- +.. 
autoclass:: deepspeed.runtime.lr_schedules.WarmupLR + + +WarmupDecayLR +--------------------------- +.. autoclass:: deepspeed.runtime.lr_schedules.WarmupDecayLR From 9de3ec020866b5d808585770050e68e07d9cf074 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 26 Feb 2021 11:39:35 +0000 Subject: [PATCH 4/6] update ds website and fix links --- docs/_pages/config-json.md | 20 +++++++++---------- docs/code-docs/source/index.rst | 4 ++-- docs/code-docs/source/optimizers.rst | 16 +++++++++++---- .../source/{schedules.rst => schedulers.rst} | 2 +- 4 files changed, 25 insertions(+), 17 deletions(-) rename docs/code-docs/source/{schedules.rst => schedulers.rst} (85%) diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 8d99627c03cd..1524a065d76f 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -32,10 +32,10 @@ title: "DeepSpeed Configuration JSON" ***optimizer***: [dictionary] -| Fields | Value | Example | -| ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | -| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | -| params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | +| Fields | Value | Example | +| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | +| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | +| params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | Example of ***optimizer*** with Adam @@ -83,10 +83,10 @@ The Adam optimizer also supports the following two params keys/values in additio ***scheduler***: [dictionary] -| Fields | Value | Example | -| ------ | ---------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | -| type | The scheduler name. See [here](https://deepspeed.readthedocs.io/en/latest/deepspeed.pt.html) for list of support schedulers. | `"WarmupLR"` | -| params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. | `{"warmup_min_lr": 0, "warmup_max_lr": 0.001}` | +| Fields | Value | Example | +| ------ | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | +| type | The scheduler name. 
See [here](https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. | `"WarmupLR"` | +| params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. | `{"warmup_min_lr": 0, "warmup_max_lr": 0.001}` | Example of ***scheduler*** @@ -163,8 +163,8 @@ Example of ***scheduler*** ***fp16:initial\_scale\_power***: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | ***initial\_scale\_power*** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2***initial\_scale\_power***. | `32` | ***fp16:loss\_scale\_window***: [integer] diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index dabfebdc3291..2740c7ea07ea 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -48,12 +48,12 @@ Optimizers optimizers -LR Schedules +Learning Rate Schedulers -------------------- .. toctree:: :maxdepth: 2 - schedules + schedulers Indices and tables ------------------ diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index 65f1ca2bf33f..89fc47ac547b 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -1,12 +1,20 @@ Optimizers =================== -DeepSpeed offers high-performance implementations of Adam and Lamb optimizers on CPU and GPU, respectively. +DeepSpeed offers high-performance implementations of ``Adam`` optimizer on CPU; ``FusedAdam``, ``FusedAdam``, ``OneBitAdam`` optimizers on GPU. -DeepSpeed CPU Adam +Adam (CPU) ---------------------------- .. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam -DeepSpeed Fused Lamb +FusedAdam (GPU) ---------------------------- -.. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam +.. autoclass:: deepspeed.ops.adam.FusedAdam + +FusedLamb (GPU) +---------------------------- +.. autoclass:: deepspeed.ops.lamb.FusedLamb + +OneBitAdam (GPU) +---------------------------- +.. autoclass:: deepspeed.runtime.fp16.OneBitAdam diff --git a/docs/code-docs/source/schedules.rst b/docs/code-docs/source/schedulers.rst similarity index 85% rename from docs/code-docs/source/schedules.rst rename to docs/code-docs/source/schedulers.rst index 3665e76056cf..6be3112164ef 100755 --- a/docs/code-docs/source/schedules.rst +++ b/docs/code-docs/source/schedulers.rst @@ -1,7 +1,7 @@ Learning Rate Schedulers =================== -DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedules. +DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedulers. 
LRRangeTest From c94048cd03090d3368da88587439c493ad22942f Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 26 Feb 2021 23:05:38 +0000 Subject: [PATCH 5/6] add flops profiler to rtd --- deepspeed/profiling/config.py | 6 --- .../profiling/flops_profiler/profiler.py | 54 +++++++++++++++++++ docs/_tutorials/flops-profiler.md | 2 + docs/code-docs/source/flops-profiler.rst | 16 ++++++ docs/code-docs/source/index.rst | 7 +++ 5 files changed, 79 insertions(+), 6 deletions(-) create mode 100644 docs/code-docs/source/flops-profiler.rst diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index 3302d616366a..0e389baba18b 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -9,9 +9,6 @@ class DeepSpeedFlopsProfilerConfig(object): def __init__(self, param_dict): - """ - docstring - """ super(DeepSpeedFlopsProfilerConfig, self).__init__() self.enabled = None @@ -27,9 +24,6 @@ def __init__(self, param_dict): self._initialize(flops_profiler_dict) def _initialize(self, flops_profiler_dict): - """ - docstring - """ self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT) diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index ca10d76c13f5..48ba9701ec24 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -12,6 +12,34 @@ class FlopsProfiler(object): """Measures the latency, number of estimated floating point operations and parameters of each module in a PyTorch model. The flops-profiler profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. It shows how latency, flops and parameters are spent in the model and which modules or layers could be the bottleneck. It also outputs the names of the top k modules in terms of aggregated latency, flops, and parameters at depth l with k and l specified by the user. The output profile is computed for each batch of input. + The DeepSpeed flops profiler can be used with the DeepSpeed runtime or as a standalone package. + When using DeepSpeed for model training, the flops profiler can be configured in the deepspeed_config file and no user code change is required. + + If using the profiler as a standalone package, one imports the flops_profiler package and use the APIs. + + Here is an example for usage in a typical training workflow: + + .. code-block:: python + + model = Model() + prof = FlopsProfiler(model) + + for step, batch in enumerate(data_loader): + if step == profile_step: + prof.start_profile() + + loss = model(batch) + + if step == profile_step: + flops = prof.get_total_flops(as_string=True) + params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=profile_step) + prof.end_profile() + + loss.backward() + optimizer.step() + + To profile a trained model in inference, use the `get_model_profile` API. Args: object (torch.nn.Module): The PyTorch model to profile. @@ -118,6 +146,9 @@ def get_total_flops(self, as_string=False): Args: as_string (bool, optional): whether to output the flops as string. Defaults to False. + + Returns: + The number of multiply-accumulate operations of the model forward pass. 
""" total_flops = get_module_flops(self.model) return macs_to_string(total_flops) if as_string else total_flops @@ -127,6 +158,9 @@ def get_total_duration(self, as_string=False): Args: as_string (bool, optional): whether to output the duration as string. Defaults to False. + + Returns: + The latency of the model forward pass. """ total_duration = self.model.__duration__ return duration_to_string(total_duration) if as_string else total_duration @@ -136,6 +170,9 @@ def get_total_params(self, as_string=False): Args: as_string (bool, optional): whether to output the parameters as string. Defaults to False. + + Returns: + The number of parameters in the model. """ return params_to_string( self.model.__params__) if as_string else self.model.__params__ @@ -146,6 +183,12 @@ def print_model_profile(self, top_modules=3, detailed=True): """Prints the model graph with the measured profile attached to each module. + + Args: + profile_step (int, optional): The global training step at which to profile. Note that warm up steps are needed for accurate time measurement. + module_depth (int, optional): The depth of the model at which to print the aggregated module information. When set to -1, it prints information on the innermost modules (with the maximum depth). + top_modules (int, optional): Limits the aggregated profile output to the number of top modules specified. + detailed (bool, optional): Whether to print the detailed model profile. """ total_flops = self.get_total_flops() @@ -749,6 +792,14 @@ def get_model_profile( ): """Returns the total MACs and parameters of a model. + Example: + + .. code-block:: python + + model = torchvision.models.alexnet() + batch_size = 256 + macs, params = get_model_profile(model=model, input_res= (batch_size, 3, 224, 224))) + Args: model ([torch.nn.Module]): the PyTorch model to be profiled. input_res (list): input shape or input to the input_constructor @@ -760,6 +811,9 @@ def get_model_profile( warm_up (int, optional): the number of warm-up steps before measuring the latency of each module. Defaults to 1. as_string (bool, optional): whether to print the output as string. Defaults to True. ignore_modules ([type], optional): the list of modules to ignore during profiling. Defaults to None. + + Returns: + The number of multiply-accumulate operations (MACs) and parameters in the model. """ assert type(input_res) is tuple assert len(input_res) >= 1 diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index 00bc07c0ef5f..3ccd8a45929f 100644 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -88,6 +88,8 @@ The DeepSpeed flops profiler can be used with the DeepSpeed runtime or as a stan - [Example: Bert](#example-bert) - [In Model Training Workflow](#in-model-training-workflow) - [Example Training Workflow](#example-training-workflow) + + ### Usage With the DeepSpeed Runtime When using DeepSpeed for model training, the flops profiler can be configured in the `deepspeed_config` file. No explict API calls are needed to use the profiler. Refer to [flops profiler](https://www.deepspeed.ai/docs/config-json/#flops-profiler) for details. 
diff --git a/docs/code-docs/source/flops-profiler.rst b/docs/code-docs/source/flops-profiler.rst new file mode 100644 index 000000000000..be83015cb41c --- /dev/null +++ b/docs/code-docs/source/flops-profiler.rst @@ -0,0 +1,16 @@ +Flops Profiler + +============== + +The flops profiler in DeepSpeed profiles the forward pass of a model and measures its parameters, latency, and floating point operations. The DeepSpeed flops profiler can be used with the DeepSpeed runtime or as a standalone package. + +When using DeepSpeed for model training, the flops profiler can be configured in the deepspeed_config file without user code changes. To use the flops profiler outside of the DeepSpeed runtime, one can simply install DeepSpeed and import the flops_profiler package to use the APIs directly. + +Please see the `Flops Profiler tutorial `_ for usage details. + +Flops Profiler +--------------------------------------------------- + +.. automodule:: deepspeed.profiling.flops_profiler.profiler + :members: + :show-inheritance: diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index 2740c7ea07ea..c9f6edc93a27 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -55,6 +55,13 @@ Learning Rate Schedulers schedulers +Flops Profiler +-------------------- +.. toctree:: + :maxdepth: 2 + + flops-profiler + Indices and tables ------------------ From 11b7108e963afb6d96dbc74105dd2c29656d0b02 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Tue, 9 Mar 2021 23:00:03 +0000 Subject: [PATCH 6/6] fix --- deepspeed/profiling/flops_profiler/profiler.py | 2 +- docs/code-docs/source/cpu-adam.rst | 5 ----- docs/code-docs/source/index.rst | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 docs/code-docs/source/cpu-adam.rst diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index 48ba9701ec24..7e225fc20f2b 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -262,7 +262,7 @@ def del_extra_repr(module): "\n------------------------------ Detailed Profile ------------------------------" ) print( - "Each module profile is listed after its name in the follwing order: \nnumber of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency)." + "Each module profile is listed after its name in the following order: \nnumber of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency)." ) print( "Note: \n1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'.\n2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throught.\n" diff --git a/docs/code-docs/source/cpu-adam.rst b/docs/code-docs/source/cpu-adam.rst deleted file mode 100644 index 0b25f0e25e29..000000000000 --- a/docs/code-docs/source/cpu-adam.rst +++ /dev/null @@ -1,5 +0,0 @@ -DeepSpeedCPUAdam -################ - -.. 
autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam - :members: diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index 443b8f2441cd..f7940668012b 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -33,7 +33,6 @@ ZeRO API :maxdepth: 2 zero3 - cpu-adam
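
The pages touched in this series (optimizers, learning rate schedulers, flops profiler) are all driven by the same DeepSpeed configuration file. A minimal closing sketch, assembling the example values quoted in the config-json tables above; the `train_batch_size` value is illustrative, while the `optimizer` and `scheduler` entries reuse the table examples verbatim.

```python
# Sketch: combine the optimizer and scheduler examples from the config-json
# tables above into a single DeepSpeed configuration.
import json

ds_config = {
    "train_batch_size": 256,  # required DeepSpeed field; value is illustrative
    "optimizer": {
        "type": "Adam",                        # natively supported optimizer
        "params": {"lr": 0.001, "eps": 1e-8},  # example params from the optimizer table
    },
    "scheduler": {
        "type": "WarmupLR",                    # one of the documented LR schedulers
        "params": {"warmup_min_lr": 0, "warmup_max_lr": 0.001},
    },
}

print(json.dumps(ds_config, indent=2))  # or write to ds_config.json for the launcher
```
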