diff --git a/DeepSpeedExamples b/DeepSpeedExamples
new file mode 160000
index 000000000000..78d69cb2f89a
--- /dev/null
+++ b/DeepSpeedExamples
@@ -0,0 +1 @@
+Subproject commit 78d69cb2f89a27b1e9b072df8c3e47d00c024fdc
diff --git a/README.md b/README.md
index c7bde12dd0ea..eed70fa5e04f 100755
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/)
[![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)
[![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
-[![Docker Pulls](https://img.shields.io/docker/pulls/deepspeed/deepspeed)](https://hub.docker.com/r/deepspeed/deepspeed)
+[![Downloads](https://pepy.tech/badge/deepspeed/month)](https://pepy.tech/project/deepspeed)
### 03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher)
@@ -17,7 +17,7 @@ DeepSpeed delivers extreme-scale model training for everyone, from data scientis
* Extreme scale: Using the current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters.
* Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of the art, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models.
* Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution compared with dense transformers.
-* Extremely communication efficient: 3D parallelism improves communication efficiency, allowing users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks.
+* Extremely communication efficient: 3D parallelism improves communication efficiency, allowing users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam/1-bit LAMB reduce communication volume by up to 5x while achieving similar convergence efficiency to Adam/LAMB, allowing for scaling to different types of GPU clusters and networks.
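As an illustration of the communication-efficient optimizers referenced in the bullet above, here is a minimal configuration sketch that selects 1-bit Adam. The hyperparameter values are placeholders (not settings taken from this PR); the config keys follow the 1-bit Adam tutorial linked in the news section.

```python
# Illustrative sketch only: a DeepSpeed config dict selecting the 1-bit Adam optimizer.
# Values are placeholders, not tuned recommendations from this PR.
ds_config = {
    "train_batch_size": 4096,
    "fp16": {"enabled": True},
    "optimizer": {
        "type": "OneBitAdam",
        "params": {
            "lr": 4e-4,
            "freeze_step": 23000,         # uncompressed warmup steps before 1-bit compression starts
            "comm_backend_name": "nccl"   # NCCL-based implementation (1-bit Adam v2)
        }
    }
}
# In practice this dict is serialized to a JSON config file and passed to the DeepSpeed launcher.
```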
Early adopters of DeepSpeed have already produced a language model (LM) with over 17B parameters called @@ -33,6 +33,9 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) # News +* [2021/04/20] [1-bit LAMB: up to 4.6x less communication and 2.8x faster training, together with LAMB's convergence speed at large batch sizes](https://www.deepspeed.ai/tutorials/onebit-lamb/) +* [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) + * [Tutorial on how to use different stages of ZeRO](https://www.deepspeed.ai/tutorials/zero/) * [2021/04/01] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) @@ -41,10 +44,6 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) - * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html) - * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html) - * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html) - * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html) # Table of Contents @@ -121,7 +120,7 @@ overview](https://www.deepspeed.ai/features/) for descriptions and usage. * Memory- and compute-efficient sparse kernels * Support 10x longer sequences than dense * Flexible support to different sparse structures -* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) +* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) and [1-bit LAMB](https://www.deepspeed.ai/tutorials/onebit-lamb/) * Custom communication collective * Up to 5x communication volume saving * [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) @@ -193,6 +192,8 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. 
Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). +7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069). # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index d425dc3169ef..6726b895f12c 100644 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -672,6 +672,13 @@ int ds_adam_step_plus_copy(int optimizer_id, return 0; } +int destroy_adam_optimizer(int optimizer_id) +{ + s_optimizers.erase(optimizer_id); + + return 0; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); @@ -679,4 +686,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) &ds_adam_step_plus_copy, "DeepSpeed CPU Adam update and param copy (C++)"); m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); + m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); } diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp new file mode 100644 index 000000000000..11927969c50f --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -0,0 +1,333 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deepspeed_aio_common.h" + +using namespace std; +using namespace std::chrono; + +#define DEBUG_DS_AIO_PERF 0 +#define DEBUG_DS_AIO_SUBMIT_PERF 0 + +static const std::string c_library_name = "deepspeed_aio"; + +static void _report_aio_statistics(const char* tag, + const std::vector>& latencies) + __attribute__((unused)); + +static void _report_aio_statistics(const char* tag, + const std::vector>& latencies) +{ + std::vector lat_usec; + for (auto& lat : latencies) { lat_usec.push_back(lat.count() * 1e6); } + const auto min_lat = *(std::min_element(lat_usec.begin(), lat_usec.end())); + const auto max_lat = *(std::max_element(lat_usec.begin(), lat_usec.end())); + const auto avg_lat = std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); + + std::cout << c_library_name << ": latency statistics(usec) " << tag + << " min/max/avg = " << min_lat << " " << max_lat << " " << avg_lat << std::endl; +} + +static void _get_aio_latencies(std::vector>& raw_latencies, + struct deepspeed_aio_latency_t& summary_latencies) +{ + std::vector lat_usec; + for (auto& lat : raw_latencies) { lat_usec.push_back(lat.count() * 1e6); } + summary_latencies._min_usec = *(std::min_element(lat_usec.begin(), lat_usec.end())); + summary_latencies._max_usec = *(std::max_element(lat_usec.begin(), lat_usec.end())); + summary_latencies._avg_usec = + std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); +} + +static void _do_io_submit_singles(const long long int n_iocbs, + const long long int iocb_index, + std::unique_ptr& aio_ctxt, + std::vector>& submit_times) +{ + for (auto i = 0; i < n_iocbs; ++i) { + const auto st = std::chrono::high_resolution_clock::now(); + const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, 1, aio_ctxt->_iocbs.data() + i); + submit_times.push_back(std::chrono::high_resolution_clock::now() - st); +#if DEBUG_DS_AIO_SUBMIT_PERF + printf("submit(usec) %f io_index=%lld buf=%p len=%lu off=%llu \n", + submit_times.back().count() * 1e6, + iocb_index, + aio_ctxt->_iocbs[i]->u.c.buf, + aio_ctxt->_iocbs[i]->u.c.nbytes, + aio_ctxt->_iocbs[i]->u.c.offset); +#endif + assert(submit_ret > 0); + } +} + +static void _do_io_submit_block(const long long int n_iocbs, + const long long int iocb_index, + std::unique_ptr& aio_ctxt, + std::vector>& submit_times) +{ + const auto st = std::chrono::high_resolution_clock::now(); + const auto submit_ret = io_submit(aio_ctxt->_io_ctxt, n_iocbs, aio_ctxt->_iocbs.data()); + submit_times.push_back(std::chrono::high_resolution_clock::now() - st); +#if DEBUG_DS_AIO_SUBMIT_PERF + printf("submit(usec) %f io_index=%lld nr=%lld buf=%p len=%lu off=%llu \n", + submit_times.back().count() * 1e6, + iocb_index, + n_iocbs, + aio_ctxt->_iocbs[0]->u.c.buf, + aio_ctxt->_iocbs[0]->u.c.nbytes, + aio_ctxt->_iocbs[0]->u.c.offset); +#endif + assert(submit_ret > 0); +} + +static int _do_io_complete(const long long int min_completes, + const long long int max_completes, + std::unique_ptr& aio_ctxt, + std::vector>& reap_times) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + const auto n_completes = io_getevents( + aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr); + reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); + + assert(n_completes >= min_completes); + return 
n_completes; +} + +void do_aio_operation_sequential(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf) +{ + struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs); + + const auto num_io_blocks = static_cast( + ceil(static_cast(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size)); +#if DEBUG_DS_AIO_PERF + const auto io_op_name = std::string(read_op ? "read" : "write"); + std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes with " << num_io_blocks << " io blocks" << std::endl; +#endif + + std::vector> submit_times; + std::vector> reap_times; + const auto max_queue_bytes = + static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); + + auto start = std::chrono::high_resolution_clock::now(); + for (long long iocb_index = 0; iocb_index < num_io_blocks; + iocb_index += aio_ctxt->_queue_depth) { + const auto start_offset = iocb_index * aio_ctxt->_block_size; + const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset; + const auto n_iocbs = + min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); + const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset)); + prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset); + + if (config->_single_submit) { + _do_io_submit_singles(n_iocbs, iocb_index, aio_ctxt, submit_times); + } else { + _do_io_submit_block(n_iocbs, iocb_index, aio_ctxt, submit_times); + } + + _do_io_complete(n_iocbs, n_iocbs, aio_ctxt, reap_times); + } + const std::chrono::duration elapsed = std::chrono::high_resolution_clock::now() - start; + + if (perf) { + _get_aio_latencies(submit_times, perf->_submit); + _get_aio_latencies(reap_times, perf->_complete); + perf->_e2e_usec = elapsed.count() * 1e6; + perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); + } + +#if DEBUG_DS_AIO_PERF + _report_aio_statistics("submit", submit_times); + _report_aio_statistics("complete", reap_times); +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 + << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes " << std::endl; +#endif +} + +void do_aio_operation_overlap(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf) +{ + struct io_prep_generator io_gen(read_op, xfer_ctxt, aio_ctxt->_block_size); + +#if DEBUG_DS_AIO_PERF + const auto io_op_name = std::string(read_op ? 
"read" : "write"); + std::cout << c_library_name << ": start " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes with " << io_gen._num_io_blocks << " io blocks" << std::endl; +#endif + + std::vector> submit_times; + std::vector> reap_times; + + auto request_iocbs = aio_ctxt->_queue_depth; + auto n_pending_iocbs = 0; + const auto min_completes = 1; + auto start = std::chrono::high_resolution_clock::now(); + while (true) { + const auto n_iocbs = io_gen.prep_iocbs(request_iocbs - n_pending_iocbs, &aio_ctxt->_iocbs); + if (n_iocbs > 0) { + if (config->_single_submit) { + _do_io_submit_singles( + n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); + } else { + _do_io_submit_block( + n_iocbs, (io_gen._next_iocb_index - n_iocbs), aio_ctxt, submit_times); + } + } + + n_pending_iocbs += n_iocbs; + assert(n_pending_iocbs <= aio_ctxt->_queue_depth); + + if (n_pending_iocbs == 0) { break; } + + const auto n_complete = + _do_io_complete(min_completes, n_pending_iocbs, aio_ctxt, reap_times); + n_pending_iocbs -= n_complete; + } + + const std::chrono::duration elapsed = std::chrono::high_resolution_clock::now() - start; + + if (perf) { + _get_aio_latencies(submit_times, perf->_submit); + _get_aio_latencies(reap_times, perf->_complete); + perf->_e2e_usec = elapsed.count() * 1e6; + perf->_e2e_rate_GB = (xfer_ctxt->_num_bytes / elapsed.count() / 1e9); + } + +#if DEBUG_DS_AIO_PERF + _report_aio_statistics("submit", submit_times); + _report_aio_statistics("complete", reap_times); +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": runtime(usec) " << elapsed.count() * 1e6 + << " rate(GB/sec) = " << (xfer_ctxt->_num_bytes / elapsed.count() / 1e9) << std::endl; +#endif + +#if DEBUG_DS_AIO_PERF + std::cout << c_library_name << ": finish " << io_op_name << " " << xfer_ctxt->_num_bytes + << " bytes " << std::endl; +#endif +} + +void report_file_error(const char* filename, const std::string file_op, const int error_code) +{ + std::string err_msg = file_op + std::string(" failed on ") + std::string(filename) + + " error = " + std::to_string(error_code); + std::cerr << c_library_name << ": " << err_msg << std::endl; +} + +int open_file(const char* filename, const bool read_op) +{ + const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT); + const int mode = 0600; + const auto fd = open(filename, flags, mode); + if (fd == -1) { + const auto error_code = errno; + const auto error_msg = read_op ? 
" open for read " : " open for write "; + report_file_error(filename, error_msg, error_code); + return -1; + } + return fd; +} + +int regular_read(const char* filename, std::vector& buffer) +{ + long long int num_bytes; + const auto f_size = get_file_size(filename, num_bytes); + assert(f_size != -1); + buffer.resize(num_bytes); + const auto fd = open(filename, O_RDONLY, 0600); + assert(fd != -1); + long long int read_bytes = 0; + auto r = 0; + do { + const auto buffer_ptr = buffer.data() + read_bytes; + const auto bytes_to_read = num_bytes - read_bytes; + r = read(fd, buffer_ptr, bytes_to_read); + read_bytes += r; + } while (r > 0); + + if (read_bytes != num_bytes) { + std::cerr << "read error " + << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes + << std::endl; + } + assert(read_bytes == num_bytes); + close(fd); + return 0; +} + +static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes) +{ + std::vector regular_buffer; + const auto reg_ret = regular_read(filename, regular_buffer); + assert(0 == reg_ret); + std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes" + << std::endl; + + if (static_cast(regular_buffer.size()) != num_bytes) { return false; } + + return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size())); +} + +bool validate_aio_operation(const bool read_op, + const char* filename, + void* aio_buffer, + const long long int num_bytes) +{ + const auto msg_suffix = std::string("deepspeed_aio_") + + std::string(read_op ? "read()" : "write()") + + std::string("using read()"); + + if (false == _validate_buffer(filename, aio_buffer, num_bytes)) { + std::cout << "Fail: correctness of " << msg_suffix << std::endl; + return false; + } + + std::cout << "Pass: correctness of " << msg_suffix << std::endl; + return true; +} diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h new file mode 100644 index 000000000000..1f32fc8f794f --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_common.h @@ -0,0 +1,36 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include +#include + +using namespace std; + +void do_aio_operation_sequential(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf); + +void do_aio_operation_overlap(const bool read_op, + std::unique_ptr& aio_ctxt, + std::unique_ptr& xfer_ctxt, + deepspeed_aio_config_t* config, + deepspeed_aio_perf_t* perf); + +int open_file(const char* filename, const bool read_op); + +void report_file_error(const char* filename, const std::string file_op, const int error_code); + +int regular_read(const char* filename, std::vector& buffer); + +bool validate_aio_operation(const bool read_op, + const char* filename, + void* aio_buffer, + const long long int num_bytes); diff --git a/csrc/aio/common/deepspeed_aio_types.cpp b/csrc/aio/common/deepspeed_aio_types.cpp new file mode 100644 index 000000000000..5f717c3b5658 --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_types.cpp @@ -0,0 +1,74 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include + +#include "deepspeed_aio_utils.h" + +using namespace std; + +const int c_block_size = 128 * 1024; +const int c_io_queue_depth = 8; + +deepspeed_aio_config_t::deepspeed_aio_config_t() + : _block_size(c_block_size), + _queue_depth(c_io_queue_depth), + _single_submit(false), + _overlap_events(false), + _lock_memory(false) +{ +} + +deepspeed_aio_config_t::deepspeed_aio_config_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool lock_memory) + : _block_size(block_size), + _queue_depth(queue_depth), + _single_submit(single_submit), + _overlap_events(overlap_events), + _lock_memory(lock_memory) +{ +} + +void deepspeed_aio_latency_t::dump(const std::string tag) +{ + std::cout << tag << _min_usec << " " << _max_usec << " " << _avg_usec << " " << std::endl; +} + +void deepspeed_aio_latency_t::accumulate(const struct deepspeed_aio_latency_t& other) +{ + _min_usec += other._min_usec; + _max_usec += other._max_usec; + _avg_usec += other._avg_usec; +} + +void deepspeed_aio_latency_t::scale(const float scaler) +{ + _min_usec *= scaler; + _max_usec *= scaler; + _avg_usec *= scaler; +} + +aio_context::aio_context(const int block_size, const int queue_depth) +{ + _block_size = block_size; + _queue_depth = queue_depth; + for (auto i = 0; i < queue_depth; ++i) { + _iocbs.push_back((struct iocb*)calloc(1, sizeof(struct iocb))); + } + _io_events.resize(queue_depth); + io_queue_init(queue_depth, &_io_ctxt); +} + +aio_context::~aio_context() +{ + for (auto& iocb : _iocbs) { free(iocb); } + _io_events.resize(0); + io_queue_release(_io_ctxt); +} diff --git a/csrc/aio/common/deepspeed_aio_types.h b/csrc/aio/common/deepspeed_aio_types.h new file mode 100644 index 000000000000..5c5dcdf0b559 --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_types.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include + +#include +#include + +using namespace std; + +struct deepspeed_aio_latency_t { + double _min_usec; + double _max_usec; + double _avg_usec; + + void dump(const std::string tag); + void accumulate(const deepspeed_aio_latency_t&); + void scale(const float value); +}; + +struct deepspeed_aio_perf_t { + deepspeed_aio_latency_t _submit; + deepspeed_aio_latency_t _complete; + double _e2e_usec; + double _e2e_rate_GB; +}; + +struct deepspeed_aio_config_t { + const int _block_size; + const int _queue_depth; + const bool _single_submit; + const bool _overlap_events; + const bool _lock_memory; + + deepspeed_aio_config_t(); + deepspeed_aio_config_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool lock_memory); +}; + +struct aio_context { + io_context_t _io_ctxt; + std::vector _io_events; + std::vector _iocbs; + int _block_size; + int _queue_depth; + + aio_context(const int block_size, const int queue_depth); + ~aio_context(); +}; diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp new file mode 100644 index 000000000000..a3d89be5ad3e --- /dev/null +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -0,0 +1,123 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include + +#include "deepspeed_aio_utils.h" + +using namespace std; + +const int c_block_size = 128 * 1024; +const int c_io_queue_depth = 8; + +io_xfer_ctxt::io_xfer_ctxt(const int fd, + const long long int file_offset, + const long long int num_bytes, + const void* buffer) + : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) +{ +} + +io_prep_context::io_prep_context(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size, + const std::vector* iocbs) + : _read_op(read_op), _xfer_ctxt(xfer_ctxt), _block_size(block_size), _iocbs(iocbs) +{ +} + +void io_prep_context::prep_iocbs(const int n_iocbs, + const size_t num_bytes, + const void* start_buffer, + const long long int start_offset) +{ + assert(static_cast(n_iocbs) <= _iocbs->size()); + for (auto i = 0; i < n_iocbs; ++i) { + const auto shift = i * _block_size; + const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift; + const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift; + auto byte_count = _block_size; + if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; } + + if (_read_op) { + io_prep_pread(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); + } else { + io_prep_pwrite(_iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, byte_count, xfer_offset); + } + } +} + +io_prep_generator::io_prep_generator(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size) + : _read_op(read_op), + _xfer_ctxt(xfer_ctxt), + _block_size(block_size), + _remaining_bytes(xfer_ctxt->_num_bytes), + _next_iocb_index(0) +{ + _num_io_blocks = + static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); + _remaining_io_blocks = _num_io_blocks; +} + +int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* iocbs) +{ + if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) { + assert(static_cast(_remaining_bytes) == _remaining_io_blocks); + return 0; + } + + assert(static_cast(n_iocbs) <= iocbs->size()); + + auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); + for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { + const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); + const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; + const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); + + if (_read_op) { + io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); + } else { + io_prep_pwrite(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, xfer_offset); + } + _remaining_bytes -= num_bytes; + } + _remaining_io_blocks -= actual_n_iocbs; + + return actual_n_iocbs; +} + +int get_file_size(const char* filename, long long int& size) +{ + struct stat st; + if (stat(filename, &st) == -1) { return -1; } + size = st.st_size; + return 0; +} + +void* ds_page_aligned_alloc(const size_t size, const bool lock) +{ + void* ptr; + int retval; + + retval = posix_memalign(&ptr, (size_t)sysconf(_SC_PAGESIZE), size); + if (retval) { return nullptr; } + + if (lock == false) { return ptr; } + + auto mlock_ret = mlock(ptr, size); + if (mlock_ret != 0) { + auto mlock_error = errno; + printf("mlock failed with %d %s\n", mlock_error, strerror(mlock_error)); + + free(ptr); + return nullptr; + } + + return ptr; +} diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h new file mode 100644 index 000000000000..f37a95c5149a --- /dev/null +++ 
b/csrc/aio/common/deepspeed_aio_utils.h @@ -0,0 +1,77 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct io_xfer_ctxt { + const int _fd; + const long long int _base_offset; + const void* _mem_buffer; + const long long int _num_bytes; + + io_xfer_ctxt(const int fd, + const long long int file_offset, + const long long int num_bytes, + const void* buffer); +}; + +struct io_prep_context { + const bool _read_op; + const std::unique_ptr& _xfer_ctxt; + const size_t _block_size; + const std::vector* _iocbs; + + io_prep_context(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size, + const std::vector* iocbs); + + void prep_iocbs(const int n_iocbs, + const size_t num_bytes, + const void* start_buffer, + const long long int start_offset); +}; + +struct io_prep_generator { + const bool _read_op; + const std::unique_ptr& _xfer_ctxt; + const size_t _block_size; + + long long int _remaining_bytes; + long long int _num_io_blocks; + long long int _remaining_io_blocks; + long long int _next_iocb_index; + + io_prep_generator(const bool read_op, + const std::unique_ptr& xfer_ctxt, + const size_t block_size); + + int prep_iocbs(const int n_iocbs, std::vector* iocbs); +}; + +void* ds_page_aligned_alloc(const size_t size, const bool lock = false); + +int get_file_size(const char* filename, long long int& size); diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp new file mode 100644 index 000000000000..2c7509cb3ba0 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp @@ -0,0 +1,84 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_aio_thread.h" + +using namespace std; + +io_op_desc_t::io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int num_bytes, + const bool validate) + : _read_op(read_op), + _buffer(buffer), + _fd(fd), + _filename(filename), + _num_bytes(num_bytes), + _validate(validate) +{ + _cpu_buffer = _buffer.is_cuda() ? 
_buffer.to(torch::kCPU).pin_memory() : _buffer; + _contiguous_buffer = _cpu_buffer.contiguous(); +} + +char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void io_op_desc_t::fini() +{ + if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } +} + +deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config) + : _tid(tid), + _aio_config(aio_config), + _aio_ctxt(new aio_context(aio_config._block_size, aio_config._queue_depth)), + _time_to_exit(false) +{ +} + +deepspeed_aio_thread_t::~deepspeed_aio_thread_t() {} + +void deepspeed_aio_thread_t::run() +{ + while (true) { + std::shared_ptr next_io_op = nullptr; + + { + std::unique_lock lock(_work_sync._mutex); + _work_sync._cond_var.wait(lock, + [this] { return (!_work_queue.empty() || _time_to_exit); }); + if (!_work_queue.empty()) { + next_io_op = _work_queue.front(); + _work_queue.pop(); + } + } + + if (next_io_op) { + const auto base_offset = next_io_op->_num_bytes * _tid; + + std::unique_ptr xfer_ctxt(new io_xfer_ctxt( + next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr())); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap( + next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential( + next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + { + std::lock_guard lock(_complete_sync._mutex); + _complete_queue.push(next_io_op); + } + _complete_sync._cond_var.notify_one(); + } + + if (_time_to_exit) { break; } + } +} diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h new file mode 100644 index 000000000000..ee099dd2d16c --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_thread.h @@ -0,0 +1,57 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include +#include "deepspeed_py_aio.h" + +struct io_op_desc_t { + const bool _read_op; + torch::Tensor _buffer; + int _fd; + const std::string _filename; + const long long int _num_bytes; + torch::Tensor _cpu_buffer; + torch::Tensor _contiguous_buffer; + const bool _validate; + + io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const long long int num_bytes, + const bool validate); + + char* data_ptr() const; + void fini(); +}; + +struct thread_sync_t { + std::mutex _mutex; + std::condition_variable _cond_var; +}; + +struct deepspeed_aio_thread_t { + const int _tid; + deepspeed_aio_config_t& _aio_config; + + std::unique_ptr _aio_ctxt; + std::queue> _work_queue; + std::queue> _complete_queue; + + bool _time_to_exit; + + struct thread_sync_t _work_sync; + struct thread_sync_t _complete_sync; + + deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config); + + ~deepspeed_aio_thread_t(); + + void run(); +}; diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp new file mode 100644 index 000000000000..cc2895cc74b3 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -0,0 +1,121 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deepspeed_py_aio.h" + +using namespace std; +using namespace std::chrono; + +#define DEBUG_DS_AIO_READ 0 +#define DEBUG_DS_AIO_WRITE 0 + +static const std::string c_library_name = "deepspeed_aio"; + +int deepspeed_py_aio_write(const torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); + + if (config._overlap_events) { + do_aio_operation_overlap(false, aio_ctxt, xfer_ctxt, &config, nullptr); + } else { + do_aio_operation_sequential(false, aio_ctxt, xfer_ctxt, &config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +int deepspeed_py_aio_read(torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + + deepspeed_aio_config_t config(block_size, queue_depth, single_submit, overlap_events, false); + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); + + if (config._overlap_events) { + do_aio_operation_overlap(true, aio_ctxt, xfer_ctxt, &config, nullptr); + } else { + do_aio_operation_sequential(true, aio_ctxt, xfer_ctxt, &config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h new file mode 100644 index 000000000000..a78d5734009d --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio.h @@ -0,0 
+1,27 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include + +int deepspeed_py_aio_write(const torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate); + +int deepspeed_py_aio_read(torch::Tensor& buffer, + const char* filename, + const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const bool validate); diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp new file mode 100644 index 000000000000..8d4cfd9e8636 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -0,0 +1,282 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_py_aio_handle.h" + +using namespace std; + +static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } + +deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads) + : _aio_ctxt(new aio_context(block_size, queue_depth)), + _single_submit(single_submit), + _overlap_events(overlap_events), + _num_threads(num_threads), + _aio_config(block_size, queue_depth, single_submit, overlap_events, false), + _num_pending_ops(0) +{ + for (auto i = 0; i < num_threads; ++i) { + _thread_contexts.push_back(std::make_shared(i, _aio_config)); + } + + for (auto& ctxt : _thread_contexts) { + _threads.push_back(std::thread(_start_aio_thread, ctxt)); + } +} + +deepspeed_aio_handle_t::~deepspeed_aio_handle_t() +{ + _stop_threads(); + for (auto& thr : _threads) { thr.join(); } +} + +const int deepspeed_aio_handle_t::get_block_size() const +{ + return _aio_ctxt ? _aio_ctxt->_block_size : -1; +} + +const int deepspeed_aio_handle_t::get_queue_depth() const +{ + return _aio_ctxt ? 
_aio_ctxt->_queue_depth : -1; +} + +const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; } + +const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } + +const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } + +int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + + assert(_aio_ctxt); + + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + close(fd); + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, + const char* filename, + const bool validate) +{ + assert(_aio_ctxt); + + const auto start_time = std::chrono::high_resolution_clock::now(); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " + << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 + << std::endl; + return 0; +} + +void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) +{ + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_work_queue.push(scheduled_op); + } + ctxt->_work_sync._cond_var.notify_one(); + } + _num_pending_ops++; +} + +std::shared_ptr deepspeed_aio_handle_t::_wait_for_aio_work() +{ + std::shared_ptr completed_op = nullptr; + for (auto& ctxt : _thread_contexts) { + std::unique_lock lock(ctxt->_complete_sync._mutex); + ctxt->_complete_sync._cond_var.wait(lock, + [ctxt] { return !ctxt->_complete_queue.empty(); }); + completed_op = ctxt->_complete_queue.front(); + ctxt->_complete_queue.pop(); + } + return completed_op; +} + +void deepspeed_aio_handle_t::_stop_threads() +{ + assert(0 
== _num_pending_ops); + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_time_to_exit = true; + } + ctxt->_work_sync._cond_var.notify_one(); + } +} + +int deepspeed_aio_handle_t::wait() +{ + assert(_num_pending_ops > 0); + auto num_completed_ops = 0; + + while (_num_pending_ops > 0) { + auto completed_op = _wait_for_aio_work(); + + completed_op->fini(); + + close(completed_op->_fd); + + if (completed_op->_validate) { + validate_aio_operation(completed_op->_read_op, + completed_op->_filename.c_str(), + completed_op->data_ptr(), + completed_op->_num_bytes); + } + --_num_pending_ops; + ++num_completed_ops; + } + + return num_completed_ops; +} + +bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, + const long long int num_bytes) +{ + const auto op_string = read_op ? "Read" : "Write"; + if (num_bytes % get_thread_count()) { + std::cout << "deepseed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + << " not divisible by thread count = " << get_thread_count() << std::endl; + return false; + } + + return true; +} + +int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + long long num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + const auto buffer_bytes = static_cast(buffer.nbytes()); + if (buffer_bytes != num_file_bytes) { + std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes + << " != " << num_file_bytes << std::endl; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert((num_file_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto scheduled_op = std::make_shared( + true, buffer, fd, filename, (num_file_bytes / _num_threads), validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async) +{ + const auto num_write_bytes = static_cast(buffer.nbytes()); + assert((num_write_bytes % _num_threads) == 0); + + if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto scheduled_op = std::make_shared( + false, buffer, fd, filename, (num_write_bytes / _num_threads), validate); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, false); +} + +int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, false); +} + +int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename) +{ + return pread(buffer, filename, false, true); +} + +int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) +{ + return pwrite(buffer, filename, false, true); +} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h new file mode 100644 index 000000000000..09358f4d927b --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -0,0 +1,68 @@ 
+/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include "deepspeed_aio_thread.h" + +struct deepspeed_aio_handle_t { + std::unique_ptr _aio_ctxt; + const bool _single_submit; + const bool _overlap_events; + const int _num_threads; + deepspeed_aio_config_t _aio_config; + + std::vector> _thread_contexts; + std::vector _threads; + int _num_pending_ops; + + deepspeed_aio_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int num_threads); + + ~deepspeed_aio_handle_t(); + + const int get_block_size() const; + const int get_queue_depth() const; + const bool get_single_submit() const; + const bool get_overlap_events() const; + const int get_thread_count() const; + + int read(torch::Tensor& buffer, const char* filename, const bool validate); + + int write(const torch::Tensor& buffer, const char* filename, const bool validate); + + int pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async); + + int sync_pread(torch::Tensor& buffer, const char* filename); + + int sync_pwrite(const torch::Tensor& buffer, const char* filename); + + int async_pread(torch::Tensor& buffer, const char* filename); + + int async_pwrite(const torch::Tensor& buffer, const char* filename); + + int wait(); + + void _stop_threads(); + + void _schedule_aio_work(std::shared_ptr scheduled_op); + + std::shared_ptr _wait_for_aio_work(); + + bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); +}; diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp new file mode 100644 index 000000000000..3cdb5ed344bf --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -0,0 +1,133 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/ + +#include "deepspeed_py_copy.h" +#include + +#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) + +#if defined(__AVX512__) or defined(__AVX256__) +union AVX_Data { +#if defined(__AVX512__) + __m512 data; +#else + __m256 data; +#endif +}; +#endif + +static void helper_memcpy_1(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, SIMD_WIDTH); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH) { + AVX_Data src_4; + src_4.data = SIMD_LOAD(src + i); + + SIMD_STORE(dest + i, src_4.data); + } + } + +#endif + + if (param_size > rounded_size) { +#pragma omp parallel for + for (size_t k = rounded_size; k < param_size; k++) { dest[k] = src[k]; } + } +} + +static void helper_memcpy_4(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) { + AVX_Data src_4[4]; + src_4[0].data = SIMD_LOAD(src + i); + src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); + src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); + src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); + + SIMD_STORE(dest + i, src_4[0].data); + SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); + } + } +#endif + if (param_size > rounded_size) + helper_memcpy_1((dest + rounded_size), (src + rounded_size), (param_size - rounded_size)); +} + +static void helper_mempcy_8(float* dest, float* src, size_t param_size) +{ + size_t rounded_size = 0; + +#if defined(__AVX512__) or defined(__AVX256__) + + rounded_size = ROUND_DOWN(param_size, (SIMD_WIDTH << 2)); + + for (size_t t = 0; t < rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > rounded_size) copy_size = rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) { + AVX_Data src_4[8]; + src_4[0].data = SIMD_LOAD(src + i); + src_4[1].data = SIMD_LOAD(src + i + SIMD_WIDTH); + src_4[2].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 1)); + src_4[3].data = SIMD_LOAD(src + i + SIMD_WIDTH * 3); + src_4[4].data = SIMD_LOAD(src + i + (SIMD_WIDTH << 2)); + src_4[5].data = SIMD_LOAD(src + i + SIMD_WIDTH * 5); + src_4[6].data = SIMD_LOAD(src + i + SIMD_WIDTH * 6); + src_4[7].data = SIMD_LOAD(src + i + SIMD_WIDTH * 7); + + SIMD_STORE(dest + i, src_4[0].data); + SIMD_STORE(dest + i + SIMD_WIDTH, src_4[1].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 1), src_4[2].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 3, src_4[3].data); + SIMD_STORE(dest + i + (SIMD_WIDTH << 2), src_4[4].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 5, src_4[5].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 6, src_4[6].data); + SIMD_STORE(dest + i + SIMD_WIDTH * 7, src_4[7].data); + } + } +#endif + if (param_size > rounded_size) + helper_memcpy_4((dest + rounded_size), (src + rounded_size), (param_size - rounded_size)); +} + +int 
deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src) +{ + auto dest_c = dest.contiguous(); + auto src_c = src.contiguous(); + + float* dest_ptr = (float*)dest_c.data_ptr(); + float* src_ptr = (float*)src_c.data_ptr(); + + helper_mempcy_8(dest_ptr, src_ptr, dest_c.size(0)); + + return 0; +} diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h new file mode 100644 index 000000000000..823c298f6431 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_copy.h @@ -0,0 +1,40 @@ + +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include + +#include +#include +#include + +#define TILE (1024 * 1024 * 1024) + +#if defined(__AVX512__) +#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm512_loadu_ps(x) +#define SIMD_SET(x) _mm512_set1_ps(x) +#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm512_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm512_div_ps(x, y) +#define SIMD_WIDTH 16 +#else +#if defined(__AVX256__) +#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm256_loadu_ps(x) +#define SIMD_SET(x) _mm256_set1_ps(x) +#define SIMD_MUL(x, y) _mm256_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm256_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm256_div_ps(x, y) +#define SIMD_WIDTH 8 +#endif +#endif + +int deepspeed_py_memcpy(torch::Tensor& dest, const torch::Tensor& src); diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp new file mode 100755 index 000000000000..eee2cba0a962 --- /dev/null +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -0,0 +1,41 @@ +/* +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include "deepspeed_py_aio_handle.h" +#include "deepspeed_py_copy.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("aio_read", &deepspeed_py_aio_read, "DeepSpeed Asynchornous I/O Read"); + + m.def("aio_write", &deepspeed_py_aio_write, "DeepSpeed Asynchornous I/O Write"); + + m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); + + py::class_(m, "aio_handle") + .def(py::init()) + + .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) + .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) + .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) + .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) + .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) + + .def("read", &deepspeed_aio_handle_t::read) + .def("write", &deepspeed_aio_handle_t::write) + + .def("pread", &deepspeed_aio_handle_t::pread) + .def("pwrite", &deepspeed_aio_handle_t::pwrite) + + .def("sync_pread", &deepspeed_aio_handle_t::sync_pread) + .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite) + .def("async_pread", &deepspeed_aio_handle_t::async_pread) + .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) + + .def("wait", &deepspeed_aio_handle_t::wait); +} diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py new file mode 100755 index 000000000000..305cacc54e18 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -0,0 +1,144 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. 
+ +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import torch +import os +import time +from deepspeed.ops.aio import aio_read, aio_write +from multiprocessing import Pool, Barrier +from test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_basic(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log( + tid, + f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' + ) + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_basic_read(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, True) + return ctxt + + +def pre_basic_write(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, False) + return ctxt + + +def post_basic(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_basic_read(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + aio_read(ctxt['buffer'], + ctxt['file'], + args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_basic_write(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + aio_write(ctxt['buffer'], + ctxt['file'], + args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_basic_read + schedule['post'] = post_basic + schedule['main'] = main_basic_read + else: + schedule['pre'] = pre_basic_write + schedule['post'] = post_basic + schedule['main'] = main_basic_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, args.threads) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, args.threads) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_takslet(b): + global aio_barrier + aio_barrier = b + + +def aio_basic_multiprocessing(args, read_op): + b = Barrier(args.threads) + pool_params = [(args, p, read_op) for p in range(args.threads)] + with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) 
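For orientation, the following is a minimal standalone sketch of the `aio_handle` bindings that the next test file (`ds_aio_handle.py`) exercises, based on the constructor and methods exposed in `csrc/aio/py_lib/py_ds_aio.cpp`. The file path, tensor size, and tuning values are illustrative assumptions, not values from this PR.

```python
# Hedged usage sketch of deepspeed.ops.aio.aio_handle; path and sizes are assumptions.
import torch
from deepspeed.ops.aio import aio_handle

BLOCK_SIZE = 1024 * 1024   # bytes per AIO request block
QUEUE_DEPTH = 8            # max in-flight AIO requests
NUM_THREADS = 1            # buffer size must be divisible by this thread count

# aio_handle(block_size, queue_depth, single_submit, overlap_events, num_threads)
handle = aio_handle(BLOCK_SIZE, QUEUE_DEPTH, False, True, NUM_THREADS)

# Pinned CPU buffers, as in pre_handle() above.
src = torch.randint(0, 128, (64 * 1024 * 1024,), dtype=torch.uint8).pin_memory()
dst = torch.zeros_like(src).pin_memory()

swap_file = '/local_nvme/swap.tensor'           # hypothetical NVMe-backed path
assert handle.async_pwrite(src, swap_file) != -1
handle.wait()                                   # block until the queued write completes

assert handle.sync_pread(dst, swap_file) != -1  # read the file back into dst
assert torch.equal(src, dst)
```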
diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py new file mode 100755 index 000000000000..fe1459c89013 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -0,0 +1,176 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import torch +import os +import time +from deepspeed.ops.aio import aio_handle +from multiprocessing import Pool, Barrier +from test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_handle(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + if args.gpu: + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda') + else: + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log( + tid, + f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' + ) + + io_parallel = args.io_parallel if args.io_parallel else 1 + handle = aio_handle(args.block_size, + args.queue_depth, + args.single_submit, + args.overlap_events, + io_parallel) + task_log(tid, f'created deepspeed aio handle') + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['handle'] = handle + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_handle_read(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, True) + return ctxt + + +def pre_handle_write(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, False) + return ctxt + + +def post_handle(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_parallel_read(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + + start_time = time.time() + ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_parallel_write(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + start_time = time.time() + ret = handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_read(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + + start_time = time.time() + ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_write(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + start_time = time.time() + ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_handle_read + schedule['post'] = post_handle + schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read + else: + schedule['pre'] = pre_handle_write + schedule['post'] = post_handle + schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write + + return schedule + + 
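
Before the tasklet plumbing below, a minimal standalone sketch of the aio_handle binding that pre_handle and main_parallel_read exercise above. The constructor arguments and the pread(buffer, file, validate, async) call are exactly the ones this benchmark uses; the file path is hypothetical and the async_io op is assumed to build on the target machine.

# Hypothetical standalone read through the aio_handle binding; not part of the patch itself.
import os
import torch
from deepspeed.ops.aio import aio_handle

path = '/nvme/input.pt'                        # hypothetical file
handle = aio_handle(1024 * 1024,               # block_size
                    32,                        # queue_depth
                    False,                     # single_submit
                    True,                      # overlap_events
                    1)                         # io_parallel thread count
num_bytes = os.path.getsize(path)
buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory()

ret = handle.pread(buffer, path, False, True)  # validate=False, async=True
assert ret != -1
handle.wait()                                  # block until the submitted read completes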
+def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, args.threads) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, args.threads) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, args.threads) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_takslet(b): + global aio_barrier + aio_barrier = b + + +def aio_handle_multiprocessing(args, read_op): + b = Barrier(args.threads) + pool_params = [(args, p, read_op) for p in range(args.threads)] + with Pool(processes=args.threads, initializer=_init_takslet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/parse_aio_stats.py b/csrc/aio/py_test/parse_aio_stats.py new file mode 100755 index 000000000000..82adf85ea8a2 --- /dev/null +++ b/csrc/aio/py_test/parse_aio_stats.py @@ -0,0 +1,169 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +import argparse +import re + +RAW_RATE = 'raw_rate' +E2E_RATE = 'e2e_rate' +SUBMIT_LATENCY = 'submit_latency' +COMPLETE_LATENCY = 'complete_latency' +READ_SPEED = 'read_speed' +WRITE_SPEED = 'write_speed' + +TASK_READ_SPEED = 'task_read_speed' + +PERF_METRICS = [ + RAW_RATE, + E2E_RATE, + SUBMIT_LATENCY, + COMPLETE_LATENCY, + READ_SPEED, + WRITE_SPEED +] +METRIC_SEARCH = { + RAW_RATE: 'ds_raw_time', + E2E_RATE: 'ds_time', + SUBMIT_LATENCY: 'aggr: submit', + COMPLETE_LATENCY: 'aggr: complete', + READ_SPEED: 'E2E Read Speed', + WRITE_SPEED: 'E2E Write Speed' +} + +NUM_BYTES = (400 * 1024 * 1024) +NUM_GIGA_BYTES = (1024 * 1024 * 1024) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--logdir', + type=str, + required=True, + help='Folder of statistics logs') + + parser.add_argument( + '--metric', + type=str, + required=True, + help= + 'Performance metric to report: [raw_rate|e2e_rate|submit_latency|complete_latency]' + ) + + args = parser.parse_args() + print(f'args = {args}') + + return args + + +def extract_value(key, file): + INVALID_PREFIXES = ["ds"] + for p in INVALID_PREFIXES: + if key.startswith(p): + return key + try: + if key[0] in ['t', 'd', 'p']: + return int(key[1:]) + if key.startswith("bs"): + if key.endswith('K'): + v = key[2:].split('K') + return int(v[0]) * 1024 + elif key.endswith('M'): + v = key[2:].split('M') + return int(v[0]) * 1024 * 1024 + else: + return int(key[2:]) + except: + print(f"{file}: extract_value fails on {key}") + return None + + return key + + +def get_file_key(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + values = [extract_value(k, file) for k in fields] + return tuple(values) + + +def get_thread_count(file): + f, _ = os.path.splitext(file) + fields = f.split('_') + for key in fields: + if key[0] == 't': 
+ return int(key[1:])
+ return 1
+
+
+def get_metric(file, metric):
+ thread_count = get_thread_count(file)
+ num_giga_bytes = NUM_BYTES / NUM_GIGA_BYTES
+ with open(file) as f:
+ for line in f.readlines():
+ if line.startswith(METRIC_SEARCH[metric]):
+ if metric == RAW_RATE:
+ fields = line.split()
+ raw_time_sec = float(fields[2]) / 1e06
+ raw_rate = (thread_count * num_giga_bytes * 1.0) / raw_time_sec
+ return raw_rate
+ elif metric in [READ_SPEED, WRITE_SPEED]:
+ fields = line.split()
+ return float(fields[-2])
+ else:
+ fields = line.split('=')
+ return float(fields[-1])
+
+ return None
+
+
+def validate_args(args):
+ if not args.metric in PERF_METRICS:
+ print(f'{args.metric} is not a valid performance metric')
+ return False
+
+ if not os.path.isdir(args.logdir):
+ print(f'{args.logdir} folder does not exist')
+ return False
+
+ return True
+
+
+def get_results(log_files, metric):
+ results = {}
+ for f in log_files:
+ file_key = get_file_key(f)
+ value = get_metric(f, metric)
+ results[file_key] = value
+
+ return results
+
+
+def main():
+ print("Parsing aio statistics")
+ args = parse_arguments()
+
+ if not validate_args(args):
+ quit()
+
+ log_files = [
+ f for f in os.listdir(args.logdir)
+ if os.path.isfile(os.path.join(args.logdir,
+ f))
+ ]
+
+ log_files_path = [os.path.join(args.logdir, f) for f in log_files]
+ results = get_results(log_files_path, args.metric)
+ result_keys = list(results.keys())
+ sorted_keys = sorted(result_keys)
+ for k in sorted_keys:
+ print(f'{k} = {results[k]}')
+
+
+if __name__ == "__main__":
+ main()
diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh new file mode 100755 index 000000000000..8ef29d973d58 --- /dev/null +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -0,0 +1,59 @@
+#!/bin/bash
+if [[ $# -ne 2 ]]; then
+ echo "Usage: $0 <input file> <output log dir>"
+ exit 1
+fi
+
+INPUT_FILE=$1
+if [[ ! 
-f ${INPUT_FILE} ]]; then
+ echo "Input file not found: ${INPUT_FILE}"
+ exit 1
+fi
+
+LOG_DIR=$2
+RUN_SCRIPT=./test_ds_aio.py
+READ_OPT="--read_file ${INPUT_FILE}"
+
+if [[ -d ${LOG_DIR} ]]; then
+ rm -f ${LOG_DIR}/*
+else
+ mkdir -p ${LOG_DIR}
+fi
+
+DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' "
+SYNC="sync"
+
+for sub in single block; do
+ if [[ $sub == "single" ]]; then
+ sub_opt="--single_submit"
+ else
+ sub_opt=""
+ fi
+ for ov in overlap sequential; do
+ if [[ $ov == "overlap" ]]; then
+ ov_opt="--overlap_events"
+ else
+ ov_opt=""
+ fi
+ for t in 1 2 4 8; do
+ for p in 1 ; do
+ for d in 1 2 4 8 16 32; do
+ for bs in 128K 256K 512K 1M; do
+ SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
+ OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
+ LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+ cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+ echo ${DISABLE_CACHE}
+ echo ${cmd}
+ echo ${SYNC}
+
+ eval ${DISABLE_CACHE}
+ eval ${cmd}
+ eval ${SYNC}
+ sleep 2
+ done
+ done
+ done
+ done
+ done
+done
diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh new file mode 100755 index 000000000000..0360934b469e --- /dev/null +++ b/csrc/aio/py_test/run_write_sweep.sh @@ -0,0 +1,58 @@
+#!/bin/bash
+function prep_folder()
+{
+ folder=$1
+ if [[ -d ${folder} ]]; then
+ rm -f ${folder}/*
+ else
+ mkdir -p ${folder}
+ fi
+}
+
+if [[ $# -ne 3 ]]; then
+ echo "Usage: $0 <write size in MB> <write dir> <output log dir>"
+ exit 1
+fi
+
+SIZE="$1M"
+WRITE_DIR=$2
+LOG_DIR=$3
+
+OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt
+WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}"
+
+
+prep_folder ${WRITE_DIR}
+prep_folder ${LOG_DIR}
+
+RUN_SCRIPT=./test_ds_aio.py
+
+for sub in single block; do
+ if [[ $sub == "single" ]]; then
+ sub_opt="--single_submit"
+ else
+ sub_opt=""
+ fi
+ for ov in overlap sequential; do
+ if [[ $ov == "overlap" ]]; then
+ ov_opt="--overlap_events"
+ else
+ ov_opt=""
+ fi
+ for t in 1 2 4 8; do
+ for p in 1; do
+ for d in 1 2 4 8 16 32; do
+ for bs in 128K 256K 512K 1M; do
+ SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}"
+ OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}"
+ LOG="${LOG_DIR}/write_${SIZE}B_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt"
+ cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}"
+ echo ${cmd}
+ eval ${cmd}
+ sleep 2
+ done
+ done
+ done
+ done
+ done
+done
diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py new file mode 100755 index 000000000000..607a59f58e08 --- /dev/null +++ b/csrc/aio/py_test/test_ds_aio.py @@ -0,0 +1,120 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+Licensed under the MIT license.
+
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch +import argparse +import time +import sys +from multiprocessing import Pool +import multiprocessing as mp +from deepspeed.ops.aio import aio_read, aio_write, aio_handle +from ds_aio_basic import aio_basic_multiprocessing +from ds_aio_handle import aio_handle_multiprocessing + +GB_DIVISOR = 1024**3 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--read_file', type=str, default=None, help='Read file.') + + parser.add_argument('--write_file', type=str, default=None, help='Write file.') + + parser.add_argument('--write_size', + type=str, + default=None, + help='Number of bytes to write.') + + parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') + + parser.add_argument('--threads', + type=int, + default=1, + help='Thread parallelism count.') + + parser.add_argument( + '--single_submit', + action='store_true', + help= + 'Submit I/O requests in singles (default is submit queue_depth amount at once.).' + ) + + parser.add_argument('--overlap_events', + action='store_true', + help='Overlap I/O submission and completion requests.') + + parser.add_argument('--validate', + action='store_true', + help='Perform validation in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', + type=int, + default=1, + help='Count of operation repetitions') + + parser.add_argument('--io_parallel', + type=int, + default=None, + help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def refine_integer_value(value): + unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + + if value[-1] in list(unit_dict.keys()): + int_value = int(value[:-1]) * unit_dict[value[-1]] + return int_value + return int(value) + + +def refine_args(args): + if args.write_size and type(args.write_size) == str: + args.write_size = refine_integer_value(args.write_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + +def validate_args(args): + if args.read_file and not os.path.isfile(args.read_file): + print(f'args validation error: {args.read_file} not found') + return False + + return True + + +def main(): + print(f'Testing deepspeed_aio python frontend') + + args = parse_arguments() + refine_args(args) + if not validate_args(args): + quit() + + mp.set_start_method('spawn') + multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing + if args.read_file: + multiprocess_function(args, True) + + if args.write_file: + multiprocess_function(args, False) + + +if __name__ == "__main__": + main() diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py new file mode 100755 index 000000000000..689a0a503247 --- /dev/null +++ b/csrc/aio/py_test/test_ds_aio_utils.py @@ -0,0 +1,42 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import os + +GIGABYTE = 1024**3 +LOG_TIDS = [0] + + +def task_log(tid, msg): + if tid in LOG_TIDS: + print(f'tid {tid}: {msg}') + + +def task_barrier(barrier, num_parties): + assert barrier.parties == num_parties + barrier.wait() + assert barrier.broken == False + + +def report_results(args, read_op, pool_results): + #print(f'pool_results = {pool_results}') + io_string = 'Read' if read_op else 'Write' + if None in pool_results: + print(f'Failure in one of {args.threads} {io_string} processes') + return + + total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) + + task_latency_sec = max([sec for _, sec, _ in pool_results]) + task_speed_GB = total_bytes / task_latency_sec / GIGABYTE + print(f'Task {io_string} Latency = {task_latency_sec} sec') + print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') + + e2e_latency_sec = max([sec for sec, _, _ in pool_results]) + e2e_speed_GB = total_bytes / e2e_latency_sec / GIGABYTE + print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') + print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index 5fae35261f55..ed33b04b3e10 100755 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -20,7 +20,7 @@ } \ } -#define TILE (1024 * 1024 * 1024) +#define TILE (128 * 1024 * 1024) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp old mode 100644 new mode 100755 index 8e605d7558d5..6844fa563705 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -29,6 +29,7 @@ size_t get_workspace_size(int maxBatchSize, { size_t workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size); if (training) { + workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size); workSpacesize += ((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size), 2 * (size_t(maxBatchSize) * heads * seq_len * seq_len))); if (gelu_checkpoint) diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu index 209b64a90902..12048006266e 100644 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -35,13 +35,11 @@ the intermediate hyperbolic tangent, since there's no intrinsic that computes it directly. 
*/ -__global__ void gelu_kernel(const float* input, float* vals, int intermediate_size) +__global__ void gelu_kernel(const float* input, float* vals, int row_stride, int iterations) { int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float4* input_cast = reinterpret_cast(input); float4* vals_cast = reinterpret_cast(vals); @@ -60,14 +58,12 @@ __global__ void gelu_kernel(const float* input, float* vals, int intermediate_si } } -__global__ void gelu_kernel(const __half* input, __half* vals, int intermediate_size) +__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations) { #if __CUDA_ARCH__ >= 700 int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float2* input_cast = reinterpret_cast(input); float2* vals_cast = reinterpret_cast(vals); @@ -98,13 +94,12 @@ __global__ void gelu_kernel(const __half* input, __half* vals, int intermediate_ __global__ void fused_bias_gelu(const float* input, const float* bias, float* vals, - int intermediate_size) + int row_stride, + int iterations) { int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float4* input_cast = reinterpret_cast(input); float4* vals_cast = reinterpret_cast(vals); @@ -133,14 +128,13 @@ __global__ void fused_bias_gelu(const float* input, __global__ void fused_bias_gelu(const __half* input, const __half* bias, __half* vals, - int intermediate_size) + int row_stride, + int iterations) { #if __CUDA_ARCH__ >= 700 int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; const float2* input_cast = reinterpret_cast(input); float2* vals_cast = reinterpret_cast(vals); @@ -182,13 +176,12 @@ __global__ void fused_bias_gelu(const __half* input, __global__ void d_gelu_func(float* d_output, const float* gelu_input, const float* bias, - int intermediate_size) + int row_stride, + int iterations) { int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; float4* d_output_cast = reinterpret_cast(d_output); const float4* gelu_input_cast = reinterpret_cast(gelu_input); @@ -218,14 +211,13 @@ __global__ void d_gelu_func(float* d_output, __global__ void d_gelu_func(__half* d_output, const __half* gelu_input, const __half* bias, - int intermediate_size) + int row_stride, + int iterations) { #if __CUDA_ARCH__ >= 700 int row = blockIdx.x; int id = threadIdx.x; int loop_stride = blockDim.x; - int iterations = intermediate_size / blockDim.x / 4; - int row_stride = intermediate_size / 4; float2* d_output_cast = reinterpret_cast(d_output); const float2* gelu_input_cast = reinterpret_cast(gelu_input); @@ -282,11 +274,12 @@ void launch_bias_gelu(const T* input, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; - int threads = intermediate_size / iterations / 4; + int threads = (intermediate_size - 1) / (iterations * 4) + 1; dim3 block_dims(threads); dim3 grid_dims(batch_size); - fused_bias_gelu<<>>(input, bias, output, intermediate_size); + fused_bias_gelu<<>>( + input, bias, output, intermediate_size / 4, 
iterations); } template @@ -297,11 +290,12 @@ void launch_gelu(const T* input, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; - int threads = intermediate_size / iterations / 4; + int threads = (intermediate_size - 1) / (iterations * 4) + 1; dim3 block_dims(threads); dim3 grid_dims(batch_size); - gelu_kernel<<>>(input, output, intermediate_size); + gelu_kernel<<>>( + input, output, intermediate_size / 4, iterations); } template void launch_bias_gelu(const float*, const float*, float*, int, int, cudaStream_t); @@ -324,11 +318,12 @@ void launch_d_gelu(T* d_output, cudaStream_t stream) { int iterations = (intermediate_size + 1023) / 1024; - int threads = intermediate_size / iterations / 4; + int threads = (intermediate_size - 1) / (iterations * 4) + 1; dim3 block_dims(threads); dim3 grid_dims(batch_size); - d_gelu_func<<>>(d_output, input, bias, intermediate_size); + d_gelu_func<<>>( + d_output, input, bias, intermediate_size / 4, iterations); } template void launch_d_gelu(float*, const float*, const float*, int, int, cudaStream_t); diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 1da8869cc718..7dffc472a71c 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -95,7 +95,7 @@ def parse_args(args=None): parser.add_argument("--launcher", default=PDSH_LAUNCHER, type=str, - help="(optional) choose launcher backend for multi-node" + help="(optional) choose launcher backend for multi-node " "training. Options currently include PDSH, OpenMPI, MVAPICH.") parser.add_argument("--launcher_args", @@ -104,6 +104,11 @@ def parse_args(args=None): help="(optional) pass launcher specific arguments as a " "single quoted argument.") + parser.add_argument("--force_multi", + action="store_true", + help="Force multi-node launcher mode, helps in cases where user " + "wants to launch on single remote node.") + parser.add_argument("--detect_nvlink_pairs", action="store_true", help="(optional) autodetects nvlink pairs and remaps CUDA_VISIBLE_DEVICES along the " "fastest connections") @@ -308,10 +313,7 @@ def main(args=None): # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) - multi_node_exec = len(active_resources) > 1 - - if multi_node_exec and not shutil.which('pdsh'): - raise RuntimeError("pdsh is not installed, unable to proceed") + multi_node_exec = args.force_multi or len(active_resources) > 1 if not multi_node_exec: deepspeed_launch = [ diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 7977d232b1fa..35eeedb86b5d 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -85,6 +85,11 @@ def __init__(self, weight_decay, adamw_mode) + def __del__(self): + # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize + # is used multiple times in the same process (notebook or pytest worker) + self.ds_opt_adam.destroy_adam(self.opt_id) + def __setstate__(self, state): super(DeepSpeedCPUAdam, self).__setstate__(state) for group in self.param_groups: diff --git a/deepspeed/ops/aio/__init__.py b/deepspeed/ops/aio/__init__.py new file mode 100755 index 000000000000..50e6c9a3c988 --- /dev/null +++ b/deepspeed/ops/aio/__init__.py @@ -0,0 +1,6 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+''' + +from ..op_builder import AsyncIOBuilder diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md index 179a0b134756..e3343e7852fa 100644 --- a/deepspeed/profiling/flops_profiler/README.md +++ b/deepspeed/profiling/flops_profiler/README.md @@ -9,7 +9,7 @@ ## Overview -The DeepSpeed flops profiler profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. +This profiles the forward pass of a PyTorch model and prints the model graph with the measured profile attached to each module. It shows the parameters, latency, and number of floating point operations of the modules within the model to identify potential bottlenecks. It also outputs the names of the top `k` modules in terms of aggregated time, flops, and number of parameters at depth `l` with `k` and `l` specified by the user. The DeepSpeed flops profiler can be used with the DeepSpeed runtime or as a standalone package. diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 8a9785a9aedb..faa60f20efa3 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -24,7 +24,7 @@ from deepspeed.runtime.config import DeepSpeedConfig from deepspeed.utils import logger -from deepspeed.runtime.utils import move_to_device +from deepspeed.runtime.utils import move_to_device, see_memory_usage from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers # DeepSpeed Checkpointing Enabled or Disabled @@ -55,34 +55,6 @@ SYNCHRONIZE = False PROFILE_TIME = False - -def see_memory_usage(message, force=False): - # return - if not force: - return - # dist.barrier() - if dist.get_rank() == 0: - logger.info(message) - logger.info( - "Memory Allocated %s GigaBytes", - torch.cuda.memory_allocated() / (1024 * 1024 * 1024), - ) - logger.info( - "Max Memory Allocated %s GigaBytes", - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), - ) - logger.info( - "Cache Allocated %s GigaBytes", - torch.cuda.memory_cached() / (1024 * 1024 * 1024), - ) - logger.info( - "Max cache Allocated %s GigaBytes", - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), - ) - logger.info("") - #input("Press Any Key To Continue ..") - - # Default name for the model parallel rng tracker. _MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' transport_stream = None @@ -685,8 +657,7 @@ def backward(ctx, *grads): torch.autograd.backward(output_tensors, grad_tensors) - see_memory_usage("After backward checkpointing code before backward", - force=False) + see_memory_usage("After backward checkpointing code after backward", force=False) if PROFILE_TIME: timers('backward').stop() diff --git a/deepspeed/runtime/bs_schedules.py b/deepspeed/runtime/bs_schedules.py new file mode 100644 index 000000000000..e74009b742cd --- /dev/null +++ b/deepspeed/runtime/bs_schedules.py @@ -0,0 +1,83 @@ +import math +import numpy as np + + +class BatchSizeScheduler(object): + """Increase the batch size linearly from int(mb_size_per_gpu * min_batch_size_multiplier) to mb_size_per_gpu + over warmup_num_steps steps, and then fix at mb_size_per_gpu. 
+ + TODO: documentation + """ + + def __init__(self, + final_batch_size, + min_batch_size_multiplier: float = 0.01, + warmup_num_steps: int = 1000, + num_intervals=4, + last_batch_iteration: int = -1, + deepspeed=None): + + self.warmup_num_steps = warmup_num_steps + self.last_batch_iteration = last_batch_iteration + self.final_batch_size = final_batch_size + self.num_intervals = num_intervals + self.min_batch_size_multiplier = min_batch_size_multiplier + self.schedule = self._build_schedule() + self.current_batch_size = None + self.deepspeed = deepspeed + + def _build_schedule(self): + start = math.ceil(self.min_batch_size_multiplier * self.final_batch_size) + batch_sizes = np.linspace(start, self.final_batch_size, num=self.num_intervals, endpoint=True, retstep=False, + dtype=int, axis=0) + steps = np.linspace(0, self.warmup_num_steps, num=self.num_intervals, endpoint=True, retstep=False, dtype=int, + axis=0) + schedule = {step: batch_size for step, batch_size in zip(steps, batch_sizes)} + # deduplicate intervals with same batch size + prev_v = None + to_pop = [] + for k, v in schedule.items(): + if v == prev_v: + to_pop.append(k) + prev_v = v + for k in to_pop: + schedule.pop(k) + return schedule + + def get_current_batch_size(self): + i = None + iterator = sorted(self.schedule.keys(), reverse=True) + for i, v in enumerate(iterator): + if self.last_batch_iteration >= v: + break + else: + pass + current_batch_size = self.schedule[iterator[i]] + return current_batch_size + + def step(self, last_batch_iteration=None): + if last_batch_iteration is None: + last_batch_iteration = self.last_batch_iteration + 1 + self.last_batch_iteration = last_batch_iteration + self.current_batch_size = self.get_current_batch_size() + + def state_dict(self): + return {'last_batch_iteration': self.last_batch_iteration} + + def load_state_dict(self, sd): + self.last_batch_iteration = sd['last_batch_iteration'] + + +if __name__ == "__main__": + sched = BatchSizeScheduler( + final_batch_size=16, + num_intervals=8, + warmup_num_steps=10000 + ) + print(f'SCHEDULE: {sched.schedule}') + prev_bs = None + for i in range(sched.warmup_num_steps + 1): + sched.step() + if sched.current_batch_size != prev_bs: + print(i, sched.current_batch_size) + prev_bs = sched.current_batch_size diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index b94f7caf81c5..88f0086e9814 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -86,7 +86,7 @@ def compressed_allreduce(self, # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) - #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) + # recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) recvbuf_scale = [ torch.zeros(1, dtype=worker_scale.dtype, @@ -96,7 +96,9 @@ def compressed_allreduce(self, # communication phase 1 # gather_start = time.time() # Alltoall for sign - dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed), group=self.world_group) + dist.all_to_all_single(recvbuf_sign, + torch.stack(sign_list_packed), + group=self.world_group) # Allgather for scale dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group) @@ -155,7 +157,9 @@ def compressed_allreduce(self, ] # Communication Phase 2 - dist.all_gather(recvbuf_sign_server, server_sign_packed[0], group=self.world_group) + dist.all_gather(recvbuf_sign_server, + server_sign_packed[0], + group=self.world_group) 
dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group) cupy_server_sign_packed = None @@ -173,7 +177,7 @@ def compressed_allreduce(self, self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( self.compression_backend.cupy2torch( - cupy_recvbuf_scale_server)).flatten().data) + cupy_recvbuf_scale_server)).flatten().data) if original_size != worker_error_size: buffer_m = buffer_m[0:original_size] if len(original_shape) > 1: diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 11e1d4037c8e..3fa0b32a6032 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -9,7 +9,7 @@ from .constants import * from .fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, DELAYED_SHIFT, MIN_LOSS_SCALE -from .config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys +from .config_utils import get_scalar_param, dict_raise_error_on_duplicate_keys, ScientificNotationEncoder from .zero.config import DeepSpeedZeroConfig from .zero.constants import * from .activation_checkpointing.config import DeepSpeedActivationCheckpointingConfig @@ -24,17 +24,21 @@ from ..profiling.config import DeepSpeedFlopsProfilerConfig +from .swap_tensor.aio_config import get_aio_config + TENSOR_CORE_ALIGN_SIZE = 8 ADAM_OPTIMIZER = 'adam' ADAMW_OPTIMIZER = 'adamw' LAMB_OPTIMIZER = 'lamb' ONEBIT_ADAM_OPTIMIZER = 'onebitadam' +ONEBIT_LAMB_OPTIMIZER = 'onebitlamb' DEEPSPEED_OPTIMIZERS = [ ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, + ONEBIT_LAMB_OPTIMIZER, ] # extra optimizer parameters for adam/adamw @@ -656,6 +660,8 @@ def _initialize_params(self, param_dict): self.checkpoint_tag_validation_enabled = validation_mode != ValidationMode.IGNORE self.checkpoint_tag_validation_fail = validation_mode == ValidationMode.FAIL + self.aio_config = get_aio_config(param_dict) + def _batch_assertion(self): train_batch = self.train_batch_size @@ -744,6 +750,7 @@ def print(self, name): json.dumps(self._param_dict, sort_keys=True, indent=4, + cls=ScientificNotationEncoder, separators=(',', ':')))) diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 62782852a3d2..7a6f994e806a 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -6,7 +6,40 @@ Collection of DeepSpeed configuration utilities """ import json -from collections import Counter +import collections + + +# adapted from https://stackoverflow.com/a/50701137/9201239 +class ScientificNotationEncoder(json.JSONEncoder): + """ + This class overrides ``json.dumps`` default formatter. + + This version keeps everything as normal except formats numbers bigger than 1e3 using scientific notation. 
+ + Just pass ``cls=ScientificNotationEncoder`` to ``json.dumps`` to activate it + + """ + def iterencode(self, o, _one_shot=False, level=0): + indent = self.indent if self.indent is not None else 4 + prefix_close = " " * level * indent + level += 1 + prefix = " " * level * indent + if isinstance(o, bool): + return "true" if o else "false" + elif isinstance(o, float) or isinstance(o, int): + if o > 1e3: + return f"{o:e}" + else: + return f"{o}" + elif isinstance(o, collections.Mapping): + x = [ + f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, + v in o.items() + ] + return "{" + ', '.join(x) + f"\n{prefix_close}" + "}" + elif isinstance(o, collections.Sequence) and not isinstance(o, str): + return f"[{ f', '.join(map(self.iterencode, o)) }]" + return "\n, ".join(super().iterencode(o, _one_shot)) class DeepSpeedConfigObject(object): @@ -17,7 +50,12 @@ def repr(self): return self.__dict__ def __repr__(self): - return json.dumps(self.__dict__, sort_keys=True, indent=4) + return json.dumps( + self.__dict__, + sort_keys=True, + indent=4, + cls=ScientificNotationEncoder, + ) def get_scalar_param(param_dict, param_name, param_default_value): @@ -32,7 +70,7 @@ def dict_raise_error_on_duplicate_keys(ordered_pairs): """Reject duplicate keys.""" d = dict((k, v) for k, v in ordered_pairs) if len(d) != len(ordered_pairs): - counter = Counter([pair[0] for pair in ordered_pairs]) + counter = collections.Counter([pair[0] for pair in ordered_pairs]) keys = [key for key, value in counter.items() if value > 1] raise ValueError("Duplicate keys in DeepSpeed config: {}".format(keys)) return d diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index a5de063427a1..af8c8684060a 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -24,7 +24,7 @@ from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ - ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, \ + ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \ TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT from deepspeed.runtime.dataloader import DeepSpeedDataLoader @@ -341,14 +341,14 @@ def zero_reduce_scatter(self): def zero_overlap_comm(self): return self._config.zero_config.overlap_comm - def zero_cpu_offload(self): - return self._config.zero_config.cpu_offload + def zero_offload_optimizer(self): + return self._config.zero_config.offload_optimizer - def zero_cpu_offload_params(self): - return self._config.zero_config.cpu_offload_params + def zero_offload_param(self): + return self._config.zero_config.offload_param - def zero_cpu_offload_use_pin_memory(self): - return self._config.zero_config.cpu_offload_use_pin_memory + def zero_cpu_offload(self): + return self._config.zero_config.offload_optimizer is not None def zero_sub_group_size(self): return self._config.zero_config.sub_group_size @@ -437,6 +437,12 @@ def initial_dynamic_scale(self): def dynamic_loss_scale_args(self): return self._config.dynamic_loss_scale_args + def swap_tensor_config(self): + return self._config.swap_tensor_config + + def aio_config(self): + return self._config.aio_config + def _configure_lr_scheduler(self, client_lr_scheduler): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) @@ -548,7 +554,8 @@ def _do_sanity_check(self): assert 
self._is_supported_optimizer(self.optimizer_name()), \ '{} is not a supported DeepSpeed Optimizer'.format(self.optimizer_name()) - if self.optimizer_name() == LAMB_OPTIMIZER: + if self.optimizer_name() == LAMB_OPTIMIZER or self.optimizer_name( + ) == ONEBIT_LAMB_OPTIMIZER: assert self.dynamic_loss_scale(), \ 'DeepSpeed {} optimizer requires dynamic loss scaling'.format(self.optimizer_name()) @@ -595,8 +602,9 @@ def _configure_optimizer(self, client_optimizer, model_parameters): client_optimizer.param_groups[:] = [ pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0 ] - logger.info( - "Removing param_group that has no 'params'in the client Optimizer") + if self.global_rank == 0: + logger.info( + "Removing param_group that has no 'params' in the client Optimizer") basic_optimizer = client_optimizer if self.global_rank == 0: @@ -682,12 +690,19 @@ def _configure_basic_optimizer(self, model_parameters): from deepspeed.ops.lamb import FusedLamb optimizer = FusedLamb(model_parameters, **optimizer_parameters) elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER: - from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam + from deepspeed.runtime.fp16.onebit.adam import OnebitAdam optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): logger.warning( f'Currently the convergence of 1-bit Adam is only verified under FP16' ) + elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: + from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb + optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) + if not self.fp16_enabled(): + logger.warning( + f'Currently the convergence of 1-bit Lamb is only verified under FP16' + ) else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) @@ -704,6 +719,7 @@ def _configure_fp16_optimizer(self, optimizer): timers = self.timers if self.wall_clock_breakdown() else None optimizer = FP16_Optimizer( optimizer, + deepspeed=self, dynamic_loss_scale=True, initial_dynamic_scale=initial_dynamic_scale, dynamic_loss_args=dynamic_loss_args, @@ -717,6 +733,7 @@ def _configure_fp16_optimizer(self, optimizer): ranks=[0]) optimizer = FP16_Optimizer( optimizer, + deepspeed=self, static_loss_scale=self.loss_scale(), mpu=self.mpu, clip_grad=clip_grad, @@ -726,6 +743,7 @@ def _configure_fp16_optimizer(self, optimizer): ranks=[0]) optimizer = FP16_UnfusedOptimizer( optimizer, + deepspeed=self, static_loss_scale=self.loss_scale(), dynamic_loss_scale=self.dynamic_loss_scale(), dynamic_loss_args=dynamic_loss_args, @@ -742,7 +760,6 @@ def _configure_zero_optimizer(self, optimizer): timers = self.timers if self.wall_clock_breakdown() else None if zero_stage == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter(), 'Stage 1 only supports reduce scatter mode' optimizer = FP16_DeepSpeedZeroOptimizer_Stage1( optimizer, static_loss_scale=self.loss_scale(), @@ -794,14 +811,14 @@ def _configure_zero_optimizer(self, optimizer): dp_process_group=self.data_parallel_group, reduce_scatter=self.zero_reduce_scatter(), overlap_comm=self.zero_overlap_comm(), - cpu_offload_optimizer_state=self.zero_cpu_offload(), - cpu_offload_params=self.zero_cpu_offload_params(), - cpu_offload_use_pin_memory=self.zero_cpu_offload_use_pin_memory(), + offload_optimizer_config=self.zero_offload_optimizer(), + offload_param_config=self.zero_offload_param(), sub_group_size=self.zero_sub_group_size(), mpu=self.mpu, 
postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), - gradient_accumulation_steps=self.gradient_accumulation_steps()) + gradient_accumulation_steps=self.gradient_accumulation_steps(), + aio_config=self.aio_config()) else: raise NotImplementedError("ZeRO stage {} not implemented".format(zero_stage)) @@ -906,6 +923,13 @@ def forward(self, *inputs, **kwargs): if self.module.training and self.progressive_layer_drop: kwargs.update(self.progressive_layer_drop.get_state()) + if self.zero_optimization_partition_weights(): + # Enable automated discovery of external parameters by indicating that + # we are in a forward pass. + for module in self.module.modules(): + module._parameters._in_forward = True + pass + if self.wall_clock_breakdown(): self.timers('forward').start() @@ -913,11 +937,15 @@ def forward(self, *inputs, **kwargs): self.tput_timer.start() loss = self.module(*inputs, **kwargs) - # Reset the ZeRO-3 state if we are only doing forward-passes (ie evaluation). if self.zero_optimization_partition_weights(): + # Reset the ZeRO-3 state if we are only doing forward-passes (ie evaluation). if not torch._C.is_grad_enabled(): self.optimizer.param_coordinator.reset_step() + # Disable automated discovery of external parameters + for module in self.module.modules(): + module._parameters._in_forward = False + if self.wall_clock_breakdown(): self.timers('forward').stop() @@ -950,8 +978,8 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): # Communicate only at gradient accumulation boundaries elif self.is_gradient_accumulation_boundary(): - if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: - assert self.zero_reduce_scatter() + if self.zero_optimization_stage( + ) == ZERO_OPTIMIZATION_OPTIMIZER_STATES and self.zero_reduce_scatter(): self.optimizer.reduce_scatter_gradients( postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), @@ -1105,7 +1133,8 @@ def _take_model_step(self, lr_kwargs): else: if self.lr_scheduler is not None: self.lr_scheduler.step(**(lr_kwargs or {})) - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: self._report_progress(self.global_steps + 1) self.timers('_step_check_overflow').stop() @@ -1684,27 +1713,20 @@ def _save_checkpoint(self, save_dir, tag, client_state={}): # then instead just returns None. 
self._curr_ckpt_path = os.path.join(save_dir, tag) - state = { - 'module': - self.module_state_dict(), - 'optimizer': - self.optimizer.state_dict() - if self.optimizer and not self.zero_optimization() else None, - 'lr_scheduler': - self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, - 'csr_tensor_module_names': - self.csr_tensor_module_names, - 'skipped_steps': - self.skipped_steps, - 'global_steps': - self.global_steps, - 'global_samples': - self.global_samples, - 'dp_world_size': - self.dp_world_size, - 'mp_world_size': - self.mp_world_size - } + state = dict( + module=self.module_state_dict(), + optimizer=self.optimizer.state_dict() + if self.optimizer and not self.zero_optimization() else None, + lr_scheduler=self.lr_scheduler.state_dict() + if self.lr_scheduler is not None else None, + csr_tensor_module_names=self.csr_tensor_module_names, + skipped_steps=self.skipped_steps, + global_steps=self.global_steps, + global_samples=self.global_samples, + dp_world_size=self.dp_world_size, + mp_world_size=self.mp_world_size, + ) + state.update(client_state) log_dist(message=f'Saving model checkpoint: {save_path}', ranks=[0]) @@ -1745,6 +1767,8 @@ def _zero3_consolidated_fp16_state_dict(self): Get a full non-partitioned state_dict with fp16 weights on cpu. + Important: this function must be called on all ranks and not just rank 0. + This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but: 1. consolidates the weights from different partitions on gpu0 diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index e875baa9f11c..515ac6a2e80b 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -8,12 +8,10 @@ import torch import math from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -import torch.distributed as dist from deepspeed.runtime.utils import get_grad_norm, CheckOverflow, get_weight_norm from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger, log_dist -from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam class FP16_Optimizer(object): @@ -25,6 +23,7 @@ class FP16_Optimizer(object): def __init__(self, init_optimizer, + deepspeed=None, static_loss_scale=1.0, dynamic_loss_scale=False, initial_dynamic_scale=2 ** 32, @@ -103,7 +102,9 @@ def __init__(self, self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu) + self.overflow_checker = CheckOverflow(self.fp16_groups, + mpu=self.mpu, + deepspeed=deepspeed) self.initialize_optimizer_states() def initialize_optimizer_states(self): @@ -156,10 +157,10 @@ def step_fused_adam(self, closure=None): if self.overflow: if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) + logger.info("[deepspeed] OVERFLOW! Skipping step. 
Attempted loss " + "scale: {}, reducing to {}".format( + prev_scale, + self.cur_scale)) return self.overflow combined_scale = self.unscale_and_clip_grads(grads_groups_flat, norm_groups, @@ -215,18 +216,6 @@ def step(self, closure=None): self.overflow = self.overflow_checker.has_overflow(fp16_params) self.stop_timers([OVERFLOW_CHECK]) prev_scale = self.cur_scale - if isinstance(self.optimizer, OnebitAdam): - # if optimizer has mpu (i.e, is pipeline parallel), communicate the skipped step to all optimizers in group - if hasattr(self.optimizer.comm_backend_handle, - "mpu") and self.optimizer.comm_backend_handle.mpu is not None: - if self.overflow: - bool_tensor = torch.zeros(1).cuda() - else: - bool_tensor = torch.ones(1).cuda() - dist.all_reduce(bool_tensor, op=dist.ReduceOp.PRODUCT) - if not any(bool_tensor): - self.overflow = True - self._update_scale(self.overflow) if self.overflow: diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index e69de29bb2d1..35e35411cfde 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -0,0 +1,322 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +import types +import torch +import importlib +import numpy as np +import time +import torch.distributed as dist + +from deepspeed.utils.logging import logger + + +class OnebitAdam(torch.optim.Optimizer): + """Implements the 1-bit Adam algorithm. Currently GPU-only. + For usage example please see https://www.deepspeed.ai/tutorials/onebit-adam/ + For technical details please read https://arxiv.org/abs/2102.02888 + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + freeze_step (int, optional): Number of steps for warmup (uncompressed) + stage before we start using compressed communication. (default 100000) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in 1-bit Adam! + eps_inside_sqrt (boolean, optional): in the 'update parameters' step, + adds eps to the bias-corrected second moment estimate before + evaluating square root instead of adding it to the square root of + second moment estimate as in the original paper. (default: False) + cuda_aware (boolean, required): Set True if the underlying MPI implementation + supports CUDA-Aware communication. (default: False) + comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + def __init__(self, + params, + deepspeed=None, + lr=1e-3, + freeze_step=100000, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + eps_inside_sqrt=False, + weight_decay=0., + max_grad_norm=0., + amsgrad=False, + cuda_aware=False, + comm_backend_name='nccl'): + + if amsgrad: + raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') + + defaults = dict(lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm) + + super(OnebitAdam, self).__init__(params, defaults) + self.eps_mode = 0 if eps_inside_sqrt else 1 + assert (dist.is_initialized()) + + self.comm_time = 0.0 + self.step_time = 0.0 + self.ave_step = 1 + self.bk_time = 0.0 + + self.deepspeed = deepspeed + self.adam_freeze_key = False + self.initialize = False + self.freeze_step = freeze_step + self.cuda_aware = cuda_aware + self.using_pipeline = False + + self.comm_backend_name = comm_backend_name + + # Empty initializer. Set handle based on the comm backend as follows. + self.comm_backend_handle = None + + if self.comm_backend_name == 'nccl': + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert dist.is_initialized() == True, "Please initialize the torch distributed backend." + from deepspeed.runtime.comm.nccl import NcclBackend + self.using_pipeline = hasattr(self.deepspeed, + 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) + + elif self.comm_backend_name == 'mpi': + from deepspeed.runtime.comm.mpi import MpiBackend + self.comm_backend_handle = MpiBackend(cuda_aware) + + self.size = self.comm_backend_handle.size + + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + + def step(self, closure=None, grads=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + grads (list of tensors, optional): weight gradient to use for the + optimizer update. If gradients have type torch.half, parameters + are expected to be in type torch.float. (default: None) + output params (list of tensors, optional): A reduced recision copy + of the updated weights written out in addition to the regular + updated weights. Have to be of same type as gradients. (default: None) + scale (float, optional): factor to divide gradient tensor values + by before applying to weights. 
(default: 1) + """ + loss = None + if closure is not None: + loss = closure() + + gather_time = 0 + allgather_time = 0 + all_time = 0 + + if self.adam_freeze_key is False: + v_diff_buffer = 0.0 + + if grads is None: + grads_group = [None] * len(self.param_groups) + # backward compatibility + # assuming a list/generator of parameter means single group + elif isinstance(grads, types.GeneratorType): + grads_group = [grads] + elif type(grads[0]) != list: + grads_group = [grads] + else: + grads_group = grads + + for group, grads_this_group in zip(self.param_groups, grads_group): + if grads_this_group is None: + grads_this_group = [None] * len(group['params']) + + bias_correction = 1 if group['bias_correction'] else 0 + + for p, grad in zip(group['params'], grads_this_group): + if p.grad is None and grad is None: + continue + if grad is None: + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('1-bit Adam does not support sparse gradients') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + state['tensor_size'] = torch.numel(p.data) + state['corrected_tensor_size'] = state['tensor_size'] + + if state['tensor_size'] % (self.size * self.divider) != 0: + state['corrected_tensor_size'] += ((self.size * self.divider) - + (state['tensor_size'] % + (self.size * self.divider))) + state['server_chunk_size'] = state[ + 'corrected_tensor_size'] // self.size + + if not self.initialize or (self.adam_freeze_key + and 'worker_error' not in state.keys()): + torch.cuda.empty_cache() + state['worker_error'] = torch.zeros(state['corrected_tensor_size'], + device=p.device) + state['server_error'] = torch.zeros(state['server_chunk_size'], + device=p.device) + torch.cuda.empty_cache() + self.adam_freeze_key = True + if not self.initialize and torch.distributed.get_rank() == 0: + print("Cupy Buffers Initialized Successfully.") + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + if self.adam_freeze_key is False: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + grad = None + if self.initialize: + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + + else: + if 'non_freeze' in group.keys() and group['non_freeze'] is True: + dist.all_reduce(grad) + grad.mul_(1 / dist.get_world_size()) + exp_avg.mul_(beta1).add(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + grad = None + else: + if self.initialize is True: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + grad = None + + if self.size > 1: + exp_avg.set_( + self.comm_backend_handle.compressed_allreduce( + exp_avg, + state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) + # Because 1-bit compression cannot represent exact zero, it is required to + # provide a momentum mask for those params that have constant exact zeros in their + # momentums, otherwise the compression error would keep accumulating. + # For example, for BERT pre-training seq 128, bert.embeddings.position_embeddings.weight + # always have exact zeros in its momentum for row 129 to 512, because it only + # learns up to seq length 128 while the model supports up to 512 seq length. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) 
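
As an illustrative sketch only (module and tensor names are hypothetical), such a momentum mask is typically built once in the training script and attached to the relevant param group before the optimizer is constructed, zeroing the rows whose momentum must stay exactly zero:

import torch

# Hypothetical BERT-style case: a (512, 768) position embedding trained with max
# sequence length 128, so rows 128..511 never receive gradient and keep zero momentum.
pos_emb = torch.nn.Embedding(512, 768)
mask = torch.ones_like(pos_emb.weight)
mask[128:, :] = 0.0

param_groups = [{'params': [pos_emb.weight], 'exp_avg_mask': mask}]
# param_groups would then be handed to the optimizer / deepspeed.initialize as usual.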
+ if 'exp_avg_mask' in group: + if exp_avg.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to( + device=exp_avg.device) + exp_avg.mul_(group['exp_avg_mask']) + + if self.initialize: + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + + if self.initialize: + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + with torch.no_grad(): + p.add_(-group['lr'] * update) + + if not self.initialize: + print('Pop out errors', flush=True) + state.pop('worker_error') + state.pop('server_error') + + if not self.initialize: + self.adam_freeze_key = False + self.initialize = True + print( + f"Finished the initialization step at rank {torch.distributed.get_rank()}" + ) + return loss + + if self.adam_freeze_key is False: + if state['step'] >= self.freeze_step: + print('OnebitAdam - starting compressed communication') + self.adam_freeze_key = True + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False + + return loss + + def load_state_dict(self, state_dict): + """ + Overrides load_state_dict() to add special handling when loading checkpoints + """ + # Because at different stage exp_avg_mask may change (e.g., + # BERT pre-training seqlen 128 and 512 ), we don't use the exp_avg_mask + # in checkpoints but always use the one user provided in training script. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) + # Thus here we keep the exp_avg_mask unchanged when loading checkpoint + for i, group in enumerate(self.param_groups): + if 'exp_avg_mask' in group: + state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ + 'param_groups'][i]: + state_dict['param_groups'][i].pop('exp_avg_mask') + super().load_state_dict(state_dict) + if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: + if torch.distributed.get_rank() == 0: + print("Checkpoint loaded and OnebitAdam warmup stage starts/continues.") + if self.adam_freeze_key is True: + self.adam_freeze_key = False + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = True + else: + self.deepspeed.enable_backward_allreduce = True + else: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and OnebitAdam compression stage starts/continues." + ) + if self.adam_freeze_key is False: + self.adam_freeze_key = True + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False + # We reset the compression errors when loading checkpoints for 3 reasons: + # 1) The worker and server error at each GPU are distinct, so in current implementation + # only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. + # If we want to save them correctly we need O(num_gpu*model_size) memory in order to + # gather all the error, which is a very large memory requirement. It's possible to save + # them in a distributed way, but it will make the checkpoint saving/loading much more complicated. + # 2) Even if we are able to save the compression errors correctly, you need to have the + # exact same number of GPUs in order to load them correctly. + # 3) We verified on BERT pre-training that occasionally resetting the compression error + # at checkpoint loading does not affect the convergence. 
+ # However, please avoid frequent checkpoint loading which could break the error + # compensation mechanism thus affect the convergence. + for group in self.param_groups: + for p in group['params']: + if 'worker_error' in self.state[p]: + self.state[p].pop('worker_error') + if 'server_error' in self.state[p]: + self.state[p].pop('server_error') diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py new file mode 100644 index 000000000000..01c6cd878488 --- /dev/null +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -0,0 +1,471 @@ +''' +Copyright 2021 The Microsoft DeepSpeed Team +''' +import types +import torch +import numpy as np +import torch.distributed as dist +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + + +class OnebitLamb(torch.optim.Optimizer): + """Implements the 1-bit Lamb algorithm. Currently GPU-only. + For usage example please see https://www.deepspeed.ai/tutorials/onebit-lamb/ + For technical details please see our paper https://arxiv.org/abs/2104.06069. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + freeze_step (int, optional): Number of steps for warmup (uncompressed) + stage before we start using compressed communication. (default 100000) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0) + min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) NOT SUPPORTED in 1-bit Lamb! + eps_inside_sqrt (boolean, optional): in the 'update parameters' step, + adds eps to the bias-corrected second moment estimate before + evaluating square root instead of adding it to the square root of + second moment estimate as in the original paper. (default: False) + cuda_aware (boolean, required): Set True if the underlying MPI implementation + supports CUDA-Aware communication. (default: False) + comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') + coeff_beta (float, optional): coefficient used for computing + running averages of lamb coefficient (default: 0.9) note that you may want to + increase or decrease this beta depending on the freeze_step you choose, as + 1/(1 - coeff_beta) should be smaller than or equal to freeze_step + factor_max (float, optional): maximum value of scaling factor to the frozen lamb + coefficient during compression stage (default: 4.0) + factor_min (float, optional): minimum value of scaling factor to the frozen lamb + coefficient during compression stage (default: 0.5) + factor_threshold (float, optional): threshold of how much the scaling factor can + fluctuate between steps (default: 0.1) + .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + def __init__(self, + params, + deepspeed=None, + lr=1e-3, + freeze_step=100000, + bias_correction=True, + betas=(0.9, + 0.999), + eps=1e-8, + eps_inside_sqrt=False, + weight_decay=0., + max_grad_norm=0., + max_coeff=10.0, + min_coeff=0.01, + amsgrad=False, + cuda_aware=False, + comm_backend_name='nccl', + coeff_beta=0.9, + factor_max=4.0, + factor_min=0.5, + factor_threshold=0.1): + + if amsgrad: + raise RuntimeError('1-bit Lamb does not support the AMSGrad variant.') + + defaults = dict(lr=lr, + bias_correction=bias_correction, + betas=betas, + eps=eps, + weight_decay=weight_decay, + max_grad_norm=max_grad_norm, + max_coeff=max_coeff, + min_coeff=min_coeff) + + super(OnebitLamb, self).__init__(params, defaults) + self.eps_mode = 0 if eps_inside_sqrt else 1 + assert (dist.is_initialized()) + + self.deepspeed = deepspeed + self.lamb_freeze_key = False + self.initialize = False + self.freeze_step = freeze_step + self.cuda_aware = cuda_aware + self.coeff_beta = coeff_beta + self.factor_max = factor_max + self.factor_min = factor_min + self.factor_threshold = factor_threshold + self.using_pipeline = False + + self.comm_backend_name = comm_backend_name + + # Empty initializer. Set handle based on the comm backend as follows. + self.comm_backend_handle = None + + if self.comm_backend_name == 'nccl': + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert dist.is_initialized() == True, "Please initialize the torch distributed backend." + from deepspeed.runtime.comm.nccl import NcclBackend + self.using_pipeline = hasattr(self.deepspeed, + 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) + + elif self.comm_backend_name == 'mpi': + from deepspeed.runtime.comm.mpi import MpiBackend + self.comm_backend_handle = MpiBackend(cuda_aware) + + self.size = self.comm_backend_handle.size + + self.divider = int(self.size * 8 / np.gcd(self.size, 8)) + + self.exp_avg_flat = [] + self.dummy_exp_avg = {} + self.corrected_tensor_sizes = [] + self.server_chunk_sizes = [] + self.worker_errors = [] + self.server_errors = [] + + self.lamb_coeffs = [] + + def step(self, closure=None, grads=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + grads (list of tensors, optional): weight gradient to use for the + optimizer update. If gradients have type torch.half, parameters + are expected to be in type torch.float. 
(default: None) + """ + loss = None + if closure is not None: + loss = closure() + + if grads is None: + grads_group = [None] * len(self.param_groups) + # backward compatibility + # assuming a list/generator of parameter means single group + elif isinstance(grads, types.GeneratorType): + grads_group = [grads] + elif type(grads[0]) != list: + grads_group = [grads] + else: + grads_group = grads + + #remove the previous stats + del self.lamb_coeffs[:] + + if self.lamb_freeze_key: + exp_avg_last_step = [] + for group in self.param_groups: + exp_avg_last_step.append( + [self.state[p]['exp_avg'].detach().clone() for p in group['params']]) + if 'scaling_coeff' not in self.state[self.param_groups[0]['params'][0]]: + # Compute the scaling_coeff for each momentum at the end of warmup stage. + # This is used to reduce compression error during compression stage. + momentum_scales = [] + for group in self.param_groups: + momentum_scales.append([ + (torch.norm(self.state[p]['exp_avg']) / + np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() + for p in group['params'] + ]) + united_scale = sum([sum(x) for x in momentum_scales]) / sum( + [len(x) for x in momentum_scales]) + for i, group in enumerate(self.param_groups): + for j, p in enumerate(group['params']): + self.state[p][ + 'scaling_coeff'] = united_scale / momentum_scales[i][j] + + for group, grads_this_group in zip(self.param_groups, grads_group): + if grads_this_group is None: + grads_this_group = [None] * len(group['params']) + + bias_correction = 1 if group['bias_correction'] else 0 + + for p, grad in zip(group['params'], grads_this_group): + if p.grad is None and grad is None: + continue + if grad is None: + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('1-bit Lamb does not support sparse gradients') + + state = self.state[p] + + # State initialization + if len(state) == 0 or (len(state) == 1 + and 'scaling_coeff' in state.keys()): + state['step'] = 0 + state['lamb_coeff_freeze'] = 0.0 + state['last_factor'] = 1.0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + state['exp_avg_sq_fresh'] = torch.zeros_like(p.data) + + if not self.initialize: + self.lamb_freeze_key = True + + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + beta1, beta2 = group['betas'] + max_coeff = group['max_coeff'] + min_coeff = group['min_coeff'] + + state['step'] += 1 + + if self.lamb_freeze_key is False: + # warmup stage, baseline Lamb optimization + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + if state['step'] == self.freeze_step: + exp_avg_sq_fresh.data = exp_avg_sq.detach().clone() + grad = None + if self.initialize: + weight_norm = p.data.pow(2).sum().sqrt() + update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data + update_norm = update.pow(2).sum().sqrt() + lamb_coeff = 1.0 + if weight_norm != 0 and update_norm != 0: + lamb_coeff = (weight_norm / update_norm).item() + if lamb_coeff > max_coeff: + lamb_coeff = max_coeff + if lamb_coeff < min_coeff: + lamb_coeff = min_coeff + if lamb_coeff != 1.0: + state['lamb_coeff_freeze'] = self.coeff_beta * state[ + 'lamb_coeff_freeze'] + (1 - self.coeff_beta) * lamb_coeff + self.lamb_coeffs.append(lamb_coeff) + with torch.no_grad(): + p.add_(-group['lr'] * lamb_coeff * 
update) + else: + # compression stage, update each momentum locally, then + # communicate based on the compressed_allreduce below + if self.initialize: + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg.mul_(self.state[p]['scaling_coeff']) + grad = None + + # init fused momentum + if len(self.exp_avg_flat) == 0: + momentum_groups = [] + tensor_size = 0 + for group in self.param_groups: + for p in group['params']: + momentum_groups.append(self.state[p]['exp_avg']) + tensor_size += torch.numel(p.data) + corrected_tensor_size = tensor_size + if tensor_size % (self.size * self.divider) != 0: + difference = ((self.size * self.divider) - (tensor_size % + (self.size * self.divider))) + corrected_tensor_size += difference + self.dummy_exp_avg[0] = torch.zeros( + difference, + device=momentum_groups[0].data.device) + momentum_groups.append(self.dummy_exp_avg[0]) + self.corrected_tensor_sizes.append(corrected_tensor_size) + self.server_chunk_sizes.append(corrected_tensor_size // self.size) + + self.exp_avg_flat.append( + _flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) + updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], + momentum_groups) + for p, q in zip(momentum_groups, updated_params): + p.data = q.data + + if self.initialize and len(self.worker_errors) == 0: + torch.cuda.empty_cache() + for i in range(len(self.exp_avg_flat)): + self.worker_errors.append( + torch.zeros(self.corrected_tensor_sizes[i], + device=self.exp_avg_flat[i].device)) + self.server_errors.append( + torch.zeros(self.server_chunk_sizes[i], + device=self.exp_avg_flat[i].device)) + torch.cuda.empty_cache() + + if self.lamb_freeze_key: + if self.size > 1: + for i in range(len(self.exp_avg_flat)): + if not self.initialize: + torch.cuda.empty_cache() + self.worker_errors.append( + torch.zeros(self.corrected_tensor_sizes[i], + device=self.exp_avg_flat[i].device)) + self.server_errors.append( + torch.zeros(self.server_chunk_sizes[i], + device=self.exp_avg_flat[i].device)) + torch.cuda.empty_cache() + if torch.distributed.get_rank() == 0: + print("Cupy Buffers Initialized Successfully.") + + self.comm_backend_handle.compressed_allreduce( + self.exp_avg_flat[i], + self.worker_errors[0], + self.server_errors[0], + self.deepspeed.local_rank) + + if torch.distributed.get_rank() == 0: + print('Pop out errors', flush=True) + del self.worker_errors[:] + del self.server_errors[:] + else: + self.comm_backend_handle.compressed_allreduce( + self.exp_avg_flat[i], + self.worker_errors[i], + self.server_errors[i], + self.deepspeed.local_rank) + + if self.lamb_freeze_key and self.initialize: + for i, group in enumerate(self.param_groups): + bias_correction = 1 if group['bias_correction'] else 0 + + for j, p in enumerate(group['params']): + state = self.state[p] + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + beta1, beta2 = group['betas'] + exp_avg.div_(self.state[p]['scaling_coeff']) + # Because 1-bit compression cannot represent exact zero, it is required to + # provide a momentum mask for those params that have constant exact zeros in their + # momentums, otherwise the compression error would keep accumulating. + # For example, for BERT pre-training seq 128, bert.embeddings.position_embeddings.weight + # always have exact zeros in its momentum for row 129 to 512, because it only + # learns up to seq length 128 while the model supports up to 512 seq length. 
+ # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py about how + # to add this exp_avg_mask for BERT pre-training.) + if 'exp_avg_mask' in group: + if exp_avg.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to( + device=exp_avg.device) + exp_avg.mul_(group['exp_avg_mask']) + + grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / + (1 - beta1)) + exp_avg_sq_fresh.mul_(beta2).addcmul_(1 - beta2, + grad_reconstruct, + grad_reconstruct) + denom = exp_avg_sq.sqrt() + group['eps'] + update_prelim = exp_avg / denom + + if group['weight_decay'] > 0.0: + update = update_prelim + group['weight_decay'] * p.data + else: + update = update_prelim + + lamb_coeff = 1.0 + update_norm = update.pow(2).sum().sqrt() + denom_real = exp_avg_sq_fresh.sqrt() + group['eps'] + factor = (denom / denom_real).max().item() + if group['weight_decay'] > 0.0: + update_ratio = min(1.0, + (update_prelim.pow(2).sum().sqrt() / + update_norm).item()) + factor = factor * update_ratio + (1.0 - update_ratio) + if factor > self.factor_max: + factor = self.factor_max + if factor < self.factor_min: + factor = self.factor_min + if factor > state['last_factor'] * (1.0 + self.factor_threshold): + factor = state['last_factor'] * (1.0 + self.factor_threshold) + if factor < state['last_factor'] * (1.0 - self.factor_threshold): + factor = state['last_factor'] * (1.0 - self.factor_threshold) + state['last_factor'] = factor + lamb_coeff = state['lamb_coeff_freeze'] * factor + self.lamb_coeffs.append(lamb_coeff) + with torch.no_grad(): + p.add_(-group['lr'] * lamb_coeff * update) + del exp_avg_last_step[:] + exp_avg_last_step = None + + if not self.initialize: + self.lamb_freeze_key = False + self.initialize = True + print( + f"Finished the initialization step at rank {torch.distributed.get_rank()}" + ) + return loss + + if self.lamb_freeze_key is False: + if state['step'] >= self.freeze_step: + print('OnebitLamb - starting compressed communication') + self.lamb_freeze_key = True + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False + + return loss + + def load_state_dict(self, state_dict): + """ + Overrides load_state_dict() to add special handling when loading checkpoints + """ + # Because at different stage exp_avg_mask may change (e.g., + # BERT pre-training seqlen 128 and 512 ), we don't use the exp_avg_mask + # in checkpoints but always use the one user provided in training script. + # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) 
+ # Thus here we keep the exp_avg_mask unchanged when loading checkpoint + for i, group in enumerate(self.param_groups): + if 'exp_avg_mask' in group: + state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ + 'param_groups'][i]: + state_dict['param_groups'][i].pop('exp_avg_mask') + super().load_state_dict(state_dict) + # need to reset the fused momentum since loading states will break the linking + del self.exp_avg_flat[:] + self.dummy_exp_avg.clear() + del self.corrected_tensor_sizes[:] + del self.server_chunk_sizes[:] + if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: + if torch.distributed.get_rank() == 0: + print("Checkpoint loaded and OnebitLamb warmup stage starts/continues.") + if self.lamb_freeze_key is True: + self.lamb_freeze_key = False + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = True + else: + self.deepspeed.enable_backward_allreduce = True + for group in self.param_groups: + for p in group['params']: + self.state[p]['lamb_coeff_freeze'] = 0.0 + self.state[p]['last_factor'] = 1.0 + if 'scaling_coeff' in self.state[p]: + self.state[p].pop('scaling_coeff') + else: + if torch.distributed.get_rank() == 0: + print( + "Checkpoint loaded and OnebitLamb compression stage starts/continues." + ) + if self.lamb_freeze_key is False: + self.lamb_freeze_key = True + if self.using_pipeline: + self.deepspeed.pipeline_enable_backward_allreduce = False + else: + self.deepspeed.enable_backward_allreduce = False + # We reset the compression errors when loading checkpoints for 3 reasons: + # 1) The worker and server error at each GPU are distinct, so in current implementation + # only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. + # If we want to save them correctly we need O(num_gpu*model_size) memory in order to + # gather all the error, which is a very large memory requirement. It's possible to save + # them in a distributed way, but it will make the checkpoint saving/loading much more complicated. + # 2) Even if we are able to save the compression errors correctly, you need to have the + # exact same number of GPUs in order to load them correctly. + # 3) We verified on BERT pre-training that occasionally resetting the compression error + # at checkpoint loading does not affect the convergence. + # However, please avoid frequent checkpoint loading which could break the error + # compensation mechanism thus affect the convergence. + del self.worker_errors[:] + del self.server_errors[:] + + def get_lamb_coeffs(self): + return self.lamb_coeffs diff --git a/deepspeed/runtime/fp16/onebit/onebitadam.py b/deepspeed/runtime/fp16/onebit/onebitadam.py deleted file mode 100644 index 1902b46ab2e3..000000000000 --- a/deepspeed/runtime/fp16/onebit/onebitadam.py +++ /dev/null @@ -1,288 +0,0 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' -import types -import torch -import importlib -import numpy as np -import time -import torch.distributed as dist - -from deepspeed.utils.logging import logger - - -class OnebitAdam(torch.optim.Optimizer): - """Implements the 1-bit Adam algorithm. Currently GPU-only. - For usage example please see, https://www.deepspeed.ai/tutorials/onebit-adam/ - It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343) - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups. - lr (float, optional): learning rate. 
(default: 1e-3) - freeze_step (int, optional): Number of steps for warmup (uncompressed) - stage before we start using compressed communication. (default 100000) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square. (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability. (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) NOT SUPPORTED in 1-bit Adam! - eps_inside_sqrt (boolean, optional): in the 'update parameters' step, - adds eps to the bias-corrected second moment estimate before - evaluating square root instead of adding it to the square root of - second moment estimate as in the original paper. (default: False) - cuda_aware (boolean, required): Set True if the underlying MPI implementation - supports CUDA-Aware communication. (default: False) - comm_backend_name (string, optional): Set to 'mpi' if needed. (default: 'nccl') - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - def __init__(self, - params, - deepspeed=None, - lr=1e-3, - freeze_step=100000, - bias_correction=True, - betas=(0.9, - 0.999), - eps=1e-8, - eps_inside_sqrt=False, - weight_decay=0., - max_grad_norm=0., - amsgrad=False, - cuda_aware=False, - comm_backend_name='nccl'): - - if amsgrad: - raise RuntimeError('1-bit Adam does not support the AMSGrad variant.') - - defaults = dict(lr=lr, - bias_correction=bias_correction, - betas=betas, - eps=eps, - weight_decay=weight_decay, - max_grad_norm=max_grad_norm) - - super(OnebitAdam, self).__init__(params, defaults) - self.eps_mode = 0 if eps_inside_sqrt else 1 - assert (dist.is_initialized()) - - self.comm_time = 0.0 - self.step_time = 0.0 - self.ave_step = 1 - self.bk_time = 0.0 - - self.deepspeed = deepspeed - self.adam_freeze_key = False - self.initialize = False - self.freeze_step = freeze_step - self.cuda_aware = cuda_aware - - self.comm_backend_name = comm_backend_name - - # Empty initializer. Set handle based on the comm backend as follows. - self.comm_backend_handle = None - - if self.comm_backend_name == 'nccl': - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) - assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" - assert dist.is_initialized() == True, "Please initialize the torch distributed backend." - from deepspeed.runtime.comm.nccl import NcclBackend - self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) - - elif self.comm_backend_name == 'mpi': - from deepspeed.runtime.comm.mpi import MpiBackend - self.comm_backend_handle = MpiBackend(cuda_aware) - - self.size = self.comm_backend_handle.size - - self.divider = int(self.size * 8 / np.gcd(self.size, 8)) - - def step(self, closure=None, grads=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - grads (list of tensors, optional): weight gradient to use for the - optimizer update. 
If gradients have type torch.half, parameters - are expected to be in type torch.float. (default: None) - output params (list of tensors, optional): A reduced recision copy - of the updated weights written out in addition to the regular - updated weights. Have to be of same type as gradients. (default: None) - scale (float, optional): factor to divide gradient tensor values - by before applying to weights. (default: 1) - """ - loss = None - if closure is not None: - loss = closure() - - gather_time = 0 - allgather_time = 0 - all_time = 0 - - if self.adam_freeze_key is False: - v_diff_buffer = 0.0 - - if grads is None: - grads_group = [None] * len(self.param_groups) - # backward compatibility - # assuming a list/generator of parameter means single group - elif isinstance(grads, types.GeneratorType): - grads_group = [grads] - elif type(grads[0]) != list: - grads_group = [grads] - else: - grads_group = grads - - for group, grads_this_group in zip(self.param_groups, grads_group): - if grads_this_group is None: - grads_this_group = [None] * len(group['params']) - - bias_correction = 1 if group['bias_correction'] else 0 - - for p, grad in zip(group['params'], grads_this_group): - if p.grad is None and grad is None: - continue - if grad is None: - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError('1-bit Adam does not support sparse gradients') - - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) - - state['tensor_size'] = torch.numel(p.data) - state['corrected_tensor_size'] = state['tensor_size'] - - if state['tensor_size'] % (self.size * self.divider) != 0: - state['corrected_tensor_size'] += ((self.size * self.divider) - - (state['tensor_size'] % - (self.size * self.divider))) - state['server_chunk_size'] = state[ - 'corrected_tensor_size'] // self.size - - if not self.initialize or (self.adam_freeze_key - and 'worker_error' not in state.keys()): - torch.cuda.empty_cache() - state['worker_error'] = torch.zeros(state['corrected_tensor_size'], - device=p.device) - state['server_error'] = torch.zeros(state['server_chunk_size'], - device=p.device) - torch.cuda.empty_cache() - self.adam_freeze_key = True - if not self.initialize and torch.distributed.get_rank() == 0: - print("Cupy Buffers Initialized Successfully.") - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - if self.adam_freeze_key is False: - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - grad = None - if self.initialize: - update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) - - else: - if 'non_freeze' in group.keys() and group['non_freeze'] is True: - world_group = self.comm_backend_handle.world_group if hasattr(self.comm_backend_handle, - "world_group") else None - dist.all_reduce(grad, - group=world_group) - grad.mul_(1 / dist.get_world_size(group=world_group)) - exp_avg.mul_(beta1).add(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - grad = None - else: - if self.initialize is True: - exp_avg.mul_(beta1).add_(1 - beta1, grad) - grad = None - - if self.size > 1: - exp_avg.set_( - self.comm_backend_handle.compressed_allreduce( - exp_avg, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) - if 'exp_avg_mask' 
in group: - if exp_avg.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=exp_avg.device) - exp_avg.mul_(group['exp_avg_mask']) - - if self.initialize: - update = exp_avg / (exp_avg_sq.sqrt() + group['eps']) - - if self.initialize: - if group['weight_decay'] > 0.0: - update += group['weight_decay'] * p.data - with torch.no_grad(): - p.add_(-group['lr'] * update) - - if not self.initialize: - print('Pop out errors', flush=True) - state.pop('worker_error') - state.pop('server_error') - - if not self.initialize: - self.adam_freeze_key = False - self.initialize = True - print( - f"Finished the initialization step at rank {torch.distributed.get_rank()}" - ) - return loss - - if self.adam_freeze_key is False: - if state['step'] >= self.freeze_step: - print('Starting compressed communication') - self.adam_freeze_key = True - self.deepspeed.enable_backward_allreduce = False - self.deepspeed.pipeline_enable_backward_allreduce = False - - return loss - - def load_state_dict(self, state_dict): - """ - Overrides state_dict() to reset 1-bit Adam states when needed - """ - mask = {} - for i, group in enumerate(self.param_groups): - if 'exp_avg_mask' in group: - mask[i] = group['exp_avg_mask'] - super().load_state_dict(state_dict) - # Because at different stage exp_avg_mask may change (e.g., - # when loading seq 128 checkpoint for seq 512 pretraining), - # we don't load the exp_avg_mask from the checkpoint but always - # use the one provided in optimizer_grouped_parameters in deepspeed_train.py. - for k, v in mask.items(): - self.param_groups[k]['exp_avg_mask'] = v - if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: - if torch.distributed.get_rank() == 0: - print( - "Checkpoint loaded and warmup stage starts/continues, reset 1-bit Adam states." 
- ) - if self.adam_freeze_key is True: - self.adam_freeze_key = False - self.deepspeed.enable_backward_allreduce = True - self.deepspeed.pipeline_enable_backward_allreduce = True - - for group in self.param_groups: - for p in group['params']: - self.state[p].pop('worker_error') - self.state[p].pop('server_error') diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index c0cef6a56ba7..441dbd61ccb9 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -22,6 +22,7 @@ class FP16_UnfusedOptimizer(object): """ def __init__(self, init_optimizer, + deepspeed=None, static_loss_scale=1.0, dynamic_loss_scale=False, dynamic_loss_args=None, @@ -96,7 +97,9 @@ def __init__(self, self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu) + self.overflow_checker = CheckOverflow(self.fp16_groups, + mpu=self.mpu, + deepspeed=deepspeed) self.initialize_optimizer_states() diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 950f1d76ccbf..8969ddf069d8 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -42,12 +42,12 @@ def _tensor_bytes(tensor): return tensor.numel() * tensor.element_size() -def print_rank_0(message): +def print_rank_0(*message): if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: - print(message) + print(*message) else: - print(message) + print(*message) class PipelineEngine(DeepSpeedEngine): @@ -61,9 +61,14 @@ def __init__(self, *super_args, **super_kwargs): super().__init__(*super_args, **super_kwargs) assert isinstance(self.module, PipelineModule), "model must base PipelineModule" + assert self.zero_optimization_stage() < 2, "ZeRO-2 and ZeRO-3 are incompatible with pipeline parallelism" + # We schedule the all-reduces, so disable it in super().backward() self.enable_backward_allreduce = False + + # used to disable the pipeline all-reduce when used with 1-bit Adam/1-bit LAMB self.pipeline_enable_backward_allreduce = True + assert not self.elasticity_enabled(), "Elasticity is not currently supported" \ " with pipeline parallelism." @@ -141,7 +146,7 @@ def __init__(self, *super_args, **super_kwargs): f'TOTAL_PARAMS={total_params} ({total_params / 1e6:0.3f}M) ' f'UNIQUE_PARAMS={unique_params} ({unique_params / 1e6:0.3f}M)') - # intialize peer-2-peer communication and allreduce groups + #intialize peer-2-peer communication and allreduce groups if self.is_pipe_parallel: p2p.init_process_groups(self.grid) diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index b2797bb9e652..a5c9087a2ffc 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -112,6 +112,9 @@ def forward(self, inputs): x = layer(x) return x + .. note:: + Pipeline parallelism is not compatible with ZeRO-2 and ZeRO-3. + Args: layers (Iterable): A sequence of layers defining pipeline structure. Can be a ``torch.nn.Sequential`` module. num_stages (int, optional): The degree of pipeline parallelism. If not specified, ``topology`` must be provided. 
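To illustrate the constraint documented above (and enforced by the new `zero_optimization_stage() < 2` assertion in `PipelineEngine`), here is a minimal sketch, not taken from this patch, of wrapping a flat layer list in `PipelineModule`. The layer sizes, stage count, and loss function are illustrative assumptions, and the construction assumes the distributed backend has already been initialized (e.g., via `deepspeed.init_distributed()`).

import torch.nn as nn
from deepspeed.pipe import PipelineModule

# Eight illustrative hidden layers plus a classifier head, expressed as a flat list.
layers = [nn.Linear(1024, 1024) for _ in range(8)] + [nn.Linear(1024, 10)]

# Partition the layers across two pipeline stages; loss_fn lets the final stage
# compute the loss directly.
net = PipelineModule(layers=layers, num_stages=2, loss_fn=nn.CrossEntropyLoss())

# Per the docstring note and the new PipelineEngine assertion, the accompanying
# DeepSpeed config may use ZeRO stage 0 or 1 only; stages 2 and 3 are rejected.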
diff --git a/deepspeed/runtime/pipe/pipe_visualizer.py b/deepspeed/runtime/pipe/pipe_visualizer.py new file mode 100644 index 000000000000..abfeb16fa656 --- /dev/null +++ b/deepspeed/runtime/pipe/pipe_visualizer.py @@ -0,0 +1,70 @@ +from deepspeed.runtime.pipe.schedule import (TrainSchedule, ForwardPass, BackwardPass, OptimizerStep, RecvGrad, + RecvActivation, + SendGrad, SendActivation, LoadMicroBatch, ReduceGrads, ReduceTiedGrads) +from pprint import pprint +from pytablewriter import MarkdownTableWriter + +flatten = lambda t: [item for sublist in t for item in sublist] + + +def expand(steps, include_all=False): + for c, i in enumerate(steps): + string = '' + for j in range(len(i)): + if not include_all: + cond = lambda x: (isinstance(x, ForwardPass) or isinstance(x, BackwardPass)) + else: + cond = lambda x: x + if not i[j]: i[j] = [None] + if i[j] is not None: + if cond(i[j]): + if string != '': + string += ' / ' + string += f'{reprs[type(i[j])]}' + if hasattr(i[j], 'buffer_id'): + string += f'_{i[j].buffer_id + 1}' + steps[c] = string if string != '' else None + return steps + + +reprs = { + ForwardPass: 'fwd', + BackwardPass: 'bwd', + RecvActivation: 'recv_act', + SendActivation: 'send_act', + RecvGrad: 'recv_grad', + SendGrad: 'send_grad', + LoadMicroBatch: 'load_batch', + ReduceGrads: 'reduce_grads', + ReduceTiedGrads: 'reduce_tied_grads', + OptimizerStep: 'step', +} + + +def pipeline_visualizer(num_stages, num_microbatches, include_all=False): + stages = {} + for stage_id in range(num_stages): + steps = [i for i in TrainSchedule(micro_batches=num_microbatches, stages=num_stages - 1 , + stage_id=stage_id).steps()] + steps = expand(steps, include_all=include_all) + stages[stage_id] = steps + value_matrix = [v for k, v in stages.items()] + headers = ['GPU ID'] + [str(i) for i in range(len(stages[0]))] + value_matrix = [[f'GPU {i}'] + value_matrix[i] for i in range(len(value_matrix))] + writer = MarkdownTableWriter( + table_name=f"Pipe Schedule\n", + headers=headers, + value_matrix=value_matrix + ) + string = writer.dumps() + all_steps = flatten(value_matrix) + idle_time = len([i for i in all_steps if i is None]) + print(all_steps) + non_idle_time = len([i for i in all_steps if (i is not None and 'GPU' not in i)]) + string += f'\nNum Devices: {num_stages}\nNum Microbatches: {num_microbatches} \n' \ + f'Idle Time: {idle_time}\nNon Idle Time: {non_idle_time}' + return string + + +if __name__ == "__main__": + print(pipeline_visualizer(num_stages=8, num_microbatches=16, include_all=False)) diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index a4a3541c8767..dda7aa9f4cb6 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -7,6 +7,7 @@ from collections import namedtuple from itertools import product as cartesian_product +from pprint import pprint class ProcessTopology: @@ -22,6 +23,7 @@ class ProcessTopology: Some methods return ProcessCoord namedtuples. """ + def __init__(self, axes, dims): """Create a mapping of n-dimensional tensor coordinates to linear indices. @@ -58,7 +60,7 @@ def get_rank(self, **coord_kwargs): raise ValueError('get_rank() does not support slices. Use filter_match())') key = self.ProcessCoord(**coord_kwargs) - assert key in self.mapping, f'key {kwargs} invalid' + assert key in self.mapping, f'key {coord_kwargs} invalid' return self.mapping[key] def get_axis_names(self): @@ -184,6 +186,7 @@ def filter_match(self, **filter_kwargs): Returns: The list of ranks whose coordinates match filter_kwargs. 
""" + def _filter_helper(x): for key, val in filter_kwargs.items(): if getattr(x, key) != val: @@ -239,12 +242,14 @@ class PipeDataParallelTopology(ProcessTopology): reductions to use high-bandwidth intra-node links and lower-volume pipeline communications to use low-bandwidth inter-node links. """ + def __init__(self, num_pp, num_dp): super().__init__(axes=['pipe', 'data'], dims=[num_pp, num_dp]) class PipeModelDataParallelTopology(ProcessTopology): """ A topology for hybrid pipeline, model, and data parallelism. """ + def __init__(self, num_pp, num_mp, num_dp): super().__init__(axes=['pipe', 'data', 'model'], dims=[num_pp, num_dp, num_mp]) @@ -271,6 +276,7 @@ class PipelineParallelGrid: data_parallel_id = 0, or similarly [9,5] represents wrapped around stages [4,0] for data_parallel_id = 1. """ + def __init__(self, topology=None, process_group=None): # TODO use process_group if provided self.global_rank = dist.get_rank() @@ -303,7 +309,7 @@ def __init__(self, topology=None, process_group=None): for dp in range(self.data_parallel_size): ranks = sorted(self._topo.get_axis_list(axis='data', idx=dp)) if self.global_rank == 0: - #print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}') + # print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}') pass proc_group = dist.new_group(ranks=ranks) if self.global_rank in ranks: @@ -333,7 +339,7 @@ def __init__(self, topology=None, process_group=None): self.pipe_groups = self._topo.get_axis_comm_lists('pipe') for ranks in self.pipe_groups: if self.global_rank == 0: - #print(f'RANK={self.global_rank} building pipeline group: {ranks}') + # print(f'RANK={self.global_rank} building pipeline group: {ranks}') pass proc_group = dist.new_group(ranks=ranks) if self.global_rank in ranks: @@ -392,8 +398,8 @@ def _is_grid_valid(self): ranks *= self._topo.get_dim(ax) return ranks == dist.get_world_size() - #returns the global rank of the process with the provided stage id - #which has the same data_parallel_id as caller process + # returns the global rank of the process with the provided stage id + # which has the same data_parallel_id as caller process def stage_to_global(self, stage_id, **kwargs): me = self._topo.get_coord(self.global_rank) transform = me._replace(pipe=stage_id, **kwargs)._asdict() @@ -459,3 +465,8 @@ def get_slice_parallel_world_size(self): def get_slice_parallel_group(self): return self.slice_proc_group + + +if __name__ == "__main__": + topo = PipeModelDataParallelTopology(num_pp=6, num_dp=2, num_mp=4) + pprint(str(topo)) diff --git a/deepspeed/runtime/swap_tensor/__init__.py b/deepspeed/runtime/swap_tensor/__init__.py new file mode 100644 index 000000000000..8c18558e9423 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/__init__.py @@ -0,0 +1,4 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +''' diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py new file mode 100644 index 000000000000..6a7014c1863c --- /dev/null +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -0,0 +1,44 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+''' + +from deepspeed.runtime.config_utils import get_scalar_param +from deepspeed.runtime.swap_tensor.constants import * + +AIO_DEFAULT_DICT = { + AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT, + AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT, + AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT, + AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT, + AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT +} + + +def get_aio_config(param_dict): + if AIO in param_dict.keys() and param_dict[AIO] is not None: + aio_dict = param_dict[AIO] + return { + AIO_BLOCK_SIZE: + get_scalar_param(aio_dict, + AIO_BLOCK_SIZE, + AIO_BLOCK_SIZE_DEFAULT), + AIO_QUEUE_DEPTH: + get_scalar_param(aio_dict, + AIO_QUEUE_DEPTH, + AIO_QUEUE_DEPTH_DEFAULT), + AIO_THREAD_COUNT: + get_scalar_param(aio_dict, + AIO_THREAD_COUNT, + AIO_THREAD_COUNT_DEFAULT), + AIO_SINGLE_SUBMIT: + get_scalar_param(aio_dict, + AIO_SINGLE_SUBMIT, + AIO_SINGLE_SUBMIT_DEFAULT), + AIO_OVERLAP_EVENTS: + get_scalar_param(aio_dict, + AIO_OVERLAP_EVENTS, + AIO_OVERLAP_EVENTS_DEFAULT) + } + + return AIO_DEFAULT_DICT diff --git a/deepspeed/runtime/swap_tensor/async_swapper.py b/deepspeed/runtime/swap_tensor/async_swapper.py new file mode 100644 index 000000000000..e6e19a4c67ef --- /dev/null +++ b/deepspeed/runtime/swap_tensor/async_swapper.py @@ -0,0 +1,173 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. +""" +import torch + +from deepspeed.utils.logging import logger +from deepspeed.runtime.swap_tensor.utils import swap_out_tensors, SwapBuffer + +INVALID_BUFFER_INDEX = -1 +ASYNC_SWAPPER_WAIT_TIMER = 'async_swap_gradient_wait' + + +class AsyncTensorSwapper(object): + def __init__(self, aio_handle, numel_alignment, timers): + self.free_buffer_index = [] + self.swapping_buffer_index = [] + self.ready_buffer_index = [] + self.current_buffer_index = INVALID_BUFFER_INDEX + self.all_buffers = [] + self.aio_handle = aio_handle + self.numel_alignment = numel_alignment + self.max_numel = 0 + self.num_pending_swaps = 0 + self.timers = timers + self.timer_names = set() + self.num_elements_swapped = 0 + self.dtype = None + + def has_buffers(self): + return len(self.all_buffers) > 0 + + def add_buffers(self, buffer_list): + assert len(self.all_buffers) == 0 + assert all([buffer.is_pinned() for buffer in buffer_list]) + dtype = buffer_list[0].dtype + assert all([buffer.dtype == dtype for buffer in buffer_list]) + + self.dtype = dtype + self.all_buffers = [SwapBuffer(buffer) for buffer in buffer_list] + self.free_buffer_index += [i for i in range(len(self.all_buffers))] + self.max_numel = max([buffer.numel() for buffer in buffer_list]) + self.timer_names = set() + + def get_timer_names(self): + return list(self.timer_names) + + def release_buffers(self): + self._report_statistics('Swapped out[Before flush]') + self._flush_buffers_until_complete() + self._report_statistics('Swapped out[After flush]') + + pinned_buffers = [buf.buffer for buf in self.all_buffers] + self.all_buffers = [] + self.free_buffer_index = [] + self.current_buffer_index = INVALID_BUFFER_INDEX + self.num_elements_swapped = 0 + self.dtype = None + + return pinned_buffers + + def swap_out_tensors(self, tensor_list, path_list): + for tensor, swap_path in zip(tensor_list, path_list): + self._swap_out_tensor(tensor, swap_path) + + def _report_statistics(self, message): + if torch.distributed.get_rank() == 0: + element_size = torch.tensor([], dtype=self.dtype).element_size() + swapped_GB = (self.num_elements_swapped * 
element_size) / (1024**3) + logger.info( + f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB' + ) + + def _swap_out_tensor(self, tensor, swap_path): + assert len(self.all_buffers) > 0 + + aligned_numel = self._io_aligned_numel(tensor.numel()) + assert aligned_numel <= self.max_numel + + self._make_swap_space(aligned_numel) + assert self.current_buffer_index != INVALID_BUFFER_INDEX + + swap_buffer = self._get_current_buffer() + swap_buffer.insert_tensor(tensor, swap_path, aligned_numel) + + def _make_swap_space(self, numel): + if self.current_buffer_index == INVALID_BUFFER_INDEX: + self._allocate_buffer() + return + + if not self._get_current_buffer().has_space(numel): + if len(self.free_buffer_index) > 0: + self._flush_ready_buffers() + else: + self._flush_buffers_until_complete() + self._allocate_buffer() + + def _io_aligned_numel(self, numel): + remainder = numel % self.numel_alignment + return numel if remainder == 0 else (numel + self.numel_alignment - remainder) + + def _allocate_buffer(self): + assert self.current_buffer_index == INVALID_BUFFER_INDEX + assert len(self.all_buffers) > 0 + assert len(self.free_buffer_index) > 0 + self.current_buffer_index = self.free_buffer_index[-1] + self.free_buffer_index = self.free_buffer_index[:-1] + + def _flush_ready_buffers(self): + if self.current_buffer_index != INVALID_BUFFER_INDEX: + self.ready_buffer_index.append(self.current_buffer_index) + self.current_buffer_index = INVALID_BUFFER_INDEX + + self._swap_out_ready_buffers() + + def _flush_buffers_until_complete(self): + self._flush_ready_buffers() + assert len(self.ready_buffer_index) == 0 + + self._wait_for_swap_complete() + assert len(self.swapping_buffer_index) == 0 + assert len(self.free_buffer_index) == len(self.all_buffers) + + def _swap_out_ready_buffers(self): + for buffer_index in self.ready_buffer_index: + buffer = self._get_buffer(buffer_index) + swap_tensors = buffer.get_swap_tensors() + swap_paths = buffer.get_swap_paths() + self.num_pending_swaps += len(swap_tensors) + swap_out_tensors(self.aio_handle, swap_tensors, swap_paths) + + self.swapping_buffer_index += self.ready_buffer_index + self.ready_buffer_index = [] + + def _wait_for_swap_complete(self): + assert len(self.swapping_buffer_index) > 0 + + self._start_timer(ASYNC_SWAPPER_WAIT_TIMER) + assert self.aio_handle.wait() == self.num_pending_swaps + self._stop_timer(ASYNC_SWAPPER_WAIT_TIMER) + self.timer_names.add(ASYNC_SWAPPER_WAIT_TIMER) + + self.num_pending_swaps = 0 + + for buffer_index in self.swapping_buffer_index: + buffer = self._get_buffer(buffer_index) + self.num_elements_swapped += buffer.get_num_elem() + buffer.reset() + + self.free_buffer_index += self.swapping_buffer_index + assert len(self.free_buffer_index) <= len(self.all_buffers) + self.swapping_buffer_index = [] + + def _get_buffer(self, index): + assert index != INVALID_BUFFER_INDEX + return self.all_buffers[index] + + def _get_current_buffer(self): + return self._get_buffer(self.current_buffer_index) + + def _start_timer(self, name): + if self.timers: + self.timers(name).start() + + def _stop_timer(self, name): + if self.timers: + self.timers(name).stop() + + def _log_timers(self, name_list, force=False): + if self.timers and force: + self.timers.log(name_list) diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py new file mode 100644 index 000000000000..752ec8dcaacf --- /dev/null +++ b/deepspeed/runtime/swap_tensor/constants.py @@ -0,0 +1,27 @@ +""" +"Copyright 2020 The Microsoft 
DeepSpeed Team. +Licensed under the MIT license. +""" +######################################### +# AIO +######################################### +AIO_FORMAT = ''' +"aio": { + "block_size": 1048576, + "queue_depth": 8, + "thread_count": 1, + "single_submit": false, + "overlap_events": true +} +''' +AIO = "aio" +AIO_BLOCK_SIZE = "block_size" +AIO_BLOCK_SIZE_DEFAULT = 1048576 +AIO_QUEUE_DEPTH = "queue_depth" +AIO_QUEUE_DEPTH_DEFAULT = 8 +AIO_THREAD_COUNT = "thread_count" +AIO_THREAD_COUNT_DEFAULT = 1 +AIO_SINGLE_SUBMIT = "single_submit" +AIO_SINGLE_SUBMIT_DEFAULT = False +AIO_OVERLAP_EVENTS = "overlap_events" +AIO_OVERLAP_EVENTS_DEFAULT = True diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py new file mode 100644 index 000000000000..5785ff7cc36d --- /dev/null +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -0,0 +1,526 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. +""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer +from deepspeed.runtime.swap_tensor.utils import SwapBufferManager, SwapBufferPool + + +class FlattenedTensorSwapInfo(object): + def __init__(self, path, length, offset): + self.path = path + self.offset = offset + self.length = length + + +class OptimizerStateSwapInfo(object): + def __init__(self, parameter, numel, base_folder): + self.tensors = [] + self.param_id = id(parameter) + self.swap_folder = base_folder + self.swap_paths = [] + self.swapped_gradients = {} + self.unswapped_gradients = {} + self.tensor_numel = numel + self.tensor_dtype = parameter.dtype + self.tensor_device = parameter.device + self.has_state_tensors = False + self._add_tensors([parameter]) + + def numel(self): + return self.tensor_numel + + def has_gradients(self): + return self.swapped_gradients or self.unswapped_gradients + + def _add_tensors(self, tensor_list): + for t in tensor_list: + self.tensors.append(t) + self.swap_paths.append(os.path.join(self.swap_folder, f'{id(t)}.tensor.swp')) + + def add_state_tensors(self, tensor_list): + self.has_state_tensors = True + self._add_tensors(tensor_list) + + def device(self): + return self.tensor_device + + def dtype(self): + return self.tensor_dtype + + def release_memory(self): + for tensor in self.tensors: + tensor.data = torch.Tensor() + + def get_or_create_gradient_paths(self, offsets, lengths): + gradient_paths = [] + for offset, length in zip(offsets, lengths): + if not offset in self.swapped_gradients.keys(): + path = os.path.join( + self.swap_folder, + f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') + self.swapped_gradients[offset] = FlattenedTensorSwapInfo( + path, + length, + offset) + + gradient_paths.append(self.swapped_gradients[offset].path) + + return gradient_paths + + def set_swap_buffers(self, buffers): + compute_lengths = [self.numel()] * len(self.tensors) + compute_buffers = get_sized_buffers(buffers, compute_lengths) + for t, buffer in zip(self.tensors, compute_buffers): + t.data = buffer.data + + def get_swap_gradient_buffers(self, swap_buffer): + assert self.numel() <= swap_buffer.numel() + return [ + swap_buffer.narrow(0, + grad.offset, + grad.length) for 
grad in self.swapped_gradients.values() + ] + + def get_swap_gradient_paths(self): + return [grad.path for grad in self.swapped_gradients.values()] + + def get_unpinned_state_tensors(self): + return [t for t in self.tensors if not t.is_pinned()] + + def read_unswapped_gradients(self, dest_buffer): + num_elem_count = 0 + for offset, grad_partition in self.unswapped_gradients.items(): + dst_tensor = dest_buffer.narrow(0, offset, grad_partition.numel()) + dst_tensor.data.copy_(grad_partition.data) + num_elem_count += grad_partition.numel() + + return num_elem_count + + def release_unswapped_gradients(self): + self.unswapped_gradients = {} + + +SWAPPER_DEBUG_MODE = False +SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' + + +class OptimizerSwapper(object): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + self.swap_config = swap_config + self.aio_config = aio_config + + # NVMe swap management + self.swap_params_info = {} + self.swap_element_size = torch.tensor([], dtype=dtype).element_size() + self.swap_folder = os.path.join(base_folder, + 'optimizer', + f'rank{torch.distributed.get_rank()}') + os.makedirs(self.swap_folder, exist_ok=True) + + self.optimizer = optimizer + + # Swap buffer management + self.largest_numel = largest_numel + self.dtype = dtype + self.swap_buffer_manager = SwapBufferManager( + num_elems=largest_numel, + count=swap_config[OFFLOAD_OPTIMIZER_BUFFER_COUNT], + dtype=dtype) + + # Read/Write alignment for each thread during Intra-request parallelism + self.min_aio_bytes = max(MIN_AIO_BYTES, aio_config[AIO_BLOCK_SIZE]) + self.aligned_bytes = AIO_ALIGNED_BYTES * aio_config[AIO_THREAD_COUNT] + self.numel_alignment = self.aligned_bytes // self.swap_element_size + + # Timers + self.timers = timers + self.timer_names = set() + + # Print exclusion list + self.print_exclude_list = [ + 'optimizer', + 'swap_buffer_manager', + 'swap_params_info', + 'timers', + 'timer_names', + ] + + def swappable_tensor(self, param=None, numel=None): + assert param is not None or numel is not None, "Either param or numel must be provided" + if param is not None: + return self.min_aio_bytes <= (param.numel() * self.swap_element_size) + return self.min_aio_bytes <= (numel * self.swap_element_size) + + def init_timers(self): + self.timer_names = set() + + def log_timers(self): + if self.timer_names: + self._log_timers(list(self.timer_names), force=True) + + def pre_backward(self): + self.init_timers() + + def post_backward(self): + pass + + def _flush_gradient_swapper(self, gradient_swapper): + if gradient_swapper.has_buffers(): + self._start_timer(SWAP_OUT_GRADIENT_TIMER) + pinned_buffers = gradient_swapper.release_buffers() + self.swap_buffer_manager.free(pinned_buffers) + self._stop_timer(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.update(gradient_swapper.get_timer_names()) + + def _swap_out_gradients(self, + parameter, + gradient_offsets, + gradient_tensors, + gradient_swapper): + if not id(parameter) in self.swap_params_info.keys(): + return + + swap_info = self.swap_params_info[id(parameter)] + + swappable_tensors = [] + swappable_offsets = [] + swappable_lengths = [] + + aligned_gradients, aligned_offsets = self._adjust_for_misaligned_lengths( + tensors=gradient_tensors, + offsets=gradient_offsets + ) + + self._start_timer(SWAP_OUT_GRADIENT_TIMER) + for tensor, offset in zip(aligned_gradients, aligned_offsets): + if not self.swappable_tensor(param=tensor): + 
swap_info.unswapped_gradients[offset] = tensor + continue + + swappable_tensors.append(tensor) + swappable_offsets.append(offset) + swappable_lengths.append(tensor.numel()) + + if len(swappable_tensors) > 0: + if not gradient_swapper.has_buffers(): + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + + gradient_swapper.add_buffers(pinned_buffers) + + swappable_paths = swap_info.get_or_create_gradient_paths( + swappable_offsets, + swappable_lengths) + + gradient_swapper.swap_out_tensors(tensor_list=swappable_tensors, + path_list=swappable_paths) + + self._stop_timer(SWAP_OUT_GRADIENT_TIMER) + self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) + + def _initialize_from_swapped_fp16_params(self, + aio_handle, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + assert len(fp32_parameters) == len(fp16_partitions_info) + assert len(fp32_parameters) == len(fp16_num_elems) + assert all([buffer.is_pinned() for buffer in fp16_pinned_buffers]) + + fp32_swap_paths = self._get_swap_paths(parameters=fp32_parameters, + num_elems=fp16_num_elems) + + fp32_pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + + fp16_buffer_numel = [buf.numel() for buf in fp16_pinned_buffers] + assert all([numel >= self.largest_numel for numel in fp16_buffer_numel]), \ + f"numel of fp16 buffers {fp16_buffer_numel} is too small for initializing fp32 params {self.largest_numel}" + + fp32_swap_buffers = SwapBufferPool(fp32_pinned_buffers) + fp16_swap_buffers = SwapBufferPool(fp16_pinned_buffers) + + curr_index = 0 + while curr_index < len(fp32_parameters): + fp16_pinned_tensors = self._swap_in_fp16_params( + aio_handle=aio_handle, + fp16_num_elems=fp16_num_elems[curr_index:], + fp16_partitions_info=fp16_partitions_info[curr_index:], + fp16_swap_buffers=fp16_swap_buffers) + + if torch.distributed.get_rank() == 0 and SWAPPER_DEBUG_MODE: + for i, tensor in enumerate(fp16_pinned_tensors): + true_index = curr_index + i + logger.info( + f'swap_in_fp16_param: fp32_id = {id(fp32_parameters[true_index])} index = {true_index} orig_num_elem = {fp16_num_elems[true_index]}, swap_num_elem = {fp16_pinned_tensors[i].numel()}' + ) + + swap_out_count = self._swap_out_fp16_params( + aio_handle=aio_handle, + fp32_swap_paths=fp32_swap_paths[curr_index:], + fp32_swap_buffers=fp32_swap_buffers, + fp16_pinned_tensors=fp16_pinned_tensors) + assert swap_out_count == len(fp16_pinned_tensors), \ + f"{swap_out_count} does not match {len(fp16_pinned_tensors)}" + + fp16_swap_buffers.reset() + fp32_swap_buffers.reset() + curr_index += swap_out_count + + self.swap_buffer_manager.free(fp32_pinned_buffers) + + def _swap_in_fp16_params(self, + aio_handle, + fp16_num_elems, + fp16_partitions_info, + fp16_swap_buffers): + assert len(fp16_num_elems) > 0 + + swapped_fp16_tensors = [] + swap_tensors = [] + swap_paths = [] + unswapped_srcs = [] + unswapped_dsts = [] + + for i, numel in enumerate(fp16_num_elems): + pinned_tensor, _ = fp16_swap_buffers.allocate_tensor(numel, None, numel) + if pinned_tensor is None: + break + + swapped_fp16_tensors.append(pinned_tensor) + offset = 0 + for tensor, partition_numel, partition_path in fp16_partitions_info[i]: + dst_tensor = pinned_tensor.narrow(0, offset, partition_numel) + if partition_path is None: + unswapped_srcs.append(tensor) + unswapped_dsts.append(dst_tensor) + else: + swap_paths.append(partition_path) + swap_tensors.append(dst_tensor) + offset += partition_numel + + assert 
len(swapped_fp16_tensors) + len(unswapped_srcs) > 0 + ret = swap_in_tensors(aio_handle, swap_tensors, swap_paths) + for src, dst in zip(unswapped_srcs, unswapped_dsts): + dst.data.copy_(src.data) + + assert len(swap_tensors) == aio_handle.wait() + + return swapped_fp16_tensors + + def _swap_out_fp16_params(self, + aio_handle, + fp32_swap_paths, + fp32_swap_buffers, + fp16_pinned_tensors): + + assert len(fp16_pinned_tensors) <= len(fp32_swap_paths) + swap_out_count = 0 + for i, fp16_tensor in enumerate(fp16_pinned_tensors): + if not fp32_swap_buffers.has_space(fp16_tensor.numel()): + fp32_swap_buffers.swap_out(aio_handle) + fp32_swap_buffers.reset() + + pinned_tensor, _ = fp32_swap_buffers.insert_tensor( + fp16_tensor, + fp32_swap_paths[i], + self._io_aligned_numel(fp16_tensor.numel()) + ) + assert pinned_tensor is not None + swap_out_count += 1 + + if len(fp32_swap_buffers.get_swap_tensors()) > 0: + fp32_swap_buffers.swap_out(aio_handle) + + return swap_out_count + + def _initialize_parameters(self, parameters, src_tensors, aio_handle): + assert len(parameters) == len(src_tensors) + + swap_paths = self._get_swap_paths(parameters=parameters, + num_elems=[src.numel() for src in src_tensors]) + + SWAP_INIT_TIMER = "swap_init_write" + self._start_timer(SWAP_INIT_TIMER) + + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + assert pinned_buffers is not None + + self._swap_out_unpinned_tensors(aio_handle=aio_handle, + unpinned_tensors=src_tensors, + dest_paths=swap_paths, + pinned_buffers=pinned_buffers) + + if torch.distributed.get_rank() == 0 and SWAPPER_DEBUG_MODE: + for i, tensor in enumerate(src_tensors): + logger.info( + f'copy_in_fp16_param: fp32_id = {id(parameters[i])} index = {i}, swap_num_elem = {src_tensors[i].numel()}' + ) + + self.swap_buffer_manager.free(pinned_buffers) + + self._stop_timer(SWAP_INIT_TIMER) + self._log_timers([SWAP_INIT_TIMER]) + + def _get_swap_paths(self, parameters, num_elems): + swap_info_list = [ + self._create_param_swap_info(parameter=p, + numel=numel) \ + for p, numel in zip(parameters, num_elems) + ] + assert len(swap_info_list) == len(num_elems) + + swap_paths = [info.swap_paths[0] for info in swap_info_list] + return swap_paths + + def _swap_out_unpinned_tensors(self, + aio_handle, + unpinned_tensors, + dest_paths, + pinned_buffers): + + swap_buffer_count = len(pinned_buffers) + unpinned_tensor_count = len(unpinned_tensors) + + for i in range(0, unpinned_tensor_count, swap_buffer_count): + swap_tensor_count = min((unpinned_tensor_count - i), swap_buffer_count) + + src_tensors = unpinned_tensors[i:(i + swap_tensor_count)] + compute_lengths = [t.numel() for t in src_tensors] + compute_buffers = get_sized_buffers(pinned_buffers, compute_lengths) + + for dst, src in zip(compute_buffers, src_tensors): + dst.data.copy_(src.data) + + swap_lengths = [self._io_aligned_numel(t.numel()) for t in src_tensors] + swap_buffers = get_sized_buffers(pinned_buffers, swap_lengths) + + swap_paths = dest_paths[i:(i + swap_tensor_count)] + swap_out_tensors(aio_handle, swap_buffers, swap_paths) + + assert aio_handle.wait() == swap_tensor_count + + def _adjust_for_misaligned_lengths(self, tensors, offsets): + new_tensors = [] + new_offsets = [] + + for orig_tensor, orig_offset in zip(tensors, offsets): + if not self.swappable_tensor(param=orig_tensor): + new_tensors.append(orig_tensor) + new_offsets.append(orig_offset) + continue + + remainder = orig_tensor.numel() % self.numel_alignment + if remainder == 0: + 
new_tensors.append(orig_tensor) + new_offsets.append(orig_offset) + continue + + # Split into two by making remainder a tensor + aligned_length = (orig_tensor.numel() // + self.numel_alignment) * self.numel_alignment + new_tensors.append(orig_tensor.narrow(0, 0, aligned_length)) + new_offsets.append(orig_offset) + + # remainder tensor + new_tensors.append(orig_tensor.narrow(0, aligned_length, remainder)) + new_offsets.append(orig_offset + aligned_length) + + return new_tensors, new_offsets + + def _retrieve_unswapped_grad_partitions(self, swap_info, dest_buffer): + UNSWAPPED_READ_GRADIENTS = 'unswapped_read_gradients' + self._start_timer(UNSWAPPED_READ_GRADIENTS) + tensor_count = len(swap_info.unswapped_gradients) + num_elem_count = swap_info.read_unswapped_gradients(dest_buffer) + self._stop_timer(UNSWAPPED_READ_GRADIENTS) + self._log_timers([UNSWAPPED_READ_GRADIENTS]) + + # It should be safe to discard unswapped gradient partitions + swap_info.release_unswapped_gradients() + + if SWAPPER_DEBUG_MODE: + logger.info( + f'optimizer_retrieve_unswapped_gradients: param={swap_info.param_id} tensor_count={tensor_count} elem_count={num_elem_count}' + ) + + def _get_state_tensors(self, parameter): + if not parameter in self.optimizer.state: + return [] + + tensor_list = [] + for value in self.optimizer.state[parameter].values(): + if torch.is_tensor(value): + tensor_list.append(value) + + return tensor_list + + def _update_param_state_info(self, swap_info, parameter): + if not swap_info.has_state_tensors: + state_tensors = self._get_state_tensors(parameter) + if state_tensors: + swap_info.add_state_tensors(state_tensors) + + def _create_param_swap_info(self, parameter, numel): + param_id = id(parameter) + assert not param_id in self.swap_params_info + + self.swap_params_info[param_id] = OptimizerStateSwapInfo( + parameter=parameter, + numel=numel, + base_folder=self.swap_folder) + swap_info = self.swap_params_info[param_id] + + self._update_param_state_info(swap_info, parameter) + + return swap_info + + def _get_param_swap_info(self, parameter): + param_id = id(parameter) + swap_info = self.swap_params_info.get(param_id, None) + + if swap_info is not None: + self._update_param_state_info(swap_info, parameter) + + return swap_info + + def _start_timer(self, name): + if self.timers: + self.timers(name).start() + + def _stop_timer(self, name): + if self.timers: + self.timers(name).stop() + + def _log_timers(self, name_list, force=False): + if self.timers and (SWAPPER_DEBUG_MODE or force): + self.timers.log(name_list) + + def _io_aligned_numel(self, numel): + remainder = numel % self.numel_alignment + return numel if remainder == 0 else (numel + self.numel_alignment - remainder) diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py new file mode 100644 index 000000000000..fb321f3b3937 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -0,0 +1,260 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
+""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder + +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers, get_sized_buffer +from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper +from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper + +DEBUG_MODE = False + +SWAP_IN_PARAM_TIMER = 'swap_in_param' +SWAP_OUT_PARAM_TIMER = 'swap_out_param' +SWAP_IN_GRADIENT_TIMER = 'swap_in_gradient' + + +class PartitionedOptimizerSwapper(OptimizerSwapper): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + super(PartitionedOptimizerSwapper, + self).__init__(swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers) + + aio_op = AsyncIOBuilder().load() + self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + # Overlap swapping out + self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.aio_handle, + numel_alignment=self.numel_alignment, + timers=self.timers) + + self.print_exclude_list += [ + 'aio_handle', + 'gradient_swapper', + 'print_exclude_list' + ] + + if torch.distributed.get_rank() == 0: + print_object(obj=self, + name='PartitionedOptimizerSwapper', + exclude_list=self.print_exclude_list) + + def initialize_parameters(self, parameters, src_tensors): + self._initialize_parameters(parameters=parameters, + src_tensors=src_tensors, + aio_handle=self.aio_handle) + + def initialize_from_swapped_fp16_params(self, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + self._initialize_from_swapped_fp16_params( + aio_handle=self.aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) + + def flush_gradients(self): + self._flush_gradient_swapper(self.gradient_swapper) + + def swap_in_optimizer_state(self, parameter, async_parameter=None): + swap_info = self._get_param_swap_info(parameter) + if swap_info is None: + return + + self._flush_gradient_swapper(self.gradient_swapper) + + required_buffer_count = len( + swap_info.tensors) + (1 if swap_info.has_gradients() else 0) + aligned_numel = self._io_aligned_numel(swap_info.numel()) + pinned_buffers = self.swap_buffer_manager.allocate(num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) + assert pinned_buffers is not None + self.allocated_swap_buffers = pinned_buffers.copy() + + self._start_timer(SWAP_IN_PARAM_TIMER) + self._swap_in_parameter(aio_handle=self.aio_handle, + parameter=parameter, + dest_buffers=pinned_buffers[:required_buffer_count]) + self._stop_timer(SWAP_IN_PARAM_TIMER) + self.timer_names.add(SWAP_IN_PARAM_TIMER) + + self._start_timer(SWAP_IN_GRADIENT_TIMER) + self._swap_in_gradients(aio_handle=self.aio_handle, + parameter=parameter, + dest_buffer=pinned_buffers[-1]) + self._stop_timer(SWAP_IN_GRADIENT_TIMER) + self.timer_names.add(SWAP_IN_GRADIENT_TIMER) + + def swap_out_optimizer_state(self, parameter, async_swap=False): + swap_info = self._get_param_swap_info(parameter=parameter) + + if swap_info is None: + return + + 
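# The swap-out path below first separates optimizer state tensors that already live in
# pinned memory from those that do not: pinned tensors are written to NVMe directly,
# while unpinned tensors are staged through pinned buffers before being written. Once
# the writes complete, each state tensor's storage is released by pointing it at an
# empty tensor, and the borrowed swap buffers are returned to the buffer manager.
#
# Illustrative call pattern around an optimizer step (a sketch only, assuming a ZeRO-3
# run that constructs this swapper with NVMe optimizer offload; `swapper`, `param` and
# `optimizer` are placeholder names, not part of this patch):
#
#   swapper.swap_in_optimizer_state(param)         # read state (and gradients) from NVMe
#   optimizer.step()                               # update using the restored state
#   swapper.swap_out_optimizer_state(param)        # write the updated state back to NVMe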
self._start_timer(SWAP_OUT_PARAM_TIMER) + pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._seperate_pinned_tensors(swap_info) + swap_bytes = sum([ + self._io_aligned_numel(t.numel()) * t.element_size() + for t in swap_info.tensors + ]) + + WRITE_TIMER = 'swap_submit_write' + self._start_timer(WRITE_TIMER) + + swap_out_tensors(self.aio_handle, pinned_tensors, pinned_paths) + assert self.aio_handle.wait() == len(pinned_tensors) + for t in pinned_tensors: + t.data = torch.Tensor() + + if len(unpinned_tensors) > 0: + pinned_buffers = self.swap_buffer_manager.allocate_all( + num_elems=self.largest_numel, + dtype=self.dtype) + self._swap_out_unpinned_tensors(aio_handle=self.aio_handle, + unpinned_tensors=unpinned_tensors, + dest_paths=unpinned_paths, + pinned_buffers=pinned_buffers) + self.allocated_swap_buffers += pinned_buffers + + for t in unpinned_tensors: + t.data = torch.Tensor() + self._stop_timer(WRITE_TIMER) + + self.swap_buffer_manager.free(self.allocated_swap_buffers) + self.allocated_swap_buffers = [] + + self._stop_timer(SWAP_OUT_PARAM_TIMER) + self.timer_names.add(SWAP_OUT_PARAM_TIMER) + + self._log_timers([WRITE_TIMER]) + + if DEBUG_MODE and torch.distributed.get_rank() == 0: + logger.info(f'optimizer_param_swap_out: {(swap_bytes/(1024**3)):5.2f} GB') + + def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): + self._swap_out_gradients(parameter=parameter, + gradient_offsets=gradient_offsets, + gradient_tensors=gradient_tensors, + gradient_swapper=self.gradient_swapper) + + def _swap_in_parameter(self, aio_handle, parameter, dest_buffers): + swap_info = self._get_param_swap_info(parameter) + if swap_info is None: + return + + assert len(swap_info.tensors) <= len(dest_buffers) + + swap_lengths = [self._io_aligned_numel(swap_info.numel())] * len( + swap_info.tensors) + swap_buffers = get_sized_buffers(dest_buffers, swap_lengths) + + READ_TIMER = 'swap_submit_read_param' + WAIT_TIMER = 'swap_wait_read_param' + + self._start_timer(READ_TIMER) + swap_in_tensors(aio_handle, swap_buffers, swap_info.swap_paths) + self._stop_timer(READ_TIMER) + + swap_bytes = sum( + [buffer.numel() * buffer.element_size() for buffer in swap_buffers]) + + self._start_timer(WAIT_TIMER) + aio_handle.wait() + self._stop_timer(WAIT_TIMER) + + compute_lengths = [swap_info.numel()] * len(swap_info.tensors) + compute_buffers = get_sized_buffers(dest_buffers, compute_lengths) + for t, buffer in zip(swap_info.tensors, compute_buffers): + t.data = buffer.data + + self._log_timers([READ_TIMER, WAIT_TIMER]) + if DEBUG_MODE and torch.distributed.get_rank() == 0: + logger.info(f'optimizer_param_swap_in: {(swap_bytes/(1024**3)):5.2f} GB') + + def _seperate_pinned_tensors(self, swap_info): + pinned_tensors = [] + pinned_paths = [] + + unpinned_tensors = [] + unpinned_paths = [] + + for tensor, path in zip(swap_info.tensors, swap_info.swap_paths): + if tensor.is_pinned(): + pinned_tensors.append(tensor) + pinned_paths.append(path) + else: + unpinned_tensors.append(tensor) + unpinned_paths.append(path) + + return pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths + + def _swap_in_pinned_gradients(self, aio_handle, parameter, gradient_tensor): + swap_info = self.swap_params_info[id(parameter)] + param_gradients = swap_info.swapped_gradients.values() + swap_buffers = [ + gradient_tensor.narrow(0, + grad.offset, + grad.length) for grad in param_gradients + ] + swap_paths = [grad.path for grad in param_gradients] + SWAP_READ_GRADIENTS = 'swap_submit_read_gradient' + 
SWAP_WAIT_GRADIENTS = 'swap_submit_wait_gradient' + + self._start_timer(SWAP_READ_GRADIENTS) + swap_in_tensors(aio_handle, swap_buffers, swap_paths) + self._stop_timer(SWAP_READ_GRADIENTS) + + self._start_timer(SWAP_WAIT_GRADIENTS) + assert len(swap_buffers) == aio_handle.wait() + self._stop_timer(SWAP_WAIT_GRADIENTS) + + self._log_timers([SWAP_READ_GRADIENTS, SWAP_WAIT_GRADIENTS]) + + def _swap_in_gradients(self, aio_handle, parameter, dest_buffer): + swap_info = self.swap_params_info.get(id(parameter), None) + if not (swap_info and swap_info.has_gradients()): + return + + assert dest_buffer.is_pinned() + assert parameter.numel() <= dest_buffer.numel() + + parameter.grad = dest_buffer.narrow(0, 0, parameter.numel()) + + if swap_info.swapped_gradients: + self._swap_in_pinned_gradients(aio_handle, parameter, parameter.grad) + + if swap_info.unswapped_gradients: + self._retrieve_unswapped_grad_partitions(swap_info=swap_info, + dest_buffer=parameter.grad) diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py new file mode 100644 index 000000000000..0f8a690b7a91 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -0,0 +1,308 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. +""" + +import os +from enum import Enum +import torch +import torch.distributed as dist + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder +from .constants import * +from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, print_object +from ..zero.offload_constants import * + + +def print_rank_0(message, debug=False, force=False): + if torch.distributed.get_rank() == 0 and (debug or force): + print(message) + + +class PartitionedParamStatus(Enum): + # Partitioned parameters are present and ready for use + AVAILABLE = 1 + + # partitioned params are in some non-memory device + NOT_AVAILABLE = 2 + + # partitioned params are being read from some non-memory device. 
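# (for example, an asynchronous NVMe read has been submitted but synchronize_reads() has not yet completed it)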
+ INFLIGHT = 3 + + +class AsyncPartitionedParameterSwapper(object): + def __init__(self, ds_config): + + aio_op = AsyncIOBuilder().load(verbose=False) + self.aio_handle = aio_op.aio_handle + + #set swap buffers, create aio handles + self._configure_aio(ds_config) + + #mapping from param id to path + self.id_to_path = {} + + #mapping from pram_id to buffer id + self.param_id_to_buffer_id = {} + + #number of elements in the param + self.param_id_to_numel = {} + + self.pending_writes = 0 + self.pending_reads = 0 + + #keep track of async swap in params and buffers + self.inflight_params = [] + self.inflight_swap_in_buffers = [] + self.inflight_numel = 0 + + #keep track of available params + self.available_params = set() + self.available_numel = 0 + + self.invalid_buffer = torch.tensor(1).half() + + if dist.get_rank() == 0: + exclude_list = ['aio_read_handle', 'aio_write_handle', 'buffers'] + print_object(obj=self, + name='AsyncPartitionedParameterSwapper', + exclude_list=exclude_list) + + def available_swap_in_buffers(self): + return len(self.available_buffer_ids) + + def _configure_aio(self, ds_config): + self.swap_config = ds_config.zero_config.offload_param + self.swap_folder = os.path.join(self.swap_config[OFFLOAD_PARAM_NVME_PATH], + 'zero_stage_3', + 'fp16params', + f'rank{dist.get_rank()}') + os.makedirs(self.swap_folder, exist_ok=True) + + self.elements_per_buffer = self.swap_config[OFFLOAD_PARAM_BUFFER_SIZE] + self.param_buffer_count = self.swap_config[OFFLOAD_PARAM_BUFFER_COUNT] + + self.available_buffer_ids = [i for i in range(self.param_buffer_count)] + self.reserved_buffer_ids = [] + + self.buffers = torch.empty(int(self.elements_per_buffer * + self.param_buffer_count), + dtype=torch.half, + pin_memory=True, + requires_grad=False) + + self.aio_config = ds_config.aio_config + + self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], + self.aio_config[AIO_QUEUE_DEPTH], + self.aio_config[AIO_SINGLE_SUBMIT], + self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_config[AIO_THREAD_COUNT]) + + self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], + self.aio_config[AIO_QUEUE_DEPTH], + self.aio_config[AIO_SINGLE_SUBMIT], + self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_config[AIO_THREAD_COUNT]) + + self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE]) + + self.swap_element_size = torch.tensor([], dtype=torch.half).element_size() + self.swap_out_params = [] + + #Check if partiitoned param or numel in a tensor is swappable or not + def swappable_tensor(self, param=None, numel=None): + if param is not None: + assert numel is None, "Both parma and numel cannot be provided" + numel = param.ds_tensor.ds_numel + if numel is not None: + return self.min_aio_bytes <= numel * self.swap_element_size + assert False, "Either param or numel must be provided" + + def get_path(self, param, must_exist=False): + paths, _ = self._get_paths([param], must_exist=must_exist) + return paths[0] + + def _get_paths(self, params, must_exist=False): + paths = [] + tensors = [] + for param in params: + param_id = param.ds_id + + if param_id in self.id_to_path.keys(): + param_path = self.id_to_path[param_id] + else: + assert not must_exist, f"Path for param id {param_id} does not exist" + param_path = os.path.join(self.swap_folder, + f'{param_id}_param.tensor.swp') + + self.id_to_path[param_id] = param_path + paths.append(param_path) + tensors.append(param.ds_tensor) + return paths, tensors + + def _track_numel(self, params): + for param in params: + assert 
param.ds_tensor is not None, "Partitioned tensor is None" + self.param_id_to_numel[param.ds_id] = param.ds_tensor.ds_numel + + def _allocate_and_return_buffers_for_swap_in(self, params): + buffers = [] + for param in params: + param_id = param.ds_id + assert param_id in self.param_id_to_numel.keys(), f" Number of elements in param {param_id} is unknown" + assert param_id not in self.param_id_to_buffer_id.keys(), f"param {param_id} already assigned swap buffer id {self.param_id_to_buffer_id[param_id]}" + + buffer_id = self.available_buffer_ids.pop() + print_rank_0( + f"param {param.ds_id} is assigned swap in buffer id {buffer_id} ") + self.param_id_to_buffer_id[param_id] = buffer_id + buffer = self.buffers.narrow(0, + int(buffer_id * self.elements_per_buffer), + self.param_id_to_numel[param_id]) + buffers.append(buffer) + + return buffers + + #waits for inflight nvme write to complete + def synchronize_writes(self): + if self.pending_writes == 0: + return + assert self.pending_writes == self.aio_write_handle.wait() + self.pending_writes = 0 + self.remove_partition_and_release_buffers(self.swap_out_params) + self.swap_out_params = [] + + #waits for inflight nvme reads to complete + def synchronize_reads(self): + if self.pending_reads == 0: + return + + assert self.pending_reads == self.aio_read_handle.wait() + + self.pending_reads = 0 + + for param, swap_in_buffer in zip(self.inflight_params, self.inflight_swap_in_buffers): + param.ds_tensor.data = swap_in_buffer.data + param.ds_tensor.status = PartitionedParamStatus.AVAILABLE + + self.available_params.update([param.ds_id for param in self.inflight_params]) + self.available_numel += self.inflight_numel + + self.inflight_params = [] + self.inflight_swap_in_buffers = [] + self.inflight_numel = 0 + + #Removes the memory assignment and releases the buffers + #Should only be executed after swapping out the tensors + def remove_partition_and_release_buffers(self, params): + for param in params: + param_id = param.ds_id + + if param_id in self.param_id_to_buffer_id.keys(): + + buffer_id = self.param_id_to_buffer_id[param_id] + + assert buffer_id is not None, "Missing buffer id for releasing" + + self.available_buffer_ids.append(buffer_id) + del self.param_id_to_buffer_id[param_id] + print_rank_0(f"param {param.ds_id} releases buffer id {buffer_id} ") + + if param_id in self.available_params: + self.available_params.remove(param_id) + self.available_numel -= self.param_id_to_numel[param_id] + + param.ds_tensor.data = self.invalid_buffer.data + param.ds_tensor.status = PartitionedParamStatus.NOT_AVAILABLE + + #writes from in memory to nvme. Does not release the buffers + def _swap_out(self, params, async_op=True): + + swap_out_paths, swap_out_params = self._get_paths(params) + + self._track_numel(params) + + swap_out_tensors(self.aio_write_handle, swap_out_params, swap_out_paths) + + self.pending_writes += len(swap_out_params) + self.swap_out_params += params + + if not async_op: + self.synchronize_writes() + + #blocking swap out followed by releasing the memory buffers + def swap_out_and_release(self, params, async_op=False, force_buffer_release=False): + if async_op: + assert force_buffer_release, "Should not release preallocated buffers without completing the swap out. 
Set force_buffer_release to True to do it anyways" + self._swap_out(params, async_op=async_op) + + #assigns an in memory buffer and swaps in from nvme + def swap_in(self, params, async_op=True, swap_in_buffers=None): + + assert all([param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE for param in params]), "Some params are already available or in flight" + swap_in_paths, _ = self._get_paths(params) + + if swap_in_buffers is None: + if len(self.available_buffer_ids) < len(swap_in_paths): + print_rank_0( + f'Not enough swap in buffers {len(self.available_buffer_ids)} for params {len(swap_in_paths)}', + force=True) + print_rank_0( + f'Num inflight: params {len(self.inflight_params)}, buffers {len(self.inflight_swap_in_buffers)}, numel = {self.inflight_numel}', + force=True) + print_rank_0( + f'Num available: param {len(self.available_params)}, numel = {self.available_numel}', + force=True) + + assert len(swap_in_paths) <= len(self.available_buffer_ids), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" + swap_in_buffers = self._allocate_and_return_buffers_for_swap_in(params) + + swap_in_tensors(self.aio_read_handle, swap_in_buffers, swap_in_paths) + + self.inflight_params.extend(params) + self.inflight_swap_in_buffers.extend(swap_in_buffers) + self.inflight_numel += sum([t.numel() for t in swap_in_buffers]) + + for param in params: + param.ds_tensor.status = PartitionedParamStatus.INFLIGHT + + self.pending_reads += len(params) + + if not async_op: + self.synchronize_reads() + + #assign a buffer to a param and return the buffer + def get_buffer(self, param, numel): + assert numel < self.elements_per_buffer, f"More elements {numel} than buffer size {self.elements_per_buffer}" + param_id = param.ds_id + self.param_id_to_numel[param_id] = numel + buffer_id = self.available_buffer_ids.pop() + self.param_id_to_buffer_id[param_id] = buffer_id + + buffer = self.buffers.narrow(0, + int(buffer_id * self.elements_per_buffer), + self.param_id_to_numel[param_id]) + print_rank_0(f"param {param.ds_id} is assigned swap in buffer id {buffer_id}") + return buffer + + def reserve_available_buffers(self): + buffers = [] + for id in self.available_buffer_ids: + buffers.append( + self.buffers.narrow(0, + int(id * self.elements_per_buffer), + int(self.elements_per_buffer))) + self.reserved_buffer_ids.append(id) + + self.available_buffer_ids = [] + return buffers + + def release_reserved_buffers(self): + for id in self.reserved_buffer_ids: + self.available_buffer_ids.append(id) + + self.reserved_buffer_ids = [] diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py new file mode 100644 index 000000000000..7d0116faab5b --- /dev/null +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -0,0 +1,284 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. + +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import os +import torch + +from deepspeed.utils.logging import logger +from deepspeed.ops.aio import AsyncIOBuilder + +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object, \ + MIN_AIO_BYTES, AIO_ALIGNED_BYTES +from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper +from deepspeed.runtime.swap_tensor.optimizer_utils import SwapBufferManager, get_sized_buffer +from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper + + +class OptimizerSwapOp(object): + def __init__(self, + aio_handle, + read_op, + param_info, + allocated_buffers, + state_buffers, + num_ops): + self.aio_handle = aio_handle + self.read_op = read_op + self.param_info = param_info + self.allocated_buffers = allocated_buffers + self.state_buffers = state_buffers + self.wait_required = True + self.num_ops = num_ops + + def is_parameter(self, parameter): + return id(parameter) == self.param_info.param_id + + def wait(self): + assert self.wait_required + assert self.aio_handle.wait() == self.num_ops + self.wait_required = False + + +SYNC_SWAP_IN = 'sync_swap_in' +ASYNC_SWAP_IN = 'async_swap_in' +SYNC_SWAP_OUT = 'sync_swap_out' +ASYNC_SWAP_OUT = 'async_swap_out' + +SWAP_IN_STATE_TIMER = 'swap_in_state' +SWAP_OUT_STATE_TIMER = 'swap_out_state' +SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' +ASYNC_SWAP_IN_STATE_TIMER = "async_swap_in_state" +ASYNC_SWAP_OUT_STATE_TIMER = 'async_swap_out_state' + + +class PipelinedOptimizerSwapper(OptimizerSwapper): + def __init__(self, + swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers): + super(PipelinedOptimizerSwapper, + self).__init__(swap_config, + aio_config, + base_folder, + optimizer, + largest_numel, + device, + dtype, + timers) + + aio_op = AsyncIOBuilder().load() + self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], + aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], + aio_config[AIO_OVERLAP_EVENTS], + aio_config[AIO_THREAD_COUNT]) + + # Overlap gradient swap out + self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.write_aio_handle, + numel_alignment=self.numel_alignment, + timers=self.timers) + + self.async_swap_in = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_READ] + self.async_swap_out = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_WRITE] + + self.swap_ops = { + SYNC_SWAP_IN: None, + ASYNC_SWAP_IN: None, + SYNC_SWAP_OUT: None, + ASYNC_SWAP_OUT: None + } + + self.print_exclude_list += [ + 'gradient_swapper', + 'read_aio_handle', + 'write_aio_handle', + 'swap_ops', + 'print_exclude_list' + ] + + if torch.distributed.get_rank() == 0: + print_object(obj=self, + name='PipelinedOptimizerSwapper', + exclude_list=self.print_exclude_list) + + def initialize_parameters(self, parameters, src_tensors): + self._initialize_parameters(parameters=parameters, + src_tensors=src_tensors, + aio_handle=self.write_aio_handle) + + def initialize_from_swapped_fp16_params(self, + fp16_partitions_info, + fp16_num_elems, + fp16_pinned_buffers, + fp32_parameters): + self._initialize_from_swapped_fp16_params( + aio_handle=self.write_aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + 
fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) + + def flush_gradients(self): + self._flush_gradient_swapper(self.gradient_swapper) + + def swap_in_optimizer_state(self, parameter, async_parameter): + assert parameter is not None + assert self.swap_ops[SYNC_SWAP_IN] is None + + self._flush_gradient_swapper(self.gradient_swapper) + + self._start_timer(SWAP_IN_STATE_TIMER) + + if self.swap_ops[ASYNC_SWAP_IN]: + assert self.swap_ops[ASYNC_SWAP_IN].is_parameter(parameter) + self.swap_ops[SYNC_SWAP_IN] = self.swap_ops[ASYNC_SWAP_IN] + self.swap_ops[ASYNC_SWAP_IN] = None + else: + self.swap_ops[SYNC_SWAP_IN] = self._swap_in_optimizer_state( + aio_handle=self.read_aio_handle, + parameter=parameter) + + if self.swap_ops[SYNC_SWAP_IN]: + self.swap_ops[SYNC_SWAP_IN].wait() + + if self.async_swap_in and async_parameter is not None: + assert self.swap_ops[ASYNC_SWAP_IN] is None + self.swap_ops[ASYNC_SWAP_IN] = self._swap_in_optimizer_state( + aio_handle=self.read_aio_handle, + parameter=async_parameter) + + self._stop_timer(SWAP_IN_STATE_TIMER) + self.timer_names.add(SWAP_IN_STATE_TIMER) + + def swap_out_optimizer_state(self, parameter, async_swap): + self._start_timer(SWAP_OUT_STATE_TIMER) + + if self.swap_ops[ASYNC_SWAP_OUT]: + self._start_timer(ASYNC_SWAP_OUT_STATE_TIMER) + self._complete_swap_out(ASYNC_SWAP_OUT) + self._stop_timer(ASYNC_SWAP_OUT_STATE_TIMER) + self.timer_names.add(ASYNC_SWAP_OUT_STATE_TIMER) + + assert self.swap_ops[SYNC_SWAP_IN] is not None + assert not self.swap_ops[SYNC_SWAP_IN].wait_required + swap_op = self._swap_out_optimizer_state(aio_handle=self.write_aio_handle, + parameter=parameter, + swap_in_op=self.swap_ops[SYNC_SWAP_IN]) + self.swap_ops[SYNC_SWAP_IN] = None + + if self.async_swap_out and async_swap: + self.swap_ops[ASYNC_SWAP_OUT] = swap_op + else: + self.swap_ops[SYNC_SWAP_OUT] = swap_op + self._complete_swap_out(SYNC_SWAP_OUT) + + self._stop_timer(SWAP_OUT_STATE_TIMER) + self.timer_names.add(SWAP_OUT_STATE_TIMER) + + def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): + self._swap_out_gradients(parameter=parameter, + gradient_offsets=gradient_offsets, + gradient_tensors=gradient_tensors, + gradient_swapper=self.gradient_swapper) + + def _complete_swap_out(self, swap_out_type): + self.swap_ops[swap_out_type].wait() + self.swap_buffer_manager.free(self.swap_ops[swap_out_type].allocated_buffers) + self.swap_ops[swap_out_type] = None + + def _swap_out_optimizer_state(self, aio_handle, parameter, swap_in_op): + assert swap_in_op.is_parameter(parameter) + + allocated_buffers = swap_in_op.allocated_buffers.copy() + swap_buffers = swap_in_op.state_buffers.copy() + + param_info = swap_in_op.param_info + self._update_param_state_info(param_info, parameter) + unpinned_tensors = param_info.get_unpinned_state_tensors() + + if len(unpinned_tensors) > 0: + new_alloc_buffers = self.swap_buffer_manager.allocate( + num_elems=self._io_aligned_numel(param_info.numel()), + count=len(unpinned_tensors), + dtype=param_info.dtype()) + assert new_alloc_buffers is not None + + allocated_buffers += new_alloc_buffers + swap_buffers += new_alloc_buffers + + for pinned_dst, unpinned_src in zip(new_alloc_buffers, unpinned_tensors): + dst = get_sized_buffer(pinned_dst, unpinned_src.numel()) + dst.data.copy_(unpinned_src.data) + + swap_paths = param_info.swap_paths.copy() + assert len(swap_paths) == len(swap_buffers) + + swap_out_tensors(aio_handle, swap_buffers, swap_paths) + + swap_out_op = OptimizerSwapOp(aio_handle=aio_handle, + 
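# Write op: reuses the pinned buffers acquired by the preceding swap-in, plus any buffers allocated above for unpinned state tensors.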
param_info=param_info, + read_op=False, + allocated_buffers=allocated_buffers, + state_buffers=swap_buffers, + num_ops=len(swap_buffers)) + + return swap_out_op + + def _swap_in_optimizer_state(self, aio_handle, parameter): + param_info = self._get_param_swap_info(parameter) + if param_info is None: + return None + + required_buffer_count = len( + param_info.tensors) + (1 if param_info.has_gradients() else 0) + aligned_numel = self._io_aligned_numel(param_info.numel()) + allocated_buffers = self.swap_buffer_manager.allocate( + num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) + assert allocated_buffers is not None, \ + f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing {OFFLOAD_OPTIMIZER_BUFFER_COUNT}" + + state_buffers = allocated_buffers[:len(param_info.tensors)] + param_info.set_swap_buffers(state_buffers) + + swap_buffers = state_buffers.copy() + swap_paths = param_info.swap_paths.copy() + + if param_info.has_gradients(): + parameter.grad = allocated_buffers[-1].narrow(0, 0, param_info.numel()) + if param_info.swapped_gradients: + swap_buffers += param_info.get_swap_gradient_buffers(parameter.grad) + swap_paths += param_info.get_swap_gradient_paths() + + swap_in_tensors(aio_handle, swap_buffers, swap_paths) + + if param_info.unswapped_gradients: + self._retrieve_unswapped_grad_partitions(swap_info=param_info, + dest_buffer=parameter.grad) + + swap_in_op = OptimizerSwapOp(aio_handle=aio_handle, + param_info=param_info, + read_op=True, + allocated_buffers=allocated_buffers, + state_buffers=state_buffers, + num_ops=len(swap_buffers)) + + return swap_in_op diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py new file mode 100644 index 000000000000..a88746fdc739 --- /dev/null +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -0,0 +1,241 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +Licensed under the MIT license. + +Functionality of swapping tensors to/from (NVMe) storage devices. +""" + +import os +import torch +from deepspeed.utils.logging import logger + +from deepspeed.runtime.swap_tensor.constants import AIO_BLOCK_SIZE, AIO_QUEUE_DEPTH, \ + AIO_THREAD_COUNT, AIO_SINGLE_SUBMIT, AIO_OVERLAP_EVENTS + +MIN_AIO_BYTES = 1024**2 +AIO_ALIGNED_BYTES = 1024 + + +def swap_in_tensors(swap_handle, tensor_buffers, swap_paths): + for buffer, path in zip(tensor_buffers, swap_paths): + assert (swap_handle.async_pread(buffer, path) == 0) + + +def swap_out_tensors(swap_handle, tensor_buffers, swap_paths): + for buffer, path in zip(tensor_buffers, swap_paths): + assert (swap_handle.async_pwrite(buffer, path) == 0) + + +def print_object(obj, name, exclude_list=[]): + logger.info('{}:'.format(name)) + for arg in sorted(vars(obj)): + if not arg in exclude_list: + dots = '.' 
* (29 - len(arg)) + logger.info(' {} {} {}'.format(arg, dots, getattr(obj, arg))) + + +class SwapBuffer(object): + def __init__(self, buffer): + self.buffer = buffer + self.reset() + + def reset(self): + self.offset = 0 + self.swap_tensors = {} + self.compute_tensors = {} + self.swap_paths = {} + self.num_elem = 0 + + def insert_tensor(self, tensor, swap_path, aligned_numel): + swap_tensor, compute_tensor = self.allocate_tensor(swap_path, tensor.numel(), aligned_numel) + compute_tensor.data.copy_(tensor.data) + return swap_tensor, compute_tensor + + def allocate_tensor(self, swap_path, numel, aligned_numel): + assert self.has_space(aligned_numel) + assert not self.offset in self.swap_tensors + + allocate_offset = self.offset + swap_tensor = self.buffer.narrow(0, allocate_offset, aligned_numel) + dest_tensor = swap_tensor.narrow(0, 0, numel) + + self.swap_tensors[allocate_offset] = swap_tensor + self.compute_tensors[allocate_offset] = dest_tensor + self.swap_paths[allocate_offset] = swap_path + self.offset += aligned_numel + self.num_elem += numel + + return self.swap_tensors[allocate_offset], self.compute_tensors[allocate_offset] + + def has_space(self, numel): + return (self.offset + numel) <= self.buffer.numel() + + def get_swap_tensors(self): + return [tensor for tensor in self.swap_tensors.values()] + + def get_swap_paths(self): + return [path for path in self.swap_paths.values()] + + def get_compute_tensors(self): + return [tensor for tensor in self.compute_tensors.values()] + + def get_num_elem(self): + return self.num_elem + + def get_swap_tensor(self, offset): + return self.swap_tensors.get(offset, None) + + def get_compute_tensor(self, offset): + return self.compute_tensors.get(offset, None) + + def get_swap_path(self, offset): + return self.swap_paths(offset, None) + + +class SwapBufferPool(object): + def __init__(self, buffers): + assert all([buf.is_pinned() for buf in buffers]) + self.buffers = [SwapBuffer(buf) for buf in buffers] + self.current_index = 0 + + def reset(self): + self.current_index = 0 + for buffer in self.buffers: + buffer.reset() + + def allocate_tensor(self, numel, swap_path, aligned_numel): + if self.has_space(aligned_numel): + swap_tensor, compute_tensor = self._get_current_buffer().allocate_tensor(swap_path, numel, aligned_numel) + return swap_tensor, compute_tensor + + return None, None + + def insert_tensor(self, tensor, swap_path, aligned_numel): + if self.has_space(aligned_numel): + swap_tensor, compute_tensor = self._get_current_buffer().insert_tensor(tensor, swap_path, aligned_numel) + return swap_tensor, compute_tensor + + return None, None + + def get_swap_tensors(self): + swap_tensors = [] + for buffer in self._get_used_buffers(): + swap_tensors += buffer.get_swap_tensors() + + return swap_tensors + + def get_swap_paths(self): + swap_paths = [] + for buffer in self._get_used_buffers(): + swap_paths += buffer.get_swap_paths() + + return swap_paths + + def get_compute_tensors(self): + compute_tensors = [] + for buffer in self._get_used_buffers(): + compute_tensors += buffer.get_compute_tensors() + + return compute_tensors + + def has_space(self, numel): + if self._get_current_buffer().has_space(numel): + return True + + if self.current_index == len(self.buffers) - 1: + return False + + self.current_index += 1 + return self._get_current_buffer().has_space(numel) + + def swap_out(self, aio_handle, async_op=False): + swap_tensors = self.get_swap_tensors() + swap_paths = self.get_swap_paths() + assert all([p is not None for p in swap_paths]) + + 
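# Submit one asynchronous write per buffered tensor; unless async_op is set, block on
# the aio handle until every submitted write has completed.
#
# Illustrative use of the pool (a sketch only; the swap path below is a placeholder):
#
#   pool = SwapBufferPool(pinned_buffers)               # buffers must be pinned
#   pool.insert_tensor(tensor, '/local_nvme/t0.tensor.swp', aligned_numel)
#   pool.swap_out(aio_handle)                           # blocking NVMe write
#   pool.reset()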
swap_out_tensors(aio_handle, swap_tensors, swap_paths) + + if not async_op: + assert len(swap_tensors) == aio_handle.wait() + + def swap_in(self, aio_handle, async_op=False): + swap_tensors = self.get_swap_tensors() + swap_paths = self.get_swap_paths() + assert all([p is not None for p in swap_paths]) + + swap_in_tensors(aio_handle, swap_tensors, swap_paths) + + if not async_op: + assert len(swap_tensors) == aio_handle.wait() + + def _get_current_buffer(self): + return self.buffers[self.current_index] + + def _get_used_buffers(self): + return self.buffers[:self.current_index + 1] + + +class SwapBufferManager(object): + def __init__(self, num_elems, count, dtype): + self.num_elems = num_elems + self.count = count + self.dtype = dtype + self.all_buffers = [ + torch.zeros(num_elems, + device='cpu', + dtype=dtype).pin_memory() for _ in range(count) + ] + self.free_buffer_index = [i for i in range(count)] + self.used_buffer_index = {} + self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024 + **3) + + if torch.distributed.get_rank() == 0: + exclude_list = ['all_buffers'] + print_object(obj=self, name='SwapBufferManager', exclude_list=exclude_list) + + def allocate(self, num_elems, count, dtype): + assert dtype == self.dtype + assert num_elems <= self.num_elems + if count > len(self.free_buffer_index): + return None + + used_indices = self.free_buffer_index[-count:] + self.free_buffer_index = self.free_buffer_index[:-count] + + buffers = [] + for i in used_indices: + tmp_buffer = self.all_buffers[i].narrow(0, 0, num_elems) + buffers.append(tmp_buffer) + self.used_buffer_index[id(tmp_buffer)] = i + return buffers + + def allocate_all(self, num_elems, dtype): + return self.allocate(num_elems=num_elems, + count=len(self.free_buffer_index), + dtype=dtype) + + def free(self, buffers): + buffer_ids = [] + for buf in buffers: + buffer_ids.append(id(buf)) + + assert all([b_id in self.used_buffer_index for b_id in buffer_ids]) + + for b_id in buffer_ids: + self.free_buffer_index.append(self.used_buffer_index[b_id]) + del (self.used_buffer_index[b_id]) + + +def get_sized_buffer(buffer, num_elems): + assert num_elems <= buffer.numel(), \ + f'num_elems {num_elems}> buffer {buffer.numel()}' + return buffer.narrow(0, 0, num_elems) if num_elems < buffer.numel() else buffer + + +def get_sized_buffers(buffer_list, num_elems_list): + swap_buffers = [ + get_sized_buffer(buffer, num_elems) \ + for buffer, num_elems in zip(buffer_list, num_elems_list) + ] + return swap_buffers diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index bb565b5cb87f..35df3422304e 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -64,11 +64,15 @@ def move_to_device(item, device): class CheckOverflow(object): '''Checks for overflow in gradient across parallel process''' - - def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False): + def __init__(self, + param_groups=None, + mpu=None, + zero_reduce_scatter=False, + deepspeed=None): self.mpu = mpu self.params = [] if param_groups else None self.zero_reduce_scatter = zero_reduce_scatter + self.deepspeed = deepspeed if param_groups: for group in param_groups: for param in group: @@ -126,9 +130,24 @@ def has_overflow(self, params): op=torch.distributed.ReduceOp.MAX, group=torch.distributed.group.WORLD) elif self.mpu is not None: + if self.deepspeed is not None: + using_pipeline = hasattr(self.deepspeed, + 'pipeline_enable_backward_allreduce') + if (using_pipeline + and 
self.deepspeed.pipeline_enable_backward_allreduce is False + ) or (not using_pipeline + and self.deepspeed.enable_backward_allreduce is False): + torch.distributed.all_reduce( + overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=self.mpu.get_data_parallel_group()) torch.distributed.all_reduce(overflow_gpu, op=torch.distributed.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) + elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False: + torch.distributed.all_reduce(overflow_gpu, + op=torch.distributed.ReduceOp.MAX, + group=torch.distributed.group.WORLD) overflow = overflow_gpu[0].item() return bool(overflow) diff --git a/deepspeed/runtime/zero/__init__.py b/deepspeed/runtime/zero/__init__.py index d521573e1a77..bd881c8ec69f 100644 --- a/deepspeed/runtime/zero/__init__.py +++ b/deepspeed/runtime/zero/__init__.py @@ -1,5 +1,13 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +""" + from .partition_parameters import ZeroParamType from .partition_parameters import ZeroParamStatus from .partition_parameters import Init from .partition_parameters import GatheredParameters from .partition_parameters import register_external_parameter + +from .tiling import TiledLinear +from .tiling import TiledLinearReturnBias diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 622ffa9ba1cb..6bae0f1fbcd0 100755 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -5,7 +5,10 @@ from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject from deepspeed.utils import logger -from deepspeed.runtime.zero.constants import * +from .constants import * +from .offload_constants import * +from .offload_config import get_offload_param_config, get_default_offload_param_config, \ + get_offload_optimizer_config, get_default_offload_optimizer_config class DeepSpeedZeroConfig(DeepSpeedConfigObject): @@ -24,9 +27,8 @@ def __init__(self, param_dict): self.elastic_checkpoint = None #Offload Specific Parameters - self.cpu_offload = None - self.cpu_offload_params = None - self.cpu_offload_use_pin_memory = None + self.offload_param = None + self.offload_optimizer = None self.sub_group_size = None #Stage3 Specific Parameters @@ -36,12 +38,6 @@ def __init__(self, param_dict): self.max_reuse_distance = None self.gather_fp16_weights_on_model_save = None - #Stage3 Specific Parameters - self.prefetch_bucket_size = None - self.param_persistence_threshold = None - self.max_live_parameters = None - self.max_reuse_distance = None - if ZERO_OPTIMIZATION in param_dict.keys(): zero_config_dict = param_dict[ZERO_OPTIMIZATION] if type(zero_config_dict) is bool: @@ -66,7 +62,24 @@ def read_zero_config_deprecated(self, param_dict): .format(ZERO_FORMAT)) return zero_config_dict + def _sanity_check(self, zero_config_dict): + deprecated_dict = { + ZERO_OPTIMIZATION_CPU_OFFLOAD: + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS: + ZERO_OPTIMIZATION_OFFLOAD_PARAM, + ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY: + f'{ZERO_OPTIMIZATION_OFFLOAD_PARAM} or {ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER}' + } + + for old_key, new_key in deprecated_dict.items(): + if old_key in zero_config_dict: + logger.warning( + f'DeepSpeedConfig: {old_key} is deprecated. 
Please use {new_key}.') + def _initialize(self, zero_config_dict): + self._sanity_check(zero_config_dict) + self.stage = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_STAGE, ZERO_OPTIMIZATION_STAGE_DEFAULT) @@ -109,24 +122,30 @@ def _initialize(self, zero_config_dict): ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT) - self.cpu_offload = get_scalar_param(zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD, - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) - self.elastic_checkpoint = get_scalar_param( zero_config_dict, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT) - self.cpu_offload_params = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT) - - self.cpu_offload_use_pin_memory = get_scalar_param( - zero_config_dict, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY_DEFAULT) + if ZERO_OPTIMIZATION_CPU_OFFLOAD in zero_config_dict: + cpu_offload_optimizer = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD, + ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT) + if cpu_offload_optimizer: + self.offload_optimizer = get_default_offload_optimizer_config() + else: + self.offload_optimizer = get_offload_optimizer_config(zero_config_dict) + + if ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS in zero_config_dict: + cpu_offload_params = get_scalar_param( + zero_config_dict, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS, + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT) + if cpu_offload_params: + self.offload_param = get_default_offload_param_config() + else: + self.offload_param = get_offload_param_config(zero_config_dict) self.sub_group_size = get_scalar_param(zero_config_dict, ZERO_OPTIMIZATION_SUB_GROUP_SIZE, diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py index e5812980a337..9cfe313c75b0 100755 --- a/deepspeed/runtime/zero/constants.py +++ b/deepspeed/runtime/zero/constants.py @@ -3,6 +3,8 @@ Licensed under the MIT license. 
""" +from .offload_constants import * + ######################################### # ZeRO optimization ######################################### @@ -22,10 +24,12 @@ "overlap_comm": [true|false], "reduce_bucket_size": 500000000, "load_from_fp32_weights": [true|false], - "cpu_offload": [true|false], - "cpu_offload_params" : [true|false], - "cpu_offload_use_pin_memory": [true|false], - "sub_group_size" : 1000000000000 + "cpu_offload": [true|false] (deprecated), + "cpu_offload_params" : [true|false] (deprecated), + "cpu_offload_use_pin_memory": [true|false] (deprecated), + "sub_group_size" : 1000000000000, + "offload_param": {...}, + "offload_optimizer": {...} } } ''' @@ -48,7 +52,7 @@ ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' -ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True +ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = False ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False @@ -67,18 +71,24 @@ ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True -ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' -ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False - ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT = 'elastic_checkpoint' ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT = True +ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' +ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False + ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS = 'cpu_offload_params' ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT = False ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY = 'cpu_offload_use_pin_memory' ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY_DEFAULT = False +ZERO_OPTIMIZATION_OFFLOAD_PARAM = OFFLOAD_PARAM +ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT = None + +ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER = OFFLOAD_OPTIMIZER +ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT = None + ZERO_OPTIMIZATION_SUB_GROUP_SIZE = 'sub_group_size' ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT = 1000000000000 @@ -118,16 +128,12 @@ ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD: - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT, ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT: ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD: - ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS: - ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT, - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY: - ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY, + ZERO_OPTIMIZATION_OFFLOAD_PARAM: + ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT, + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER: + ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT, ZERO_OPTIMIZATION_SUB_GROUP_SIZE: ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT, ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS: diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index f29fcda2bb19..fb65673bd9b4 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -21,6 +21,11 @@ tensor_map = {} +def print_rank_0(message, debug=False, force=False): + if torch.distributed.get_rank() == 0 and (debug or force): + print(message) + + class LinearFunctionForZeroStage3(torch.autograd.Function): # Note that both forward and backward are @staticmethods @@ -46,6 +51,7 @@ def forward(ctx, input, weight, bias=None): if bias is not None: output += bias ret = output + return ret # This function has only a single output, so it gets only one gradient @@ 
-77,10 +83,10 @@ def backward(ctx, grad_output): #print("Computing grad weight") dim = grad_output.dim() if dim > 2: - grad_weight = grad_output.view(-1, - grad_output.shape[-1]).t().matmul( - input.view(-1, - input.shape[-1])) + grad_weight = grad_output.reshape(-1, + grad_output.shape[-1]).t().matmul( + input.reshape(-1, + input.shape[-1])) else: grad_weight = grad_output.t().matmul(input) #print(f"Computed grad weight grad_weight {grad_weight.shape}") diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py new file mode 100644 index 000000000000..eaf3f13e2819 --- /dev/null +++ b/deepspeed/runtime/zero/offload_config.py @@ -0,0 +1,63 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +''' + +from deepspeed.runtime.config_utils import get_scalar_param +from .offload_constants import * + +OFFLOAD_PARAM_KEY_DEFAULT_DICT = { + OFFLOAD_PARAM_DEVICE: OFFLOAD_PARAM_DEVICE_DEFAULT, + OFFLOAD_PARAM_NVME_PATH: OFFLOAD_PARAM_NVME_PATH_DEFAULT, + OFFLOAD_PARAM_BUFFER_COUNT: OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT, + OFFLOAD_PARAM_BUFFER_SIZE: OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT, + OFFLOAD_PARAM_MAX_IN_CPU: OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT, + OFFLOAD_PARAM_PIN_MEMORY: OFFLOAD_PARAM_PIN_MEMORY_DEFAULT +} + +OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT = { + OFFLOAD_OPTIMIZER_DEVICE: OFFLOAD_OPTIMIZER_DEVICE_DEFAULT, + OFFLOAD_OPTIMIZER_NVME_PATH: OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT, + OFFLOAD_OPTIMIZER_BUFFER_COUNT: OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT, + OFFLOAD_OPTIMIZER_PIN_MEMORY: OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT, + OFFLOAD_OPTIMIZER_PIPELINE_READ: OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT, + OFFLOAD_OPTIMIZER_PIPELINE_WRITE: OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT, + OFFLOAD_OPTIMIZER_FAST_INIT: OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT +} + + +def _get_offload_config(param_dict, key_default_dict): + offload_config = {} + for key, default_value in key_default_dict.items(): + offload_config[key] = get_scalar_param(param_dict, key, default_value) + + return offload_config + + +def get_offload_param_config(param_dict): + if OFFLOAD_PARAM in param_dict and param_dict[OFFLOAD_PARAM] is not None: + return _get_offload_config(param_dict=param_dict[OFFLOAD_PARAM], + key_default_dict=OFFLOAD_PARAM_KEY_DEFAULT_DICT) + + return None + + +def get_default_offload_param_config(): + return OFFLOAD_PARAM_KEY_DEFAULT_DICT + + +def get_offload_optimizer_config(param_dict): + if OFFLOAD_OPTIMIZER in param_dict and param_dict[OFFLOAD_OPTIMIZER] is not None: + offload_config = _get_offload_config( + param_dict=param_dict[OFFLOAD_OPTIMIZER], + key_default_dict=OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT) + offload_config[OFFLOAD_OPTIMIZER_PIPELINE] = offload_config[ + OFFLOAD_OPTIMIZER_PIPELINE_READ] or offload_config[ + OFFLOAD_OPTIMIZER_PIPELINE_WRITE] + return offload_config + + return None + + +def get_default_offload_optimizer_config(): + return OFFLOAD_OPTIMIZER_KEY_DEFAULT_DICT diff --git a/deepspeed/runtime/zero/offload_constants.py b/deepspeed/runtime/zero/offload_constants.py new file mode 100644 index 000000000000..3ba71df4a807 --- /dev/null +++ b/deepspeed/runtime/zero/offload_constants.py @@ -0,0 +1,67 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+""" +######################################### +# TENSOR OFFLOADING +######################################### +OFFLOAD_CPU_DEVICE = "cpu" +OFFLOAD_NVME_DEVICE = "nvme" + +######################################### +# PARAM TENSOR OFFLOADING +######################################### +OFFLOAD_PARAM_FORMAT = ''' +"offload_param": { + "device": [cpu|nvme], + "nvme_path": "/local_nvme", + "buffer_count": 5, + "buffer_size": 1e8, + "max_in_cpu": 1e9, + "pin_memory": [true|false] +} +''' +OFFLOAD_PARAM = "offload_param" +OFFLOAD_PARAM_DEVICE = "device" +OFFLOAD_PARAM_DEVICE_DEFAULT = OFFLOAD_CPU_DEVICE +OFFLOAD_PARAM_NVME_PATH = "nvme_path" +OFFLOAD_PARAM_NVME_PATH_DEFAULT = None +OFFLOAD_PARAM_BUFFER_COUNT = "buffer_count" +OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT = 5 +OFFLOAD_PARAM_BUFFER_SIZE = "buffer_size" +OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT = 1e8 +OFFLOAD_PARAM_MAX_IN_CPU = "max_in_cpu" +OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT = 1e9 +OFFLOAD_PARAM_PIN_MEMORY = "pin_memory" +OFFLOAD_PARAM_PIN_MEMORY_DEFAULT = False + +######################################### +# OPTIMIZER TENSOR OFFLOADING +######################################### +OFFLOAD_OPTIMIZER_FORMAT = ''' +"offload_optimizer": { + "device": [cpu|nvme], + "nvme_path": "/local_nvme", + "buffer_count": 4, + "pin_memory": [true|false], + "pipeline_read": false, + "pipeline_write": false, + "fast_init": false +} +''' +OFFLOAD_OPTIMIZER = "offload_optimizer" +OFFLOAD_OPTIMIZER_DEVICE = "device" +OFFLOAD_OPTIMIZER_DEVICE_DEFAULT = OFFLOAD_CPU_DEVICE +OFFLOAD_OPTIMIZER_NVME_PATH = "nvme_path" +OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT = None +OFFLOAD_OPTIMIZER_BUFFER_COUNT = "buffer_count" +OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT = 4 +OFFLOAD_OPTIMIZER_PIN_MEMORY = "pin_memory" +OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE_READ = "pipeline_read" +OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE_WRITE = "pipeline_write" +OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT = False +OFFLOAD_OPTIMIZER_PIPELINE = "pipeline" +OFFLOAD_OPTIMIZER_FAST_INIT = "fast_init" +OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT = False diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 4465adfd7c16..e831911efd62 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1,3 +1,8 @@ +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. 
+""" + import os import time import types @@ -8,11 +13,17 @@ import torch from torch.distributed.distributed_c10d import _get_global_rank -from deepspeed.runtime.zero.linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3 -from deepspeed.runtime.utils import see_memory_usage +from .linear import LinearModuleForZeroStage3, LinearFunctionForZeroStage3 +from .offload_constants import * + +from ..utils import see_memory_usage from deepspeed.utils import log_dist, init_distributed +from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus +from ..config import DeepSpeedConfig + param_count = 0 +partitioned_param_data_shape = [1] def print_rank_0(message, debug=False, force=False): @@ -21,6 +32,8 @@ def print_rank_0(message, debug=False, force=False): def is_zero_param(parameter): + if not torch.is_tensor(parameter): + return False return hasattr(parameter, 'ds_id') @@ -29,8 +42,6 @@ def _init_external_params(module): module._external_params = {} def external_parameters(self): - if not hasattr(self, '_external_params'): - self._external_params = {} return self._external_params.items() def all_parameters(self): @@ -94,6 +105,28 @@ def forward(self, input): module._external_params[key] = parameter +def unregister_external_parameter(module, parameter): + """Reverses the effects of :meth:`register_external_parameter`. + + Args: + module (``torch.nn.Module``): The module to affect. + parameter (``torch.nn.Parameter``): The parameter to unregister. + + Raises: + RuntimeError: If ``parameter`` is not of type ``torch.nn.Parameter``. + RuntimeError: If ``parameter`` is not a registered external parameter of ``module``. + """ + if not isinstance(parameter, torch.nn.Parameter): + raise RuntimeError('Parameter is not a torch.nn.Parameter') + + if not hasattr(module, + '_external_params') or id(parameter) not in module._external_params: + raise RuntimeError('Parameter is not a registered external parameter of module.') + + key = id(parameter) + del module._external_params[key] + + class ZeroParamType(Enum): # same as regular pytorch parameters @@ -190,6 +223,9 @@ def _init_subclass(cls, **kwargs): torch.empty = empty_cuda_tensor if self.mem_efficient_linear: + print_rank_0( + "nn.functional.linear has been overridden with a more memory efficient version. This will persist unless manually reset.", + force=True) self.linear_bk = torch.nn.functional.linear torch.nn.functional.linear = LinearFunctionForZeroStage3.apply @@ -210,8 +246,11 @@ def _disable_class(cls): torch.Tensor.__new__ = torch.Tensor.__old_new__ torch.empty = _orig_torch_empty - if self.mem_efficient_linear: - torch.nn.functional.linear = self.linear_bk + #un doing it here will undo it during training + #if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk + # if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk # Now that we cleaned up the metaclass injection, raise the exception. if exc_type is not None: @@ -232,6 +271,8 @@ def __init__(self, mem_efficient_linear=True, remote_device=None, pin_memory=False, + deepspeed_config=None, + param_dict=None, enabled=True): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the @@ -245,14 +286,17 @@ def __init__(self, mem_efficient_linear (bool, optional): Replace torch.nn.functional.linear with an implementation that allows DeepSpeed to partition parameters. Defaults to ``True``. 
- remote_device (string, optional): The device to store model - weights. Passing ``"cpu"`` will create the model in CPU - memory. The model may still be moved to GPU if - ``cpu_offload_param`` is ``False`` in the config provided to - :meth:`deepspeed.initialize`. Defaults to the local GPU. + remote_device (string, optional): The initial device to store model + weights e.g., ``cpu``, ``nvme``. Passing ``"cpu"`` will create the model in CPU + memory. The model may still be moved to GPU based on the + offload settings for training. Defaults to the local GPU. pin_memory (bool, optional): Potentially increase performance by using pinned memory for model weights. ``remote_device`` must be ``"cpu"``. Defaults to ``False``. + deepspeed_config (``json file``, optional): If provided, provides configuration + for swapping fp16 params to NVMe. + param_dict (dict, optional): Instead of requiring a deepspeed_config you can pass your deepspeed config + as a dictionary instead for swapping fp16 params to NVMe. enabled (bool, optional): If ``False``, this context has no effect. Defaults to ``True``. @@ -260,15 +304,15 @@ def __init__(self, are too large to allocate in their entirety in CPU memory. It has the following effects: - #. allocates tensors to either GPU or CPU memory + #. allocates tensors to either GPU or CPU memory or NVMe #. converts floating point tensors to half precision #. immediately partitions tensors among the group of data-parallel devices #. (*optional*) replaces ``torch.nn.functional.linear`` with a more memory-efficient implementation These modifications allow for models that exceed the size of local CPU/GPU - memory, but fit within the total system memory (*i.e.*, aggregate CPU - or GPU memory) across all nodes. Consider initializing a model with one + memory/NVMe, but fit within the total NVMe capacity (*i.e.*, aggregate CPU + or GPU memory or NVMe) across all nodes. Consider initializing a model with one trillion parameters, whose weights occupy two terabytes (TB) in half precision. The initial CPU allocation in full precision requires 4TB of memory *per process*, and so a system with 8 GPUs per node would need 32TB of @@ -298,7 +342,6 @@ def get_model(): .. note:: Only applicable to training with ZeRO-3. - Examples -------- @@ -343,10 +386,20 @@ def get_model(): #It is the device where parameters are fully instantiated using allgather self.local_device = torch.device('cuda:{}'.format(os.environ["LOCAL_RANK"])) + self._validate_remote_device(remote_device, deepspeed_config, param_dict) + #Remote device is the device where parameter partiitons are stored - #It can be same as local_device or it could be CPU. + #It can be same as local_device or it could be CPU or NVMe. self.remote_device = self.local_device if remote_device is None else remote_device - self.pin_memory = pin_memory if (self.remote_device == 'cpu') else False + self.pin_memory = pin_memory if ( + self.remote_device == OFFLOAD_CPU_DEVICE) else False + + # Enable fp16 param swapping to NVMe + if self.remote_device == OFFLOAD_NVME_DEVICE: + _ds_config = DeepSpeedConfig(deepspeed_config, param_dict=param_dict) + self.param_swapper = AsyncPartitionedParameterSwapper(_ds_config) + else: + self.param_swapper = None # If we are provided an already-allocated module to prepare. 
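        # ------------------------------------------------------------------
        # Editorial sketch, not part of this patch: how the new
        # deepspeed_config/param_dict arguments combine with
        # remote_device="nvme". _validate_remote_device below requires an
        # "offload_param" section with a non-None "nvme_path" whenever the
        # remote device is nvme. All config values, the "/local_nvme" path,
        # and MyLargeModel are illustrative placeholders:
        #
        #   import deepspeed
        #
        #   ds_config_dict = {
        #       "train_batch_size": 8,
        #       "zero_optimization": {
        #           "stage": 3,
        #           "offload_param": {
        #               "device": "nvme",
        #               "nvme_path": "/local_nvme",
        #               "pin_memory": True
        #           }
        #       }
        #   }
        #
        #   with deepspeed.zero.Init(remote_device="nvme",
        #                            param_dict=ds_config_dict):
        #       model = MyLargeModel()  # params partitioned; fp16 shards swappable to NVMe
        # ------------------------------------------------------------------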
if module is not None: @@ -357,6 +410,23 @@ def get_model(): self._convert_to_deepspeed_param(param) param.partition() + def _validate_remote_device(self, remote_device, ds_config, param_dict): + if ds_config is not None: + _ds_config = DeepSpeedConfig(ds_config, param_dict=param_dict) + if remote_device in [None, OFFLOAD_CPU_DEVICE]: + if _ds_config.zero_config.offload_param is not None: + offload_param_device = _ds_config.zero_config.offload_param[ + OFFLOAD_PARAM_DEVICE] + assert offload_param_device != OFFLOAD_NVME_DEVICE, \ + f"{OFFLOAD_PARAM_DEVICE} in DeepSpeed Config cannot be {offload_param_device} if remote device is {remote_device}." + + if remote_device == OFFLOAD_NVME_DEVICE: + assert _ds_config.zero_config.offload_param is not None, \ + f'{OFFLOAD_PARAM} must be defined in DeepSpeed Config if remote device is {OFFLOAD_NVME_DEVICE}.' + + assert _ds_config.zero_config.offload_param[OFFLOAD_PARAM_NVME_PATH] is not None, \ + f'{OFFLOAD_PARAM_NVME_PATH} in DeepSpeed Config cannot be None if remote device is {OFFLOAD_NVME_DEVICE}' + def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False) @@ -388,10 +458,10 @@ def _convert_to_deepspeed_param(self, param): # Stores the shape of the original tensor param.ds_shape = param.shape - # Stores the number of elements in the original parmaeter without padding + # Stores the number of elements in the original parameter without padding param.ds_numel = param.numel() - # Stores the paritioned copy of the tensor + # Stores the partitioned copy of the tensor param.ds_tensor = None # Keeps track of how many active sub-modules need this param at any given point in time @@ -404,6 +474,10 @@ def _convert_to_deepspeed_param(self, param): # The group that the parameter is scattered across. 
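        # ------------------------------------------------------------------
        # Editorial sketch, not part of this patch: the attributes and helpers
        # attached to each parameter in this method (ds_id, ds_numel, ds_tensor,
        # padding_size(), and the partitioned_size() helper added by this patch)
        # can be inspected on any model constructed under a zero.Init context.
        # `model` is a placeholder; is_zero_param is the helper defined earlier
        # in this module:
        #
        #   for name, p in model.named_parameters():
        #       if is_zero_param(p):
        #           print(name, p.ds_id, p.ds_numel,
        #                 p.partitioned_size(), p.padding_size())
        # ------------------------------------------------------------------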
param.ds_process_group = self.ds_process_group + # This is set to the Async Param swapper if remote device is nvme + # else this is set to None + param.nvme_swapper = self.param_swapper + # DeepSped Param ID param.ds_id = Init.param_id Init.param_id += 1 @@ -454,6 +528,9 @@ def aligned_size(): def padding_size(): return self._padding_size(param) + def partitioned_size(): + return self._partitioned_size(param) + # Collectives for gathering and partitioning parameters param.all_gather = all_gather param.partition = partition @@ -465,6 +542,7 @@ def padding_size(): # Partitioning size utilities param.aligned_size = aligned_size param.padding_size = padding_size + param.partitioned_size = partitioned_size def _aligned_size(self, param): return param.ds_numel + self._padding_size(param) @@ -473,7 +551,29 @@ def _padding_size(self, param): remainder = param.ds_numel % self.world_size return (self.world_size - remainder) if remainder else 0 + def _partitioned_size(self, param): + return param.ds_tensor.ds_numel + + def _ensure_availability_of_partitioned_params(self, params): + swap_in_list = [] + swap_in_flight = [] + for param in params: + if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: + assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + swap_in_list.append(param) + if param.ds_tensor.status == PartitionedParamStatus.INFLIGHT: + assert param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE and param.ds_status == ZeroParamStatus.NOT_AVAILABLE + swap_in_flight.append(param) + if len(swap_in_list) > 0: + swap_in_list[0].nvme_swapper.swap_in(swap_in_list, async_op=False) + elif len(swap_in_flight) > 0: + swap_in_flight[0].nvme_swapper.synchronize_reads() + def _all_gather(self, param_list, async_op=False, hierarchy=None): + + #fetches from nvme if the partition is not available and in nvme + self._ensure_availability_of_partitioned_params(param_list) + handles = [] all_gather_list = [] for param in param_list: @@ -507,8 +607,9 @@ def _partition(self, param_list, force=False, has_been_updated=False): #print_rank_0(f"After Partitioning Param {param.ds_id}") # self._param_status(param) - def _partition_param(self, param, has_been_updated=False): + def _partition_param(self, param, buffer=None, has_been_updated=False): assert param.ds_status is not ZeroParamStatus.INFLIGHT, f" {param} Cannot parititon a param in flight" + global reuse_buffers #print_rank_0(f"Param id {param.ds_id} status is {param.ds_status}") if param.ds_status is ZeroParamStatus.AVAILABLE: @@ -530,22 +631,55 @@ def _partition_param(self, param, has_been_updated=False): #param.data = param.ds_tensor.data + see_memory_usage( + f'Before partitioning param {param.ds_id} {param.shape}', + force=False) #param.data does not store anything meaningful in partitioned state - param.data = torch.ones(1).half().to(param.device) + param.data = torch.ones(partitioned_param_data_shape).half().to( + param.device) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', + force=False) + + if param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + print_rank_0( + f"Param {param.ds_id} partition released since it exists in nvme", + force=False) + param.nvme_swapper.remove_partition_and_release_buffers([param]) + return tensor_size = self._aligned_size(param) partition_size = tensor_size // self.world_size if param.ds_tensor is None: - partitioned_tensor = torch.zeros(partition_size, - dtype=param.dtype, - device=self.remote_device) - 
partitioned_tensor.requires_grad = False - if self.pin_memory: - partitioned_tensor = partitioned_tensor.pin_memory() + final_location = None + if self.remote_device == OFFLOAD_NVME_DEVICE and self.param_swapper.swappable_tensor( + numel=partition_size): + final_location = OFFLOAD_NVME_DEVICE + buffer = self.param_swapper.get_buffer(param, partition_size) + partitioned_tensor = torch.zeros(1, + dtype=param.dtype, + device=buffer.device) + partitioned_tensor.data = buffer.data + print_rank_0( + f"ID {param.ds_id} Initializing partition for the first time for nvme offload." + ) + else: + partitioned_tensor = torch.zeros( + partition_size, + dtype=param.dtype, + device=OFFLOAD_CPU_DEVICE + if self.remote_device == OFFLOAD_NVME_DEVICE else + self.remote_device) + if self.pin_memory: + partitioned_tensor = partitioned_tensor.pin_memory() + + partitioned_tensor.requires_grad = False param.ds_tensor = partitioned_tensor + param.ds_tensor.ds_numel = partition_size + param.ds_tensor.status = PartitionedParamStatus.AVAILABLE + param.ds_tensor.final_location = final_location start = partition_size * self.rank end = start + partition_size @@ -580,7 +714,20 @@ def _partition_param(self, param, has_been_updated=False): #param.data = param.ds_tensor.data #param.data does not store anything meaningful in partitioned state - param.data = torch.ones(1).half().to(param.device) + + see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', + force=False) + param.data = torch.ones(partitioned_param_data_shape).half().to(param.device) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', + force=False) + + if param.ds_tensor.final_location == OFFLOAD_NVME_DEVICE: + self.param_swapper.swap_out_and_release([param]) + print_rank_0( + f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") + see_memory_usage( + f"ID {param.ds_id} Offloaded to nvme offload and buffers released.", + force=False) print_rank_0( f"ID {param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}" @@ -598,7 +745,7 @@ def _param_status(self, param): def _allgather_param(self, param, async_op=False, hierarchy=0): - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel tensor_size = partition_size * self.world_size aligned_param_size = self._aligned_size(param) @@ -607,9 +754,16 @@ def _allgather_param(self, param, async_op=False, hierarchy=0): print_rank_0( f"{'--'* hierarchy}---- Before allocating Allgather param with id {param.ds_id} and status {param.ds_status} Partition Size {partition_size} and data shape {param.ds_shape}" ) + + see_memory_usage( + f'Before allocate allgather param {param.ds_id} {param.ds_status} {aligned_param_size} {partition_size} {param.ds_shape}', + force=False) flat_tensor = torch.zeros(aligned_param_size, dtype=param.dtype, device=param.device).view(-1) + see_memory_usage( + f'After allocate allgather param {param.ds_id} {param.ds_status} {aligned_param_size} {partition_size} {param.ds_shape}', + force=False) torch.cuda.synchronize() @@ -642,7 +796,7 @@ def _allgather_params(self, param_list, hierarchy=0): if len(param_list) == 0: return - partition_size = sum([param.ds_tensor.numel() for param in param_list]) + partition_size = sum([param.ds_tensor.ds_numel for param in param_list]) tensor_size = partition_size * self.world_size flat_tensor = torch.empty(tensor_size, @@ -658,7 +812,7 @@ def _allgather_params(self, param_list, hierarchy=0): if i == self.rank: offset = 0 for param in param_list: - param_numel = 
param.ds_tensor.numel() + param_numel = param.ds_tensor.ds_numel partitions[i].narrow(0, offset, @@ -673,9 +827,7 @@ def _allgather_params(self, param_list, hierarchy=0): param_offset = 0 for param in param_list: - - param_partition_size = param.ds_tensor.numel() - + param_partition_size = param.ds_tensor.ds_numel param_size = param.ds_numel replicated_tensor = torch.empty(param.ds_shape, dtype=param.dtype, @@ -696,7 +848,7 @@ def _allgather_params(self, param_list, hierarchy=0): param_start, numel_to_copy).copy_(part_to_copy) #param_offset += param.data.numel() - param_offset += param.ds_tensor.numel() + param_offset += param.ds_tensor.ds_numel param.data = replicated_tensor.data @@ -720,7 +872,7 @@ def _reduce_scatter_gradients(self, param_list): # some ranks may have partitions that are padded to go beyond the grad size. # For these ranks the output of reduce scatter is a separate buffer and needs # to be copied in - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel start = self.rank * partition_size end = start + partition_size #print_rank_0("REduce scatter was executed for praam {param.ds_id}") @@ -735,7 +887,7 @@ def _reduce_scatter_gradients(self, param_list): def _reduce_scatter_gradient(self, param): - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel #output = torch.empty(partition_size, dtype=param.dtype, device=param.device) total_size = partition_size * self.world_size @@ -787,10 +939,10 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): # param.grad=None # param.grad.test() print_rank_0( - f"Partitioning param {id(param)} gradient of size {param.grad.numel()} type {param.grad.dtype} part_size {param.ds_tensor.numel()}" + f"Partitioning param {param.ds_id} gradient of size {param.grad.numel()} type {param.grad.dtype} part_size {param.ds_tensor.ds_numel}" ) see_memory_usage("Before partitioning gradients", force=False) - partition_size = param.ds_tensor.numel() + partition_size = param.ds_tensor.ds_numel if partition_buffer is None: assert not accumulate, "No buffer to accumulate to" @@ -804,17 +956,12 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): start = partition_size * rank end = start + partition_size - dest_tensor = partition_buffer.view(-1).narrow(0, 0, partition_size) + dest_tensor_full_buffer = partition_buffer.view(-1).narrow(0, 0, partition_size) #print("before partition gradients") if start < param.ds_numel: elements = min(param.ds_numel - start, partition_size) - dest_tensor_full_buffer = partition_buffer.view(-1).narrow( - 0, - 0, - partition_size) - dest_tensor = dest_tensor_full_buffer.narrow(0, 0, elements) src_tensor = param.grad.view(-1).narrow(0, start, elements) @@ -853,19 +1000,20 @@ def _partition_gradient(self, param, partition_buffer=None, accumulate=False): class GatheredParameters: - def __init__(self, param, modifier_rank=None, fwd_module=None, enabled=True): - """A context that collects a parameter that was partitioned via a - :class:`deepspeed.zero.Init` context. The parameter is partitioned + def __init__(self, params, modifier_rank=None, fwd_module=None, enabled=True): + """A context that collects parameters that were partitioned via a + :class:`deepspeed.zero.Init` context. The parameters are partitioned again upon exit. Args: - param (``torch.nn.Parameter``): The parameter to collect. + params (``torch.nn.Parameter``): A single parameter or a list of parameters to collect. 
+ It's assumed that all parameters are zero params. modifier_rank (int, optional): If specified, this rank's parameter will be - broadcasted after the context. This argument is required if ``param`` is - modified all processes should have a consistent view of the data. Defaults + broadcasted on exit from the context. This argument is required if ``params`` are + modified, so that all processes have a consistent view of the data. Defaults to ``None``. - fwd_module (``torch.nn.Module``, optional): If specified, ``param`` will be - registered as an external parameter of ``fwd_module``. See :meth:`deepspeed.zero.register_external_parameter`. + fwd_module (``torch.nn.Module``, optional): If specified, ``params`` will be + registered as external parameters of ``fwd_module``. See :meth:`deepspeed.zero.register_external_parameter`. enabled (bool, optional): If ``False``, this context is a no-op. Defaults to ``True``. Examples @@ -884,6 +1032,10 @@ def __init__(self, param, modifier_rank=None, fwd_module=None, enabled=True): if torch.distributed.get_rank() == 0: linear.weight.zero_() + with deepspeed.zero.GatheredParameters(linear.weight, + modifier_rank=0): + if torch.distributed.get_rank() == 0: + linear.weight.zero_() #. Collect a partitioned weight to pass to another module during training. The parameter will be registered as an external parameter @@ -900,41 +1052,79 @@ def forward(self, input): fwd_module=self): y = self.layer2(x, self.layer1.weight) return y + + + #. Pretrained model loading + + .. code-block:: python + + with deepspeed.zero.Init(): + model = MyModel() + + state_dict = torch.load(model_path, map_location="cpu") + + def load(module: nn.Module, prefix=""): + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(state_dict, prefix) + + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + load(model, prefix="") + + If this approach is not used, then the full model will first get copied to each GPU. For models + bigger than the memory of a single gpu this method is required. """ self.enabled = enabled if not enabled: return - # This is a no-op, just return. 
- if not is_zero_param(param): + if not isinstance(params, list): + params = [params] + + # enable if at least one is zero-param, otherwise a noop + if not any(is_zero_param(p) for p in params): self.enabled = False return - self.param = param + self.params = params self.src_rank = None if modifier_rank is not None: - if self.param.ds_process_group == torch.distributed.group.WORLD: + if self.params[0].ds_process_group == torch.distributed.group.WORLD: self.src_rank = modifier_rank else: # A group was specified; convert DP rank to global rank - self.src_rank = _get_global_rank(self.param.ds_process_group, + self.src_rank = _get_global_rank(self.params[0].ds_process_group, modifier_rank) self.fwd_module = fwd_module if self.fwd_module is not None: # is a no-op if already registered - register_external_parameter(self.fwd_module, self.param) + for p in self.params: + register_external_parameter(self.fwd_module, p) def __enter__(self): if not self.enabled: return - self.param.all_gather() + self.params[0].all_gather(param_list=self.params) def __exit__(self, *exc): if not self.enabled: return - if self.src_rank is not None: - torch.distributed.broadcast(self.param, + if self.src_rank is None: + return + + handles = [ + torch.distributed.broadcast(p, self.src_rank, - group=self.param.ds_process_group) - self.param.partition(has_been_updated=self.src_rank is not None) + group=p.ds_process_group, + async_op=True) for p in self.params + ] + for h in handles: + h.wait() + self.params[0].partition(param_list=self.params, has_been_updated=True) diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py index bd01ac3ec4a1..b75d5b4b1fd3 100755 --- a/deepspeed/runtime/zero/stage1.py +++ b/deepspeed/runtime/zero/stage1.py @@ -1,7 +1,6 @@ import math import torch import torch.distributed as dist -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from collections import defaultdict from deepspeed.runtime.zero.utils import _initialize_parameter_parallel_groups @@ -9,8 +8,10 @@ from deepspeed.runtime.utils import get_grad_norm, CheckOverflow from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_OPTIMIZER_STATES from deepspeed.utils import logger, log_dist +from deepspeed.ops.op_builder import UtilsBuilder import time + def get_alignment_padding(flattened_lean_size, sub_partition_id, sub_partition_size): sub_partition_high_limit = (sub_partition_id + 1) * sub_partition_size if sub_partition_high_limit <= flattened_lean_size: @@ -71,54 +72,6 @@ def elapsed(self, reset=True): self.start() return elapsed_ -def flatten_dense_tensors_sub_partition_aligned(tensor_list, - dp, - max_elements_per_comm, - pg): - assert max_elements_per_comm >= dp, f"max_elements_per_comm {max_elements_per_comm} < dp {dp}" - - num_elements = sum(t.numel() for t in tensor_list) - log_dist("Total number of elements in model: {}, max elements per com: {}".format( - num_elements, - max_elements_per_comm), - ranks=[0]) - - # Compute aligned partition size based on parameter count - aligned_param_partition_size = math.ceil(num_elements / dp) - - # Compute aligned partition size based on communication size - aligned_comm_partition_size = int(max_elements_per_comm // dp) - - if aligned_param_partition_size <= aligned_comm_partition_size: - sub_partition_count = 1 - sub_partition_size = aligned_param_partition_size - else: - sub_partition_count = math.ceil(aligned_param_partition_size / - aligned_comm_partition_size) - sub_partition_size = aligned_comm_partition_size - - # Compute required 
padding for alignment to dp and max_elements_per_comm - padding = (sub_partition_count * sub_partition_size * dp) - num_elements - - log_dist( - f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}", - ranks=[0]) - log_dist( - f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}", - ranks=[0]) - - if padding == 0: - aligned_tensor_list = tensor_list - else: - pad_tensor = torch.zeros(padding, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - aligned_tensor_list = tensor_list + [pad_tensor] - - flat_tensors = _flatten_dense_tensors(aligned_tensor_list) - return flat_tensors - - def _single_range_check(current_index, start_index, end_index, tensor_size): offset = 0 if (current_index >= start_index) and (current_index < end_index): @@ -169,6 +122,11 @@ def __init__(self, max_elements_per_comm=5e8, elastic_checkpoint=True): + # Load pre-built or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten + self.unflatten = util_ops.unflatten + if dp_process_group is not None and partition_size is not None: raise ValueError("Cannot specify both dp_process_group " "and partition size") @@ -251,7 +209,7 @@ def __init__(self, # flattens all tensors into single 1d tensor aligned with sub-partition size for later dividing # RS: create aligned sub-partitions - flat_aligned_params = flatten_dense_tensors_sub_partition_aligned( + flat_aligned_params = self.flatten_dense_tensors_sub_partition_aligned( tensor_list=self.fp16_groups[i], dp=dist.get_world_size(group=self.dp_process_group), max_elements_per_comm=self.max_elems_per_comm[i], @@ -260,8 +218,8 @@ def __init__(self, # TODO: I don't think this does anything? # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data @@ -497,8 +455,8 @@ def get_all_sub_partition_info(tensor_list, return params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, params_not_local - @staticmethod - def get_flat_sub_partitions(comm_tensor_list, + def get_flat_sub_partitions(self, + comm_tensor_list, comm_param_offsets, sub_partition_size, dtype, @@ -569,7 +527,7 @@ def get_flat_sub_partitions(comm_tensor_list, partition_params.append(my_params) #flat_tensor_list) final_param_offsets.append(my_offsets) assert len(flat_tensor_list) == len(my_offsets), "{} {}".format(len(flat_tensor_list), len(my_offsets)) - flat_sub_partitions.append(_flatten_dense_tensors(flat_tensor_list)) + flat_sub_partitions.append(self.flatten(flat_tensor_list)) if num_comm_intervals is not None and len( flat_sub_partitions) < num_comm_intervals: # logger.info("padding w. 
sub partitions to ensure uniform communication") @@ -611,6 +569,55 @@ def free_grad_in_param_list(self, param_list): else: p.grad = None + def flatten_dense_tensors_sub_partition_aligned(self, + tensor_list, + dp, + max_elements_per_comm, + pg): + assert max_elements_per_comm >= dp, f"max_elements_per_comm {max_elements_per_comm} < dp {dp}" + + num_elements = sum(t.numel() for t in tensor_list) + log_dist( + "Total number of elements in model: {}, max elements per com: {}".format( + num_elements, + max_elements_per_comm), + ranks=[0]) + + # Compute aligned partition size based on parameter count + aligned_param_partition_size = math.ceil(num_elements / dp) + + # Compute aligned partition size based on communication size + aligned_comm_partition_size = int(max_elements_per_comm // dp) + + if aligned_param_partition_size <= aligned_comm_partition_size: + sub_partition_count = 1 + sub_partition_size = aligned_param_partition_size + else: + sub_partition_count = math.ceil(aligned_param_partition_size / + aligned_comm_partition_size) + sub_partition_size = aligned_comm_partition_size + + # Compute required padding for alignment to dp and max_elements_per_comm + padding = (sub_partition_count * sub_partition_size * dp) - num_elements + + log_dist( + f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}", + ranks=[0]) + log_dist( + f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}", + ranks=[0]) + + if padding == 0: + aligned_tensor_list = tensor_list + else: + pad_tensor = torch.zeros(padding, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + aligned_tensor_list = tensor_list + [pad_tensor] + + flat_tensors = self.flatten(aligned_tensor_list) + return flat_tensors + def reduce_scatter_gradients(self, postscale_gradients, gradient_predivide_factor, @@ -748,8 +755,8 @@ def step(self, closure=None, comms_timer=None): # TODO: we probably don't need this? 
just to be safe for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data return self.overflow @@ -951,7 +958,7 @@ def _retrieve_group_sub_partition_weights(self, sub_partition_idx = (comm_idx * num_partitions) + rank all_sub_partition_weights[sub_partition_idx] = sub_partition_weights - flat_merged_weights = flatten_dense_tensors_sub_partition_aligned( + flat_merged_weights = self.flatten_dense_tensors_sub_partition_aligned( tensor_list=all_sub_partition_weights, dp=dist.get_world_size(group=self.dp_process_group), max_elements_per_comm=max_elems_per_comm, @@ -999,7 +1006,7 @@ def _partition_base_optimizer_state(self, return all_partition_states[0] alignment = dist.get_world_size(group=self.dp_process_group) - flat_merged_partitions = flatten_dense_tensors_sub_partition_aligned( + flat_merged_partitions = self.flatten_dense_tensors_sub_partition_aligned( tensor_list=all_partition_states, dp=dist.get_world_size(group=self.dp_process_group), max_elements_per_comm=max_elems_per_comm, diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py index cd29625958c9..39d780e55574 100755 --- a/deepspeed/runtime/zero/stage2.py +++ b/deepspeed/runtime/zero/stage2.py @@ -3,7 +3,6 @@ ''' import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.distributed.distributed_c10d import _get_global_rank import torch.distributed as dist import math @@ -16,9 +15,8 @@ from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS from deepspeed.ops.adam import DeepSpeedCPUAdam - +from deepspeed.ops.op_builder import UtilsBuilder from deepspeed.utils import logger -from ...ops.op_builder import UtilsBuilder #Toggle this to true to enable correctness test #with gradient partitioning and without @@ -52,28 +50,6 @@ def lcm(x, y): return x * y // gcd(x, y) -# create a flat tensor aligned at the alignment boundary -def flatten_dense_tensors_aligned(tensor_list, alignment): - num_elements = 0 - for tensor in tensor_list: - num_elements = num_elements + tensor.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return _flatten_dense_tensors(padded_tensor_list) - - def get_alignment_padding(tensor_list, alignment): num_elements = sum([tensor.numel() for tensor in tensor_list]) remainder = num_elements % alignment @@ -121,11 +97,6 @@ def __init__(self, gradient_predivide_factor=1.0, gradient_accumulation_steps=1): - # Load pre-installed or JIT compile (un)flatten ops - util_ops = UtilsBuilder().load() - self.flatten = util_ops.flatten - self.unflatten = util_ops.unflatten - if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") logger.info(f"Allgather bucket size {allgather_bucket_size}") @@ -143,6 +114,11 @@ def __init__(self, raise SystemError("Cannot use fp16 without CUDA.") self.optimizer = init_optimizer + # Load pre-built or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten 
+ self.unflatten = util_ops.unflatten + self.timers = timers self.reduce_scatter = reduce_scatter @@ -236,7 +212,7 @@ def __init__(self, #create flat buffer in CPU and move to GPU self.fp16_groups_flat.append( - flatten_dense_tensors_aligned( + self.flatten_dense_tensors_aligned( self.fp16_groups[i], dist.get_world_size(group=self.dp_process_group)).cuda( torch.cuda.current_device())) @@ -247,8 +223,8 @@ def __init__(self, f"After Flattening and after emptying param group {i} cache") # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data @@ -611,6 +587,27 @@ def report_ipg_memory_usage(self, tag, param_elems): f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" ) + # create a flat tensor aligned at the alignment boundary + def flatten_dense_tensors_aligned(self, tensor_list, alignment): + num_elements = 0 + for tensor in tensor_list: + num_elements = num_elements + tensor.numel() + + remaining = num_elements % alignment + + if remaining: + elements_to_add = alignment - remaining + pad_tensor = torch.zeros(elements_to_add, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + padded_tensor_list = tensor_list + [pad_tensor] + + num_elements = num_elements + elements_to_add + else: + padded_tensor_list = tensor_list + + return self.flatten(padded_tensor_list) + ############### Independent Partition Gradient ######################## def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: @@ -1004,7 +1001,7 @@ def are_all_related_partitions_reduced(params_id): self.param_dict[params_id].grad = None def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = _flatten_dense_tensors(tensors) + flatten_tensor = self.flatten(tensors) def print_func(): logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) @@ -1327,7 +1324,7 @@ def get_flat_partition(self, if return_tensor_list: return flat_tensor_list - return _flatten_dense_tensors(flat_tensor_list) + return self.flatten(flat_tensor_list) def free_grad_in_param_list(self, param_list): for p in param_list: @@ -1419,14 +1416,13 @@ def step(self, closure=None): #create a flat gradients for parameters updated by this process # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - single_grad_partition = flatten_dense_tensors_aligned( + single_grad_partition = self.flatten_dense_tensors_aligned( self.averaged_gradients[i], int(self.partition_size[i])).to( self.single_partition_of_fp32_groups[i].dtype) else: - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.single_partition_of_fp32_groups[i].dtype) + single_grad_partition = self.flatten(self.averaged_gradients[i]).to( + self.single_partition_of_fp32_groups[i].dtype) assert single_grad_partition.numel() == self.partition_size[i], \ "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) @@ -1507,8 +1503,8 @@ def step(self, closure=None): # TODO: we probably don't need this? 
just to be safe for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = self.unflatten(self.fp16_groups_flat[i], + self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data @@ -1749,7 +1745,7 @@ def _restore_from_fp32_weights(self, all_state_dict): merged_partitions = [ sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict ] - flat_merged_partitions = flatten_dense_tensors_aligned( + flat_merged_partitions = self.flatten_dense_tensors_aligned( merged_partitions, dist.get_world_size(group=self.dp_process_group)) dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) @@ -1773,7 +1769,7 @@ def _partition_base_optimizer_state(self, state_key, all_partition_states): partition_id = dist.get_rank(group=self.dp_process_group) alignment = dist.get_world_size(group=self.dp_process_group) if torch.is_tensor(all_partition_states[0]): - flat_merged_partitions = flatten_dense_tensors_aligned( + flat_merged_partitions = self.flatten_dense_tensors_aligned( all_partition_states, alignment) dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 9168ab96d6e1..2b6e12abd84b 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1,42 +1,37 @@ -from deepspeed.utils.logging import logger -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +""" +"Copyright 2020 The Microsoft DeepSpeed Team. +Licensed under the MIT license. +""" +import sys import os - +from collections import defaultdict, OrderedDict +import itertools import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.distributed.distributed_c10d import _get_global_rank import torch.distributed as dist import math from torch._six import inf from torch.autograd import Variable +from deepspeed.utils.logging import logger from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, ZeroParamType, _init_external_params, Init, is_zero_param +from deepspeed.runtime.zero.partition_parameters import * +from deepspeed.runtime.zero.partition_parameters import _init_external_params from deepspeed.runtime.zero.constants import ZERO_OPTIMIZATION_WEIGHTS from deepspeed.ops.adam import DeepSpeedCPUAdam +from deepspeed.ops.op_builder import UtilsBuilder +from deepspeed.runtime.zero.offload_constants import * +from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus +from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper +from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper -import itertools # Toggle this to true to enable correctness test # with gradient partitioning and without pg_correctness_test = False -try: - from apex_C import flatten - from apex_C import unflatten -except ImportError: - try: - _ = warned_flatten - except NameError: - logger.warning( - "apex was installed without --cpp_ext. Falling back to Python flatten and unflatten." 
- ) - warned_flatten = True - from torch._utils import _flatten_dense_tensors as flatten - from torch._utils import _unflatten_dense_tensors as unflatten +FWD_MODULE_STACK = list() def print_rank_0(message, debug=False, force=False): @@ -71,35 +66,13 @@ def lcm(x, y): return x * y // gcd(x, y) -# create a flat tensor aligned at the alignment boundary -def flatten_dense_tensors_aligned(tensor_list, alignment): - num_elements = 0 - for tens in tensor_list: - num_elements = num_elements + tens.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return _flatten_dense_tensors(padded_tensor_list) - - def move_to_cpu(tensor_list): for tensor in tensor_list: tensor.data = tensor.data.cpu() -def get_all_parameters(sub_module): - return itertools.chain(sub_module.named_parameters(recurse=False), +def get_all_parameters(sub_module, recurse=False): + return itertools.chain(sub_module.named_parameters(recurse=recurse), sub_module.ds_external_parameters()) @@ -144,6 +117,47 @@ def _apply_forward_and_backward_to_tensors_only(module, return outputs +class ZeROOrderedDict(OrderedDict): + def __init__(self, parent_module, *args, **kwargs): + """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. + + Args: + parent_module (``collections.OrderedDict``): the collection to replace + """ + + super().__init__(*args, **kwargs) + self._parent_module = parent_module + self._in_forward = False + + def __getitem__(self, key): + param = super().__getitem__(key) + + # Params can be registered as None (e.g., bias) + if param is None: + return param + + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if self._parent_module._parameters._in_forward: + print_rank_0(f'Registering external parameter from getter {key}', + force=False) + register_external_parameter(FWD_MODULE_STACK[-1], param) + param.all_gather() + + return param + + +def _inject_parameters(module, cls): + for module in module.modules(): + if cls == ZeROOrderedDict: + new_param = cls(parent_module=module) + else: + new_param = cls() + + for key, param in module._parameters.items(): + new_param[key] = param + module._parameters = new_param + + # TODO Needs to be implemented class PrefetchCoordinator(object): def __init__(self): @@ -253,7 +267,6 @@ def get_reuse_distance_in_numel(self, sub_module, sub_module_step_id=None): start_step, end_step, trace) - break self.reuse_numel_for_step_id[sub_module_step_id] = reuse_distance_in_numel @@ -307,9 +320,26 @@ def finish_tracing(self, print_trace=False): if print_trace: self.prefetch_coordinator.print_trace() + #swap in parameter partitions from nvme for those parameters that will be used + # after the ones that are already being prefetched into full parameters + def _prefetch_nvme_param_partitions(self, sub_module, params_in_flight): + numel_in_flight = sum([param.ds_tensor.ds_numel for param in params_in_flight]) + upcoming_param_list = self.prefetch_coordinator.get_params_to_prefetch( + sub_module, + numel=2 * numel_in_flight) + swap_in_params = [] + for param in upcoming_param_list: + if len(swap_in_params) >= param.nvme_swapper.available_swap_in_buffers(): + break + if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: + swap_in_params.append(param) + + if 
len(swap_in_params) > 0: + swap_in_params[0].nvme_swapper.swap_in(swap_in_params, async_op=True) + # Pre fetches the parameters for sub_modules that comes after # the current sub_module. This call is asynchronous - def prefetch_next_sub_modules(self, sub_module, numel=5000000): + def prefetch_next_sub_modules(self, sub_module, numel=5000000, nvme=False): params_to_prefetch = [] if not self.prefetch_coordinator.trace_completed: @@ -328,6 +358,9 @@ def prefetch_next_sub_modules(self, sub_module, numel=5000000): # keeping track of number of elements consumed by available parmaeters self._increment_available_parameter_numel(param.ds_numel) + if nvme: + self._prefetch_nvme_param_partitions(sub_module, params_to_prefetch) + self._print_prefetch_elements_info(sub_module, params_to_prefetch) print_rank_0( f"{'--' * self.hierarchy}--PreFetching parameters {[param.ds_id for param in params_to_prefetch]} and available {self.total_available_parameter_numel}, max limit {self.max_available_parameters_in_numel}", @@ -407,7 +440,9 @@ def fetch_sub_module(self, sub_module): for _, param in sub_module.named_parameters(recurse=False): param.ds_status = ZeroParamStatus.AVAILABLE - #print(f"Param id {param.ds_id}, Shape {param.shape}, device {param.device} ") + print_rank_0( + f"Param id {param.ds_id}, Shape {param.shape}, device {param.device} norm {param.norm()}", + force=False) #print_rank_0(f"After fetching (id, shape, device): {[(param.ds_id, param.shape, param.device) for param in sub_module.named_parameters(recurse=False)]}") def release_sub_module(self, sub_module): @@ -419,6 +454,7 @@ def release_sub_module(self, sub_module): param for _, param in sub_module.named_parameters(recurse=False) ] + if hasattr(sub_module, 'ds_external_parameters'): #print_rank_0(f"Releasing external parameters {sub_module.ds_external_parameters()}") params_to_release += [ @@ -432,25 +468,25 @@ def release_sub_module(self, sub_module): if not param.ds_active_sub_modules and not self._keep_for_later( sub_module) and not param.ds_persist: print_rank_0( - f"{'--' * self.hierarchy}--Releasing parameters {param.ds_id} with numel {param.numel()} active sub modules {param.ds_active_sub_modules} and keep for later {self._keep_for_later(sub_module)}" - ) + f"{'--' * self.hierarchy}--Releasing parameters {param.ds_id} with numel {param.numel()} active sub modules {param.ds_active_sub_modules} and keep for later {self._keep_for_later(sub_module)}", + force=False) # Keeping track of number of elements that are consumed by available parameters self._decrement_available_parameter_numel(param.ds_numel) see_memory_usage( - f"Before releasing param {param.ds_id} with numel{param.numel()}", + f"Before releasing param {param.ds_id} with numel {param.numel()}", force=False) param.partition(hierarchy=self.hierarchy) see_memory_usage( - f"After releasing param {param.ds_id} has numel{param.numel()} ", + f"After releasing param {param.ds_id} has numel {param.numel()} ", force=False) param.ds_status = ZeroParamStatus.NOT_AVAILABLE else: print_rank_0( - f"{'--' * self.hierarchy}--Did not release parameters {param.ds_id} with numel {param.numel()} with active sub modules {param.ds_active_sub_modules}, keep for later {self._keep_for_later(sub_module)} and persistence {param.ds_persist}" - ) + f"{'--' * self.hierarchy}--Did not release parameters {param.ds_id} with numel {param.numel()} with active sub modules {param.ds_active_sub_modules}, keep for later {self._keep_for_later(sub_module)} and persistence {param.ds_persist}", + force=False) def 
release_and_reset_parameter(self, param): param.ds_active_sub_modules = 0 @@ -464,6 +500,8 @@ def release_and_reset_parameter(self, param): def _keep_for_later(self, sub_module): if not self.prefetch_coordinator.trace_completed: return False + if self.max_reuse_distance_in_numel == 0: + return False reuse_distance_in_numel = self.prefetch_coordinator.get_reuse_distance_in_numel( sub_module) #print_rank_0(f"Reuse distance and numel for sub_module id {sub_module.id} is {reuse_distance_in_numel}") @@ -568,9 +606,8 @@ def __init__(self, dp_process_group=None, reduce_scatter=True, overlap_comm=False, - cpu_offload_optimizer_state=False, - cpu_offload_params=False, - cpu_offload_use_pin_memory=False, + offload_optimizer_config=None, + offload_param_config=None, sub_group_size=1000000000000, mpu=None, clip_grad=0.0, @@ -578,9 +615,10 @@ def __init__(self, postscale_gradients=True, gradient_predivide_factor=1.0, gradient_accumulation_steps=1, - elastic_checkpoint=False): + elastic_checkpoint=False, + aio_config=None): - see_memory_usage("Stage 3 intialize beginning", force=True) + see_memory_usage("Stage 3 initialize beginning", force=True) if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") @@ -598,6 +636,11 @@ def __init__(self, raise SystemError("Cannot use fp16 without CUDA.") self.optimizer = init_optimizer + # Load pre-built or JIT compile (un)flatten ops + util_ops = UtilsBuilder().load() + self.flatten = util_ops.flatten + self.unflatten = util_ops.unflatten + if not all(is_zero_param(p) for p in module.parameters()): group = None if mpu: @@ -611,21 +654,51 @@ def __init__(self, self.elastic_checkpoint = elastic_checkpoint self.overlap_comm = overlap_comm + # Replace ._parameters with a new class to enable auto-registration of + # external parameters + _inject_parameters(module, ZeROOrderedDict) + if self.overlap_comm: self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - ######################cpu offload setup################################## - self.cpu_offload = cpu_offload_optimizer_state - self.cpu_offload_use_pin_memory = cpu_offload_use_pin_memory - - if cpu_offload_params: - assert cpu_offload_optimizer_state, "parameter offload is only available with optimizer state offload" - self.cpu_offload_params = cpu_offload_optimizer_state and cpu_offload_params + ###################### offload optimizer setup ################################## + self.optimizer_swapper = None + self.swap_optimizer = False + + self.offload_optimizer = False + self.offload_optimizer_pin_memory = False + self.offload_optimizer_fast_init = False + if offload_optimizer_config is not None: + self.offload_optimizer = True + self.offload_optimizer_pin_memory = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_PIN_MEMORY] + self.swap_optimizer = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_DEVICE] == OFFLOAD_NVME_DEVICE + self.offload_optimizer_fast_init = offload_optimizer_config[ + OFFLOAD_OPTIMIZER_FAST_INIT] + + ###################### offload param setup ################################## + self.offload_param = False + self.offload_param_pin_memory = False + self.params_in_nvme_and_cpu = False + self.max_params_in_cpu = 0 + if offload_param_config is not None: + assert self.offload_optimizer, "parameter offload is only available with optimizer state offload" + self.offload_param = True + self.offload_param_pin_memory = offload_param_config[ + OFFLOAD_PARAM_PIN_MEMORY] + self.params_in_nvme_and_cpu = offload_param_config[ + OFFLOAD_PARAM_DEVICE] == OFFLOAD_NVME_DEVICE + 
self.max_params_in_cpu = offload_param_config[OFFLOAD_PARAM_MAX_IN_CPU] + print_rank_0( + f"FP16 params swapping is {self.params_in_nvme_and_cpu}, Max params in CPU is {self.max_params_in_cpu}", + force=True) - self.deepspeed_adam_offload = (self.cpu_offload + self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam) - self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' + self.device = torch.cuda.current_device( + ) if not self.offload_optimizer else OFFLOAD_CPU_DEVICE ############################################################################ see_memory_usage("Before Partitioned Parameter Coordinator", force=False) @@ -692,10 +765,18 @@ def __init__(self, # Holds a fused and flattened copy of the parameters self.fp16_partitioned_groups_flat = [] + self.fp16_partitioned_groups_flat_numel = [] + + #defragmented pinned memory + self.param_groups_fp16_flat_cpu_memory = [] + + #fp16 buffer for swapping out nvme params + self.param_group_fp16_flat_reuse_buffer = None #a single 32-bit partition of the parallel partitioned parameters #that this process will update self.fp32_partitioned_groups_flat = [] + self.next_swappable_fp32_partitioned_groups = [] # number of elements per partition in each group self.partition_size = [] @@ -711,20 +792,28 @@ def __init__(self, self.sub_group_to_group_id = {} - see_memory_usage("Before creating fp16 partitions", force=False) - #self._create_fp16_partitions() + see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation() num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", force=False) + # Optimizer ensor swapping + if self.swap_optimizer: + self._configure_tensor_swapping(offload_optimizer_config, aio_config) + see_memory_usage("Before creating fp32 partitions", force=False) self._create_fp32_partitions() see_memory_usage("After creating fp32 partitions", force=False) + dist.barrier() + + # To support pipelined optimizer swapping + self._create_next_swappable_fp32_groups() see_memory_usage("Before initializing optimizer states", force=False) self.initialize_optimizer_states() see_memory_usage("After initializing optimizer states", force=False) + dist.barrier() if dist.get_rank() == 0: logger.info(f"optimizer state initialized") @@ -749,6 +838,7 @@ def __init__(self, self.params_in_ipg_bucket = [] self.elements_in_ipg_bucket = 0 self.params_already_reduced = [] + self.is_gradient_accumulation_boundary = True self._release_ipg_buffers() self.previous_reduced_grads = None @@ -765,7 +855,10 @@ def __init__(self, count = count + 1 #Largest partitioned param - largest_partitioned_param_numel = self._get_largest_partitioned_numel() + largest_partitioned_param_numel = max(self.fp16_partitioned_groups_flat_numel) + print_rank_0( + f'Largest partitioned param numel = {largest_partitioned_param_numel}', + force=True) see_memory_usage(f"Before Set Grad positions", force=False) @@ -775,7 +868,7 @@ def __init__(self, self.grads_in_partition = None - if self.cpu_offload: + if self.offload_optimizer: self.accumulated_grads_in_cpu = {} self.norm_for_param_grads = {} self.local_overflow = False @@ -820,14 +913,26 @@ def __init__(self, if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) - def _get_largest_partitioned_numel(self): - largest_partitioned_param_numel = 0 - for partitioned_params_group in 
self.fp16_partitioned_groups: - for partitioned_param in partitioned_params_group: - if partitioned_param.numel() > largest_partitioned_param_numel: - largest_partitioned_param_numel = partitioned_param.numel() + def _configure_tensor_swapping(self, offload_optimizer_config, aio_config): + nvme_swap_folder = os.path.join( + offload_optimizer_config[OFFLOAD_OPTIMIZER_NVME_PATH], + 'zero_stage_3') + os.makedirs(nvme_swap_folder, exist_ok=True) + if torch.distributed.get_rank() == 0: + logger.info(f'Tensor Swapping: Adding optimizer tensors') + + swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config[ + OFFLOAD_OPTIMIZER_PIPELINE] else PartitionedOptimizerSwapper - return largest_partitioned_param_numel + self.optimizer_swapper = swapper_type( + swap_config=offload_optimizer_config, + aio_config=aio_config, + base_folder=nvme_swap_folder, + optimizer=self.optimizer, + largest_numel=max(self.fp16_partitioned_groups_flat_numel), + device=self.device, + dtype=torch.float32, + timers=self.timers) def _create_fp16_partitions(self): dist.barrier() @@ -863,7 +968,7 @@ def _create_fp16_partitions(self): #removing cloning here see_memory_usage(f"Before Flattening param group {i}", force=False) - if not self.cpu_offload_params: + if not self.offload_param: see_memory_usage(f"Before moving param group {i} to CPU", force=False) #move all the parameters to cpu to free up GPU space for creating flat buffer @@ -872,9 +977,10 @@ def _create_fp16_partitions(self): #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned(self.fp16_partitioned_groups[i], - 1).cuda( - torch.cuda.current_device())) + self.flatten_dense_tensors_aligned( + self.fp16_partitioned_groups[i], + dist.get_world_size(group=self.dp_process_group)).cuda( + torch.cuda.current_device())) see_memory_usage( f"After flattening and moving param group {i} to GPU", force=False) @@ -882,7 +988,7 @@ def _create_fp16_partitions(self): #Without the detach, seems like the flattening becomes part of the #model graph causing errors downstream self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned( + self.flatten_dense_tensors_aligned( self.fp16_partitioned_groups[i], dist.get_world_size( group=self.dp_process_group)).detach().pin_memory()) @@ -892,38 +998,92 @@ def _create_fp16_partitions(self): see_memory_usage(f"After Flattening param group {i}", force=False) #set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors( - self.fp16_partitioned_groups_flat[i], - self.fp16_partitioned_groups[i]) + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[i], + self.fp16_partitioned_groups[i]) for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): partitioned_param.data = q.data - def _move_to_flat_buffer(self, src_list, flat_buffer): + def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False): + '''If flat buffer is None then the parameters in the param_list are + not copied to the flat buffer. This is because they excede the number of max_params_in_cpu + Some of these parameters may aready be in CPU in unflattened buffers + or they maybe in GPU, or they maybe in NVME. 
If they are in NVME, then + they will be marked as NOT_AVAILABLE, and will be moved to CPU when they are + needed during training.''' + if flat_buffer is None: + # this dst buffer is on NVMe, so skip this + return + start = 0 - for src in src_list: - dest = flat_buffer.narrow(0, start, src.numel()) - start = start + src.numel() - dest.data.copy_(src.data) - src.data = dest.data + for param in param_list: + src = param.ds_tensor + dest = flat_buffer.narrow(0, start, src.ds_numel) + start = start + src.ds_numel + '''if the parameter was initialized in nvme then bring it to the destination buffer directly''' + if src.status == PartitionedParamStatus.NOT_AVAILABLE: + print_rank_0( + f"Swapping in {param.ds_id} with partition size {param.ds_tensor.ds_numel} permanently to CPU" + ) + param.nvme_swapper.swap_in([param], + swap_in_buffers=[dest], + async_op=False) + else: + assert src.status == PartitionedParamStatus.AVAILABLE, "Partitioned Parm must be avialable here" + if not avoid_copy: + dest.data.copy_(src.data) + src.data = dest.data - def _create_fp16_partitions_with_defragmentation(self): - dist.barrier() - partition_id = dist.get_rank(group=self.dp_process_group) + # Final location must be gpu/cpu in this case + param.ds_tensor.final_location = 'not-nvme' + + def _create_param_groups_fp16_flat_cpu_memory(self): + + aggregate_params_count = 0 + + for j, param_group in enumerate(self.optimizer.param_groups): + params_in_group = sum([p.ds_tensor.ds_numel for p in param_group['params']]) - if self.cpu_offload_params: - self.param_groups_fp16_flat_cpu_memory = [] - for j, param_group in enumerate(self.optimizer.param_groups): - total_params = sum([p.ds_tensor.numel() for p in param_group['params']]) + flat_buffer_size = params_in_group + + if self.params_in_nvme_and_cpu and \ + aggregate_params_count + params_in_group > self.max_params_in_cpu: + + flat_buffer_size = max(0, + self.max_params_in_cpu - aggregate_params_count) + + aggregate_params_count += params_in_group + + if flat_buffer_size > 0: + print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", + force=False) self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(total_params, + torch.empty(int(flat_buffer_size), dtype=torch.half, pin_memory=True)) + else: + print_rank_0( + f"No flat buffer size. 
Param group size was {params_in_group}", + force=False) + + self.param_groups_fp16_flat_cpu_memory.append( + torch.empty(1, + dtype=torch.half)) + + def _create_fp16_partitions_with_defragmentation(self): + dist.barrier() + partition_id = dist.get_rank(group=self.dp_process_group) + + #create a flat CPU memory allocation for each param group + if self.offload_param: + self._create_param_groups_fp16_flat_cpu_memory() # loop to deal with groups for j, param_group in enumerate(self.optimizer.param_groups): sub_groups = self._create_fp16_sub_groups(param_group['params']) + print_rank_0(f'fp16 group {j} has {len(sub_groups)} subgroups', force=True) + flat_offset = 0 for sub_group in sub_groups: i = len(self.fp16_groups) @@ -936,6 +1096,10 @@ def _create_fp16_partitions_with_defragmentation(self): self.fp16_partitioned_groups.append( [param.ds_tensor for param in self.fp16_groups[i]]) + total_elements = sum( + [t.ds_numel for t in self.fp16_partitioned_groups[i]]) + self.fp16_partitioned_groups_flat_numel.append(total_elements) + print_rank_0( f"fp16 group {i} partitioned_param norms : {[param.ds_tensor.norm().item() for param in self.fp16_groups[i]]}" ) @@ -949,60 +1113,225 @@ def _create_fp16_partitions_with_defragmentation(self): #not sure why apex was cloning the weights before flattening #removing cloning here - see_memory_usage(f"Before Flattening param group {i}", force=False) + see_memory_usage(f"Before Flattening param subgroup {i}", force=False) - if not self.cpu_offload_params: - see_memory_usage(f"Before moving param group {i} to CPU", + #all partitioned parameters remain in GPU during training + if not self.offload_param: + see_memory_usage(f"Before moving param subgroup group {i} to CPU", force=False) #move all the parameters to cpu to free up GPU space for creating flat buffer move_to_cpu(self.fp16_partitioned_groups[i]) - see_memory_usage(f"After moving param group {i} to CPU", force=False) + see_memory_usage(f"After moving param subgroup {i} to CPU", + force=False) #create flat buffer in CPU and move to GPU self.fp16_partitioned_groups_flat.append( - flatten_dense_tensors_aligned(self.fp16_partitioned_groups[i], - 1).cuda( - torch.cuda.current_device())) + self.flatten_dense_tensors_aligned( + self.fp16_partitioned_groups[i], + 1).cuda(torch.cuda.current_device())) see_memory_usage( - f"After flattening and moving param group {i} to GPU", + f"After flattening and moving param subgroup {i} to GPU", force=False) + + #all partitioned parameters are in CPU during training else: - total_elements = sum( - [t.numel() for t in self.fp16_partitioned_groups[i]]) - fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ - j].narrow(0, - flat_offset, - total_elements) + print_rank_0(f"Params in nvme and cpu {self.params_in_nvme_and_cpu}") + #Flat buffer may not be available for parameters that reside in NVME + if not self.params_in_nvme_and_cpu or flat_offset + total_elements <= self.param_groups_fp16_flat_cpu_memory[ + j].numel(): + fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ + j].narrow(0, + flat_offset, + total_elements) + print_rank_0( + f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elemets {flat_offset + total_elements}", + force=False) + #these parameters reside in NVME and + elif self.params_in_nvme_and_cpu: + fp16_partitioned_group_flat = None + print_rank_0( + f"No flat buffer for sub group {i} of {total_elements} elements", + force=False) + else: + assert False, "Either params are in nvme, 
or they are in CPU memory. This code path should not be triggered. Please see you max_params_in_cpu and params_in_nvme configs" + self.fp16_partitioned_groups_flat.append(fp16_partitioned_group_flat) flat_offset += total_elements # move param to flat buffer for both param offload on/off - self._move_to_flat_buffer(self.fp16_partitioned_groups[i], - self.fp16_partitioned_groups_flat[i]) + self._move_to_flat_buffer(self.fp16_groups[i], + self.fp16_partitioned_groups_flat[i], + avoid_copy=not self.offload_param) see_memory_usage(f"After Flattening param group {i}", force=False) + #create a pinned memory to be used for swapping out params to NVME after optimizer step + if self.fp16_partitioned_groups_flat[ + -1] is None and self.param_group_fp16_flat_reuse_buffer is None: + self.param_group_fp16_flat_reuse_buffer = torch.empty( + max(self.fp16_partitioned_groups_flat_numel), + dtype=torch.half, + device='cpu', + pin_memory=True) + + see_memory_usage(f"After Flattening param subgroup {i}", force=False) + + def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): + offset = 0 + elements_in_sub_group = sum( + [t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) + assert (flat_buffer.numel() == elements_in_sub_group) + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + dest = flat_buffer.narrow(0, offset, partitioned_param.ds_numel) + if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: + print_rank_0( + f"Swapping in {param.ds_id} with elements {param.ds_numel} and partition {param.ds_tensor.ds_numel}" + ) + param.nvme_swapper.swap_in([param], async_op=False) + dest.data.copy_(partitioned_param.data) + param.nvme_swapper.remove_partition_and_release_buffers([param]) + print_rank_0(f"Swapping in {param.ds_id} done") + else: + dest.data.copy_(partitioned_param.data) + offset += partitioned_param.ds_numel + + def _create_next_swappable_fp32_groups(self): + reverse_order_indices = [ + i for i in range(len(self.fp32_partitioned_groups_flat)) + ] + reverse_order_indices.reverse() + + next_group = None + for i in reverse_order_indices: + self.next_swappable_fp32_partitioned_groups.append(next_group) + if self._swappable_optimizer_subgroup(i): + next_group = self.fp32_partitioned_groups_flat[i] + + self.next_swappable_fp32_partitioned_groups.reverse() + + def _get_sub_group_partitions(self, sub_group_id): + sub_group_partitions = [] + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: + swap_path = param.nvme_swapper.get_path(param, True) + sub_group_partitions.append((partitioned_param, + param.ds_tensor.ds_numel, + swap_path)) + else: + sub_group_partitions.append((partitioned_param, + partitioned_param.ds_numel, + None)) + + return sub_group_partitions + def _create_fp32_partitions(self): + cpu_memory_usage = 0 + cpu_memory_sub_groups = 0 + nvme_memory_usage = 0 + num_swappable_partitions = 0 + num_swap_from_nvme_partitions = 0 + num_swap_from_cpu_partitions = 0 + swap_from_nvme_memory_usage = 0 + swap_from_cpu_memory_usage = 0 + GIGA_BYTES = (1024**3) + + swappable_fp32_tensors = [] + swappable_fp16_src_tensors = [] + nvme_fp16_partitions_info = [] + nvme_fp16_num_elems = [] + nvme_fp32_dest_tensors = [] + fp32_element_size = torch.tensor([], dtype=torch.float32).element_size() + for i, tensor in enumerate(self.fp16_partitioned_groups_flat): - # a partition of the 
fp32 master weights that will be updated by this process + num_elements = self.fp16_partitioned_groups_flat_numel[i] - self.fp32_partitioned_groups_flat.append( - self.fp16_partitioned_groups_flat[i].to( - self.device).clone().float().detach()) - element_size = self.fp32_partitioned_groups_flat[i].element_size() - num_elements = self.fp32_partitioned_groups_flat[i].numel() + # a partition of the fp32 master weights that will be updated by this process + if self._swappable_optimizer_subgroup(i): + self.fp32_partitioned_groups_flat.append(torch.Tensor()) + nvme_memory_usage += (fp32_element_size * num_elements) + num_swappable_partitions += 1 + + if self.params_in_nvme_and_cpu and tensor is None: + num_swap_from_nvme_partitions += 1 + swap_from_nvme_memory_usage += (fp32_element_size * num_elements) + if self.offload_optimizer_fast_init: + sub_group_partitions = self._get_sub_group_partitions(i) + nvme_fp16_partitions_info.append(sub_group_partitions) + nvme_fp16_num_elems.append(num_elements) + nvme_fp32_dest_tensors.append( + self.fp32_partitioned_groups_flat[i]) + else: + unpinned_fp32_buffer = torch.empty(num_elements, + device=self.device, + dtype=torch.float) + self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) + self.optimizer_swapper.initialize_parameters( + parameters=[self.fp32_partitioned_groups_flat[i]], + src_tensors=[unpinned_fp32_buffer]) + else: + num_swap_from_cpu_partitions += 1 + swap_from_cpu_memory_usage += (fp32_element_size * num_elements) + swappable_fp32_tensors.append(self.fp32_partitioned_groups_flat[i]) + swappable_fp16_src_tensors.append( + self.fp16_partitioned_groups_flat[i]) + else: + cpu_memory_usage += (fp32_element_size * num_elements) + cpu_memory_sub_groups += 1 + + if self.params_in_nvme_and_cpu and tensor is None: + unpinned_fp32_buffer = torch.empty(num_elements, + device=self.device, + dtype=torch.float) + self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) + self.fp32_partitioned_groups_flat.append(unpinned_fp32_buffer) + else: + self.fp32_partitioned_groups_flat.append( + self.fp16_partitioned_groups_flat[i].to( + self.device).clone().float().detach()) self.fp32_partitioned_groups_flat[ i].requires_grad = True # keep this in case internal optimizer uses it + if len(swappable_fp32_tensors) > 0: + self.optimizer_swapper.initialize_parameters( + parameters=swappable_fp32_tensors, + src_tensors=swappable_fp16_src_tensors) + + if len(nvme_fp32_dest_tensors) > 0: + fp16_pinned_buffers = self.fp16_groups[0][ + 0].nvme_swapper.reserve_available_buffers() + assert len(fp16_pinned_buffers) > 0 + self.optimizer_swapper.initialize_from_swapped_fp16_params( + fp16_partitions_info=nvme_fp16_partitions_info, + fp16_num_elems=nvme_fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=nvme_fp32_dest_tensors) + self.fp16_groups[0][0].nvme_swapper.release_reserved_buffers() + + nvme_gigabytes = nvme_memory_usage / GIGA_BYTES + print_rank_0( + f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', + force=True) + if self.params_in_nvme_and_cpu: + print_rank_0( + f'Swap from NVMe Partitions: count = {num_swap_from_nvme_partitions}, size = {swap_from_nvme_memory_usage/GIGA_BYTES:5.2f}GB', + force=True) + print_rank_0( + f'Swap from CPU Partitions: count = {num_swap_from_cpu_partitions}, size = {swap_from_cpu_memory_usage/GIGA_BYTES:5.2f}GB', + force=True) + + cpu_memory_gigabytes = cpu_memory_usage / GIGA_BYTES + print_rank_0( + f'In-Memory FP32 Partitions: 
count={cpu_memory_sub_groups} size={cpu_memory_gigabytes:5.2f} GB', + force=True) + # Clear for on-the-fly population before the optimizer step for param_group in self.optimizer.param_groups: param_group['params'] = [] def _create_fp16_sub_groups(self, params_group): - params_group_numel = sum([param.ds_tensor.numel() for param in params_group]) - + params_group_numel = sum([param.partitioned_size() for param in params_group]) sub_group_size = self.sub_group_size if sub_group_size is None or sub_group_size >= params_group_numel: @@ -1014,7 +1343,7 @@ def _create_fp16_sub_groups(self, params_group): for param in params_group: sub_group.append(param) - local_sub_group_size += param.ds_tensor.numel() + local_sub_group_size += param.partitioned_size() if local_sub_group_size >= sub_group_size or id(param) == id( params_group[-1]): @@ -1036,13 +1365,23 @@ def setup_zero_stage3_hooks(self): self.hierarchy = 0 self._register_hooks_recursively(self.module) + #reset step at the beginning of forward + def _pre_forward_hook(module, *args): + self.param_coordinator.reset_step() + #reset step if in inference mode def _end_of_forward_hook(module, *args): if not torch._C.is_grad_enabled(): self.param_coordinator.reset_step() + #likely one of them should be enough but just to be safe self.module.register_forward_hook(_end_of_forward_hook) + self.module.register_forward_pre_hook(_pre_forward_hook) + + # Add top todule to stack trace + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(self.module) def persistent_parameters(self): persistent_params = [] @@ -1071,7 +1410,41 @@ def _register_hooks_recursively(self, module, count=[0]): def _pre_forward_module_hook(module, *args): self.pre_sub_module_forward_function(module) - def _post_forward_module_hook(module, *args): + def _post_forward_module_hook(module, input, output): + global FWD_MODULE_STACK + FWD_MODULE_STACK.pop() + + if not isinstance(output, (list, tuple)): + if torch.is_tensor(output): + output = [output] + else: + #print(f'got UNKNOWN type {type(output)}') + outputs = [] + for name, val in vars(output).items(): + if not name.startswith('__') and torch.is_tensor(val): + outputs.append(val) + output = outputs + #print(f'convert output to {output}') + + for item in filter(lambda item: is_zero_param(item), output): + if not any(id(item) in m._external_params for m in FWD_MODULE_STACK): + item.ds_active_sub_modules += 1 + module_to_register = FWD_MODULE_STACK[-1] + print_rank_0( + f'Registering dangling parameter for module {module_to_register.__class__.__name__}.', + force=False) + register_external_parameter(module_to_register, item) + + # It's possible that the parameter was already external to the completed module. If so, remove it the + # registration as it will be covered by the outer module instead. 
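
For context, a minimal sketch of the situation this hook automates: under ZeRO-3, a submodule that hands one of its parameters back to its caller (for example a linear layer that returns its bias, as Megatron-LM style layers do) would leave that parameter partitioned and unavailable when the parent module uses it, unless it is registered as an external parameter. The module names below are illustrative; the hook added here performs the equivalent registration automatically.

```python
import torch
import deepspeed


class ReturnBiasLinear(torch.nn.Module):
    """Toy submodule that returns its bias tensor alongside its output."""
    def __init__(self, dim):
        super().__init__()
        self.linear = torch.nn.Linear(dim, dim)

    def forward(self, x):
        # The bias parameter itself is handed back to the caller.
        return torch.nn.functional.linear(x, self.linear.weight), self.linear.bias


class Parent(torch.nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.child = ReturnBiasLinear(dim)
        # Manual form of what the forward hook above now does automatically:
        # keep the returned bias gathered while this module's forward runs.
        deepspeed.zero.register_external_parameter(self, self.child.linear.bias)

    def forward(self, x):
        out, bias = self.child(x)
        return out + bias
```
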
+ if id(item) in module._external_params: + print_rank_0( + f' Unregistering nested dangling parameter from module {module.__class__.__name__}', + force=False) + unregister_external_parameter(module, item) + + item.all_gather() + self.post_sub_module_forward_function(module) def _pre_backward_module_hook(module, inputs, output): @@ -1135,6 +1508,9 @@ def pre_sub_module_forward_function(self, sub_module): see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", force=False) + global FWD_MODULE_STACK + FWD_MODULE_STACK.append(sub_module) + self.param_coordinator.record_trace(sub_module) self.param_coordinator.fetch_sub_module(sub_module) @@ -1142,8 +1518,10 @@ def pre_sub_module_forward_function(self, sub_module): f"Before sub module function {sub_module.__class__.__name__} after fetch", force=False) - self.param_coordinator.prefetch_next_sub_modules(sub_module, - numel=self.prefetch_elements) + self.param_coordinator.prefetch_next_sub_modules( + sub_module, + numel=self.prefetch_elements, + nvme=self.params_in_nvme_and_cpu) see_memory_usage( f"Before sub module function {sub_module.__class__.__name__} after prefetch", force=False) @@ -1152,11 +1530,13 @@ def pre_sub_module_forward_function(self, sub_module): def post_sub_module_forward_function(self, sub_module): see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} before release", + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) + self.param_coordinator.release_sub_module(sub_module) + see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} after release", + f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", force=False) def pre_sub_module_backward_function(self, sub_module): @@ -1171,17 +1551,17 @@ def pre_sub_module_backward_function(self, sub_module): def post_sub_module_backward_function(self, sub_module): see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} before release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) self.param_coordinator.release_sub_module(sub_module) see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} after release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", force=False) def _release_ipg_buffers(self): if self.contiguous_gradients: self.ipg_buffer = None - if not self.cpu_offload: + if not self.offload_optimizer and self.is_gradient_accumulation_boundary: self.grads_in_partition = None self.grads_in_partition_offset = 0 @@ -1191,35 +1571,93 @@ def _optimizer_step(self, sub_group_id): fp32_param = self.fp32_partitioned_groups_flat[sub_group_id] fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] self.optimizer.param_groups[param_group_id]['params'] = [fp32_param] + self.optimizer.step() self.optimizer.param_groups[param_group_id]['params'] = [] - fp16_param.data.copy_(fp32_param.data) + + if fp16_param is not None: + fp16_param.data.copy_(fp32_param.data) + else: + #synchronize incase there is a previous write going on the reuse buffer + self.fp16_groups[sub_group_id][0].nvme_swapper.synchronize_writes() + self.param_group_fp16_flat_reuse_buffer.narrow( + 0, + 0, + fp32_param.numel()).data.copy_(fp32_param.data) + + def _swappable_optimizer_subgroup(self, sub_group_id): + if not self.swap_optimizer: + return False + + return 
self.optimizer_swapper.swappable_tensor( + None, + numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) + + def _partitioned_params_swap_out(self, i): + swap_out_params = [] + offset = 0 + for param, partitioned_param in zip(self.fp16_groups[i], self.fp16_partitioned_groups[i]): + src = self.param_group_fp16_flat_reuse_buffer.narrow( + 0, + offset, + partitioned_param.ds_numel) + if partitioned_param.status == PartitionedParamStatus.AVAILABLE: + partitioned_param.data.copy_(src.data) + else: + partitioned_param.data = src.data + #Setting it to available just for good practice. It will be released at the end of the call + #by swap out and release + partitioned_param.status = PartitionedParamStatus.AVAILABLE + swap_out_params.append(param) + offset += partitioned_param.ds_numel + + if len(swap_out_params) > 0: + #The write synchronize will happen before the buffer is reused in _optimizer_step so the buffer can be released + swap_out_params[0].nvme_swapper.swap_out_and_release( + swap_out_params, + async_op=True, + force_buffer_release=True) def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) - largest_numel = max([t.numel() for t in self.fp16_partitioned_groups_flat]) + largest_numel = max( + [sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) gradient_dtype = self.fp32_partitioned_groups_flat[0].dtype gradient_buffer = torch.zeros(int(largest_numel), dtype=gradient_dtype, device=self.device) + timers = self.timers + timer_names = set() + + if self.swap_optimizer: + self.optimizer_swapper.init_timers() + + INIT_OPTIMIZER_TIMER = 'init_optimizer_state' + timer_names.add(INIT_OPTIMIZER_TIMER) + self.start_timers([INIT_OPTIMIZER_TIMER]) + for i, group in enumerate(self.fp16_groups): + swappable_optimizer_subgroup = self._swappable_optimizer_subgroup(i) + swappable_param_subgroup = self.fp16_partitioned_groups_flat[i] is None + + num_elements = int(self.fp16_partitioned_groups_flat_numel[i]) + see_memory_usage( - f'[Begin] Initialize optimizer states {i} / {num_subgroups} subgroups', + f'[Begin] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}', force=False) - num_elements = int(self.fp16_partitioned_groups_flat[i].numel()) - if self.cpu_offload and not self.cpu_offload_use_pin_memory: - self.fp32_partitioned_groups_flat[i].grad = torch.zeros( - num_elements, - dtype=gradient_dtype, - device=self.device) - elif self.cpu_offload_use_pin_memory: - self.fp32_partitioned_groups_flat[i].grad = torch.zeros( - num_elements, - dtype=gradient_dtype, - device=self.device).pin_memory() + if swappable_optimizer_subgroup: + self._optimizer_states_and_gradient_swap_in(i, timer_names) + + if self.offload_optimizer and not swappable_optimizer_subgroup: + subgroup_gradient_buffer = torch.zeros(num_elements, + dtype=gradient_dtype, + device=self.device) + if self.offload_optimizer_pin_memory: + subgroup_gradient_buffer = subgroup_gradient_buffer.pin_memory() + self.fp32_partitioned_groups_flat[i].grad = subgroup_gradient_buffer else: self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow( 0, @@ -1228,14 +1666,27 @@ def initialize_optimizer_states(self): self._optimizer_step(i) + if swappable_optimizer_subgroup: + self._optimizer_states_and_gradient_swap_out(i, timer_names) + + if swappable_param_subgroup: + self._partitioned_params_swap_out(i) + see_memory_usage( - f'[End] Initialize optimizer states {i} / {num_subgroups} 
subgroups', + f'[End] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}', force=False) - if not self.cpu_offload: + self.stop_timers([INIT_OPTIMIZER_TIMER]) + self.log_timers(timer_names) + + if self.swap_optimizer: + self.optimizer_swapper.log_timers() + + if not self.offload_optimizer: for group in self.fp32_partitioned_groups_flat: group.grad = None + # Reset steps return ######################################################################### @@ -1294,7 +1745,7 @@ def independent_gradient_partition_epilogue(self): #in case of cpu offload, averaged gradients are already in fp32_partitioned_groups_flat.grad #TODO: use a similar code path for both cpu_offload and non-cpu offload - if not self.cpu_offload: + if not self.offload_optimizer: for i, sub_group in enumerate(self.fp16_groups): self.averaged_gradients[i] = [ torch.zeros_like(param.ds_tensor) if param.grad is None else @@ -1516,7 +1967,7 @@ def set_grad_positions(self): current_offset = 0 for param in group: param_id = self.get_param_id(param) - num_elements = param.ds_tensor.numel() + num_elements = param.ds_tensor.ds_numel self.grad_position[param_id] = [ int(i), @@ -1532,7 +1983,7 @@ def async_accumulate_grad_in_cpu_via_gpu(self, param, acc_grad_cpu_partition): dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( 0, 0, - param.ds_tensor.numel()) + param.ds_tensor.ds_numel) if self.micro_step_id > 0: dest_buffer.copy_(acc_grad_cpu_partition.view(-1), non_blocking=True) @@ -1606,7 +2057,7 @@ def partition_previous_reduced_grads(self): if not self.previous_reduced_grads: return - if self.cpu_offload: + if self.offload_optimizer: allocate_grads_in_partition = self.grads_in_partition is None\ and self.gradient_accumulation_steps > 1 else: @@ -1618,12 +2069,12 @@ def partition_previous_reduced_grads(self): for i, group in enumerate(self.fp16_groups): total_size = 0 for param_in_partition in group: - total_size += param_in_partition.ds_tensor.numel() + total_size += param_in_partition.ds_tensor.ds_numel see_memory_usage( f"group {i} before creating {total_size} reduced gradients into partition", force=False) - if self.cpu_offload_use_pin_memory: + if self.offload_param_pin_memory: self.grads_in_partition.append( torch.zeros(int(total_size), dtype=torch.half, @@ -1637,51 +2088,73 @@ def partition_previous_reduced_grads(self): f"group {i} after creating {total_size} reduced gradients into partition", force=False) - for param in self.previous_reduced_grads: + if self.offload_optimizer: + offload_fp32_gradients = {} + offload_fp32_offsets = {} - [i, dest_offset, num_elements] = self.grad_position[self.get_param_id(param)] - - # self.debug_fp16_grads[i][self.get_param_id(param)] = ( - # float(param.data.float().norm(2)), - # float(param.grad.data.float().norm(2))) - - if self.cpu_offload: - - param.partition_gradients(partition_buffers=self.temp_grad_gpu_buffer) - with torch.cuda.stream(self.copy_grad_stream): - self.reduction_stream.synchronize() - - if self.gradient_accumulation_steps > 1: + with torch.cuda.stream(self.copy_grad_stream): + self.reduction_stream.synchronize() + for param in self.previous_reduced_grads: + + [i, + dest_offset, + num_elements] = self.grad_position[self.get_param_id(param)] + + if self.offload_optimizer: + param.partition_gradients( + partition_buffers=self.temp_grad_gpu_buffer) + #with torch.cuda.stream(self.copy_grad_stream): + # self.reduction_stream.synchronize() + + if 
self.gradient_accumulation_steps > 1: + # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer + fp16_grad_tensor = self.grads_in_partition[i].narrow( + 0, + dest_offset, + num_elements) + self.async_accumulate_grad_in_cpu_via_gpu( + param, + fp16_grad_tensor) + + if self.is_gradient_accumulation_boundary: + + self.set_norm_for_param_grad_in_gpu(param) + + self.update_overflow_tracker_for_param_grad(param) + + if self._swappable_optimizer_subgroup(i): + if not i in offload_fp32_gradients.keys(): + offload_fp32_gradients[i] = [] + offload_fp32_offsets[i] = [] + + offload_fp32_gradients[i].append(param.grad.view(-1).float()) + param.grad = None + offload_fp32_offsets[i].append(dest_offset) + else: + fp32_grad_tensor = self.fp32_partitioned_groups_flat[ + i].grad.narrow(0, + dest_offset, + num_elements) + + self.async_inplace_copy_grad_to_fp32_buffer_from_gpu( + param, + fp32_grad_tensor) + else: # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer fp16_grad_tensor = self.grads_in_partition[i].narrow( 0, dest_offset, num_elements) - self.async_accumulate_grad_in_cpu_via_gpu(param, fp16_grad_tensor) - - if self.is_gradient_accumulation_boundary: + param.partition_gradients( + partition_buffers=fp16_grad_tensor, + accumulate=True if self.micro_step_id > 0 else False) - self.set_norm_for_param_grad_in_gpu(param) - - self.update_overflow_tracker_for_param_grad(param) - - fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow( - 0, - dest_offset, - num_elements) - - self.async_inplace_copy_grad_to_fp32_buffer_from_gpu( - param, - fp32_grad_tensor) - else: - # The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - fp16_grad_tensor = self.grads_in_partition[i].narrow( - 0, - dest_offset, - num_elements) - param.partition_gradients( - partition_buffers=fp16_grad_tensor, - accumulate=True if self.micro_step_id > 0 else False) + if self.offload_optimizer and self.swap_optimizer: + for i in offload_fp32_gradients.keys(): + self.optimizer_swapper.swap_out_gradients( + parameter=self.fp32_partitioned_groups_flat[i], + gradient_offsets=offload_fp32_offsets[i], + gradient_tensors=offload_fp32_gradients[i]) self.previous_reduced_grads = [] @@ -1734,7 +2207,7 @@ def are_all_related_partitions_reduced(params_id): self.param_dict[params_id].grad = None def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = _flatten_dense_tensors(tensors) + flatten_tensor = self.flatten(tensors) def print_func(): logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) @@ -1792,7 +2265,7 @@ def set_none_gradients_to_zero(self, i, partition_id): def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): rank = None - tensor = flatten(bucket) + tensor = self.flatten(bucket) tensor_to_allreduce = tensor @@ -1822,7 +2295,7 @@ def allreduce_and_copy(self, small_bucket, rank=None, log=None): with torch.cuda.stream(self.reduction_stream): allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) if rank is None or rank == dist.get_rank(group=self.dp_process_group): - for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)): + for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) def allreduce_no_retain(self, @@ -2041,7 +2514,7 @@ def get_flat_partition(self, if return_tensor_list: return flat_tensor_list - return _flatten_dense_tensors(flat_tensor_list) + return 
self.flatten(flat_tensor_list) def free_grad_in_param_list(self, param_list): for p in param_list: @@ -2071,167 +2544,7 @@ def stop_timers(self, timer_names): for name in timer_names: self.timers(name).stop() - def old_step(self, closure=None): - """ - Not supporting closure. - """ - - self.micro_step_id = INITIAL_MICRO_STEP_ID - - # if self.cpu_offload: - # torch.cuda.current_stream().wait_stream(self.migration_stream) - - print_rank_0(f"Inside Step function") - see_memory_usage(f"In step before checking overflow", force=False) - - print_rank_0("Finished Tracing at Beginning of Step") - self.param_coordinator.hierarchy = 0 - self.param_coordinator.finish_tracing(print_trace=True) - - self.param_coordinator.reset_step() - - print_rank_0("Finished Tracing at Beginning of Step") - - # First compute norm for all group so we know if there is overflow - self.check_overflow() - - timers = self.timers - - OPTIMIZER_STEP = 'optimizer_step' - OPTIMIZER_FP16_UPDATE = 'optimizer_fp16_update' - OPTIMIZER_FP32_GRADIENT = 'optimizer_fp32_gradient' - timer_names = [OPTIMIZER_STEP, OPTIMIZER_FP16_UPDATE, OPTIMIZER_FP32_GRADIENT] - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - see_memory_usage('After overflow before clearing gradients', force=False) - self.zero_grad() - - if self.cpu_offload: - self.reset_cpu_buffers() - else: - self.averaged_gradients = {} - - see_memory_usage('After overflow after clearing gradients', force=False) - - logger.info( - "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - self.start_timers(timer_names) - self.stop_timers(timer_names) - return - - norm_groups = [] - single_partition_grad_groups = [] - skip = False - partition_id = dist.get_rank(group=self.dp_process_group) - - debug_fp32_grads = [{} for _ in self.fp16_groups] - - self.start_timers([OPTIMIZER_FP32_GRADIENT]) - for i, group in enumerate(self.fp16_groups): - - if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.fp16_groups[i])) - - single_grad_partition = self.fp32_partitioned_groups_flat[i].grad - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.fp16_groups[i])) - - # free gradients for all the prameters that are not updated by this process - # self.free_grad_in_param_list(self.params_not_in_partition[i]) - - # create a flat gradients for parameters updated by this process - - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.fp32_partitioned_groups_flat[i].dtype) - - assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[i].numel(), \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format( - single_grad_partition.numel(), self.partition_size[i], i, partition_id) - - self.fp32_partitioned_groups_flat[i].grad = single_grad_partition - - # release all the gradient since we have already created a necessary copy in dp_grad_partition - self.zero_grad() - - self.averaged_gradients[i] = None - - single_partition_grad_groups.append(single_grad_partition) - debug_fp32_grads[i] = [ - (t.clone().detach(), - t) for t in _unflatten_dense_tensors(single_grad_partition, - group) - ] - - self.stop_timers([OPTIMIZER_FP32_GRADIENT]) - - print(f"Norm groups: {norm_groups}") - - 
self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - - #self.dump_pre_step_gradients(debug_fp32_grads) - - self.start_timers([OPTIMIZER_STEP]) - self.optimizer.step() - self.stop_timers([OPTIMIZER_STEP]) - - # get rid of the fp32 gradients. Not needed anymore - if not self.cpu_offload: - for group in self.fp32_partitioned_groups_flat: - group.grad = None - - self.start_timers([OPTIMIZER_FP16_UPDATE]) - for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, self.fp32_partitioned_groups_flat): - fp16_partitions.data.copy_(fp32_partition.data) - self.stop_timers([OPTIMIZER_FP16_UPDATE]) - - print( - f"fp16 groups norm : {[group_flat.norm() for group_flat in self.fp16_partitioned_groups_flat]}" - ) - if self.cpu_offload: - self.reset_cpu_buffers() - - # TODO: we probably don't need this? just to be safe - for i in range(len(norm_groups)): - #for p in self.fp16_groups[i]: - # p.data=p.ds_tensor - - updated_params = _unflatten_dense_tensors( - self.fp16_partitioned_groups_flat[i], - self.fp16_partitioned_groups[i]) - for partitioned_param, q in zip(self.fp16_partitioned_groups[i], updated_params): - # print(f"Grad fn: {p.grad_fn}") - # p.data = torch.ones(1).half().cuda() - partitioned_param.data = q.data - - #Gathering persisting parameters - self.persistent_parameters[0].all_gather(self.persistent_parameters) - - #self.dump_post_step_gradients() - self.debug_fp16_grads = [{} for _ in self.fp16_groups] - - if self.cpu_offload: - self.reset_cpu_buffers() - - self.log_timers(timer_names) - - see_memory_usage('After zero_optimizer step', force=False) - print_rank_0(f"------------------Finishing Step-----------------------", - force=False) - return - def _pre_step(self): - self.micro_step_id = INITIAL_MICRO_STEP_ID print_rank_0(f"Inside Step function") @@ -2248,7 +2561,7 @@ def _pre_step(self): def _get_norm_groups(self): norm_groups = [] for i, group in enumerate(self.fp16_groups): - if self.cpu_offload: + if self.offload_optimizer: norm_groups.append( self.complete_grad_norm_calculation_for_cpu_offload( self.fp16_groups[i])) @@ -2259,12 +2572,10 @@ def _get_norm_groups(self): return norm_groups def _prepare_fp32_grad_for_sub_group(self, sub_group_id): - partition_id = dist.get_rank(group=self.dp_process_group) - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[sub_group_id]).to( - self.fp32_partitioned_groups_flat[sub_group_id].dtype) + single_grad_partition = self.flatten(self.averaged_gradients[sub_group_id]).to( + self.fp32_partitioned_groups_flat[sub_group_id].dtype) assert single_grad_partition.numel() == self.fp32_partitioned_groups_flat[sub_group_id].numel(), \ "averaged gradients have different number of elements that partition size {} {} {} {}".format( @@ -2280,25 +2591,95 @@ def _prepare_fp32_grad_for_sub_group(self, sub_group_id): def _prepare_sub_group(self, sub_group_id, timer_names=set()): see_memory_usage(f'Before prepare optimizer sub group {sub_group_id}', force=False) - if not self.cpu_offload: + if self._swappable_optimizer_subgroup(sub_group_id): + self._optimizer_states_and_gradient_swap_in(sub_group_id, timer_names) + elif not self.offload_optimizer: self._prepare_fp32_grad_for_sub_group(sub_group_id) see_memory_usage(f'After prepare optimizer sub group {sub_group_id}', force=False) + def _optimizer_states_and_gradient_swap_in(self, sub_group_id, timer_names=set()): + param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] + fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + 
assert self._swappable_optimizer_subgroup(sub_group_id), \ + f'Parameter {fp32_param_id} of numel={param_length} is not swappable' + + OPTIMIZER_SWAP_IN_STATE = 'optimizer_swap_in_state' + see_memory_usage(f'pre-step Before swapping in optimizer tensors {sub_group_id}', + force=False) + self.start_timers([OPTIMIZER_SWAP_IN_STATE]) + + self.optimizer_swapper.swap_in_optimizer_state( + parameter=self.fp32_partitioned_groups_flat[sub_group_id], + async_parameter=self.next_swappable_fp32_partitioned_groups[sub_group_id]) + + self.stop_timers([OPTIMIZER_SWAP_IN_STATE]) + timer_names.add(OPTIMIZER_SWAP_IN_STATE) + see_memory_usage(f'pre-step After swapping in optimizer tensors {sub_group_id}', + force=False) + def _release_sub_group(self, sub_group_id, timer_names=set()): see_memory_usage(f'Before release optimizer sub group {sub_group_id}', force=False) # get rid of the fp32 gradients. Not needed anymore - if not self.cpu_offload: + if not self.offload_optimizer: self.fp32_partitioned_groups_flat[sub_group_id].grad = None + if self._swappable_optimizer_subgroup(sub_group_id): + self._optimizer_states_and_gradient_swap_out(sub_group_id, timer_names) see_memory_usage(f'After release optimizer sub group {sub_group_id}', force=False) + # create a flat tensor aligned at the alignment boundary + def flatten_dense_tensors_aligned(self, tensor_list, alignment): + num_elements = 0 + for tens in tensor_list: + num_elements = num_elements + tens.numel() + + remaining = num_elements % alignment + + if remaining: + elements_to_add = alignment - remaining + pad_tensor = torch.zeros(elements_to_add, + device=tensor_list[0].device, + dtype=tensor_list[0].dtype) + padded_tensor_list = tensor_list + [pad_tensor] + + num_elements = num_elements + elements_to_add + else: + padded_tensor_list = tensor_list + + return self.flatten(padded_tensor_list) + + def _optimizer_states_and_gradient_swap_out(self, sub_group_id, timer_names=set()): + param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] + fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + assert self._swappable_optimizer_subgroup(sub_group_id), \ + f'Parameter {fp32_param_id} of numel={param_length} is not swappable' + + OPTIMIZER_SWAP_OUT_STATE = 'optimizer_swap_out_state' + see_memory_usage( + f'post-step Before swapping out optimizer tensors {sub_group_id}', + force=False) + self.start_timers([OPTIMIZER_SWAP_OUT_STATE]) + + self.optimizer_swapper.swap_out_optimizer_state( + parameter=self.fp32_partitioned_groups_flat[sub_group_id], + async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is + not None) + + self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) + see_memory_usage( + f'post-step After swapping out optimizer tensors {sub_group_id}', + force=False) + timer_names.add(OPTIMIZER_SWAP_OUT_STATE) + + # get rid of the fp32 gradients. 
Not needed anymore + self.fp32_partitioned_groups_flat[sub_group_id].grad = None + def _unflatten_partitioned_parameters(self, sub_group_id): - updated_params = _unflatten_dense_tensors( - self.fp16_partitioned_groups_flat[sub_group_id], - self.fp16_partitioned_groups[sub_group_id]) + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]) for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params): partitioned_param.data = q.data @@ -2307,7 +2688,7 @@ def _overflow_clean_up(self, prev_scale): see_memory_usage('After overflow before clearing gradients', force=False) self.zero_grad() - if self.cpu_offload: + if self.offload_optimizer: self.reset_cpu_buffers() else: self.averaged_gradients = {} @@ -2336,17 +2717,28 @@ def _overflow_check_and_loss_scale_update(self): return self.overflow def _post_step(self, timer_names=set()): - if self.cpu_offload: + if self.offload_optimizer: self.reset_cpu_buffers() #Gathering persisting parameters - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].all_gather(self.persistent_parameters) + + if self.swap_optimizer: + self.optimizer_swapper.log_timers() self.log_timers(timer_names) see_memory_usage('After zero_optimizer step', force=False) print_rank_0(f"------------------Finishing Step-----------------------") + def _reassign_or_swap_out_partitioned_parameters(self, sub_group_id): + if self.fp16_partitioned_groups_flat[sub_group_id] is not None: + #unflatten fp16 parameter subgroup + self._unflatten_partitioned_parameters(sub_group_id) + else: + self._partitioned_params_swap_out(sub_group_id) + def step(self, closure=None): """ Not supporting closure. 
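
Whether a sub-group takes the swap-out path above or is reassigned into its flat buffer is driven by the offload configuration. A minimal sketch of such a configuration follows; the key names and paths are illustrative, and the authoritative schema is in the Parameter Offloading and Optimizer Offloading sections of the configuration docs added in this change.

```python
# Minimal sketch of a ZeRO-3 config that exercises the NVMe swap paths above
# (key names and paths are illustrative; see the Parameter Offloading and
# Optimizer Offloading sections of the configuration docs for the schema).
ds_config = {
    "train_batch_size": 32,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        # size of each fp16/fp32 parameter sub-group that is stepped and
        # swapped as a unit
        "sub_group_size": 1000000000,
        # parameter offload -> offload_param / params_in_nvme_and_cpu code paths
        "offload_param": {
            "device": "nvme",
            "nvme_path": "/local_nvme",
            "pin_memory": True
        },
        # optimizer state offload to NVMe -> swap_optimizer code paths
        "offload_optimizer": {
            "device": "nvme",
            "nvme_path": "/local_nvme",
            "pin_memory": True
        }
    }
}
```
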
@@ -2355,6 +2747,8 @@ def step(self, closure=None): #checks for overflow, adjust the loss scale accordingly if self._overflow_check_and_loss_scale_update(): + if self.swap_optimizer: + self.optimizer_swapper.log_timers() return norm_groups = self._get_norm_groups() @@ -2379,8 +2773,8 @@ def step(self, closure=None): #release memory or swap out optimizer states of fp32 parameters self._release_sub_group(sub_group_id, timer_names) - #unflatten fp16 parameter subgroup - self._unflatten_partitioned_parameters(sub_group_id) + #put fp16 parameters in appropriate location + self._reassign_or_swap_out_partitioned_parameters(sub_group_id) self.stop_timers(['optimizer_step']) @@ -2404,10 +2798,9 @@ def dump_post_step_gradients(self): for i, group in enumerate(self.fp16_groups): print( f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') - unflat_fp16 = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - unflat_fp32 = _unflatten_dense_tensors(self.fp32_partitioned_groups_flat[i], - self.fp16_groups[i]) + unflat_fp16 = self.unflatten(self.fp16_groups_flat[i], self.fp16_groups[i]) + unflat_fp32 = self.unflatten(self.fp32_partitioned_groups_flat[i], + self.fp16_groups[i]) for j, p in enumerate(self.fp16_groups[i]): param_id = self.get_param_id(p) param_norm = float(p.data.float().norm(2)) @@ -2470,7 +2863,7 @@ def has_overflow(self, partition_gradients=True): self.local_overflow = self._has_inf_or_nan(self.gpu_sum) self.gpu_sum = torch.zeros(1, dtype=torch.float).cuda() - overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( + overflow = self.local_overflow if self.offload_optimizer else self.has_overflow_partitioned_grads_serial( ) #overflow = self.has_overflow_partitioned_grads_serial() overflow_gpu = torch.cuda.ByteTensor([overflow]) @@ -2529,6 +2922,10 @@ def backward(self, loss, retain_graph=False): print_rank_0( f"Total fully available parameters {self.param_coordinator.total_available_parameter_numel}" ) + + if self.swap_optimizer: + self.optimizer_swapper.pre_backward() + see_memory_usage(f"Before backward", force=False) if self.contiguous_gradients: self.ipg_buffer = [] @@ -2551,6 +2948,9 @@ def backward(self, loss, retain_graph=False): grad computation do not trigger post call and will therefore will remain unpartitioned ''' self._partition_all_parameters() + if self.swap_optimizer: + self.optimizer_swapper.post_backward() + def _partition_all_parameters(self): for name, param in self.module.named_parameters(recurse=True): self.param_coordinator.release_and_reset_parameter(param) @@ -2592,8 +2992,7 @@ def _set_loss_scale(self, value): def _get_lean_tensors(self, padded_flattened_tensor, group_tensors, paddings): # Remove paddings from flattened tensor - individual_tensors = _unflatten_dense_tensors(padded_flattened_tensor, - group_tensors) + individual_tensors = self.unflatten(padded_flattened_tensor, group_tensors) lean_lengths = [t.numel() - pad for t, pad in zip(group_tensors, paddings)] lean_tensors = [t[:len] for t, len in zip(individual_tensors, lean_lengths)] #logger.info(f'rank {dist.get_rank()}: lean_tensors = {[t.numel() for t in lean_tensors]}') @@ -2675,6 +3074,11 @@ def state_dict(self): "ZeRO-3 does not yet support elastic checkpointing, please disable for now." ) + if self.swap_optimizer or self.params_in_nvme_and_cpu: + raise NotImplementedError( + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." 
+ ) + return self._rigid_state_dict() @@ -2714,14 +3118,14 @@ def _get_flattened_partition(self, all_partition_states): local_state_partitions = [] for param_index, param_slices in enumerate(param_partitions): - flattened_merged_tensor = flatten_dense_tensors_aligned( + flattened_merged_tensor = self.flatten_dense_tensors_aligned( param_slices, alignment) new_partitions = self.get_data_parallel_partitions(flattened_merged_tensor) local_state_partitions.append(new_partitions[partition_id]) if torch.is_tensor(local_state_partitions[0]): - return flatten_dense_tensors_aligned(local_state_partitions, alignment) + return self.flatten_dense_tensors_aligned(local_state_partitions, alignment) # Assume non-tensor states are not partitioned and equal across ranks, so return first one return local_state_partitions[0] @@ -2776,7 +3180,7 @@ def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): # update fp16 unflattened params for sub_group_id in range(len(self.fp16_partitioned_groups_flat)): - updated_params = _unflatten_dense_tensors( + updated_params = self.unflatten( self.fp16_partitioned_groups_flat[sub_group_id], self.fp16_partitioned_groups[sub_group_id]) @@ -2817,19 +3221,26 @@ def load_state_dict(self, raise NotImplementedError( "ZeRO-3 does not yet support elastic checkpointing, please disable for now." ) - else: - self._rigid_load_state_dict( - state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states=load_optimizer_states) - self.persistent_parameters[0].partition(self.persistent_parameters) - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if self.swap_optimizer or self.params_in_nvme_and_cpu: + raise NotImplementedError( + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." + ) + + self._rigid_load_state_dict( + state_dict_list[dist.get_rank(group=self.dp_process_group)], + load_optimizer_states=load_optimizer_states) + + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].partition(self.persistent_parameters) + self.persistent_parameters[0].all_gather(self.persistent_parameters) def save_checkpoint_prologue(self): self._partition_all_parameters() def save_checkpoint_epilogue(self): - self.persistent_parameters[0].all_gather(self.persistent_parameters) + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].all_gather(self.persistent_parameters) def _handle_overflow(cpu_sum, x, i): diff --git a/deepspeed/runtime/zero/tiling.py b/deepspeed/runtime/zero/tiling.py new file mode 100644 index 000000000000..d78fc81515e4 --- /dev/null +++ b/deepspeed/runtime/zero/tiling.py @@ -0,0 +1,294 @@ +import torch +import deepspeed +from deepspeed.runtime.utils import partition_uniform as partition + + +def split_tensor_along_last_dim(tensor, partitions, contiguous_split_chunks=False): + """Split a tensor along its last dimension. Adapted from Megatron-LM. + + Arguments: + tensor: input tensor. + partitions: list of partition sizes to supply to torch.split + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + # Split. + tensor_list = torch.split(tensor, partitions, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. 
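
A small, self-contained illustration of the note above, with made-up sizes: the views returned by `torch.split` share storage with the input and are generally not contiguous, which is why the optional `.contiguous()` copy is offered.

```python
import torch

x = torch.arange(12.0).reshape(2, 6)
# Uneven split sizes along the last dimension, as produced when a feature
# dimension does not divide evenly into tiles.
chunks = torch.split(x, [4, 2], dim=-1)
print([c.shape for c in chunks])            # [torch.Size([2, 4]), torch.Size([2, 2])]
print([c.is_contiguous() for c in chunks])  # typically [False, False]: they are views
contiguous_chunks = tuple(c.contiguous() for c in chunks)
```
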
+ if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + +class TiledLinear(torch.nn.Module): + def __init__(self, + in_features, + out_features, + bias=True, + in_splits=1, + out_splits=1, + input_is_already_split=False, + combine_out_splits=True, + linear_cls=torch.nn.Linear, + init_linear=None, + **kwargs): + """A replacement for ``torch.nn.Linear`` that works with ZeRO-3 to reduce + memory requirements via tiling. + + TiledLinear breaks the input and output dimensions of a linear layer + into tiles that are processed in sequence. This class enables huge + linear layers when combined with ZeRO-3 because inactive tiles can be + partitioned and offloaded. + + .. note:: + We recommend using as few tiles as necessary. Tiling + significantly reduces memory usage, but can reduce throughput + for inexpensive layers. This due to the smaller kernels having + less parallelism and lower arithmetic intensity, while + introducing more frequent synchronization and communication. + + Args: + in_features (int): See ``torch.nn.Linear`` + out_features (int): See ``torch.nn.Linear`` + bias (bool, optional): See ``torch.nn.Linear`` + in_splits (int, optional): The number of tiles along the input dimension. Defaults to 1. + out_splits (int, optional): The number of tiles along the output dimension. Defaults to 1. + input_is_already_split (bool, optional): If set to ``True``, assume that the ``input_`` in + to ``forward()`` is already split into ``in_splits`` chunks. Defaults to ``False``. + combine_out_splits (bool, optional): If set to ``False``, do not combine the ``out_splits`` outputs + into a single tensor. Defaults to ``True``. + linear_cls (class, optional): The underlying class to build individual tiles. + Defaults to ``torch.nn.Linear``. + init_linear (``torch.nn.Linear``, optional): If set, copy the parameters of + ``init_linear``. Useful for debugging. Defaults to ``None``. + kwargs (dict, optional): additional keyword arguments to provide to ``linear_cls()``. + + Raises: + RuntimeError: ``in_splits`` must be within the range [1, in_features). + RuntimeError: ``out_splits`` must be within the range of [1, out_features). + """ + + super().__init__() + + if (in_splits < 1) or (in_splits > in_features): + raise RuntimeError('in splits must be in range [1, in_features].') + if (out_splits < 1) or (out_splits > out_features): + raise RuntimeError('out splits must be in range [1, out_features].') + + # global, not necessarily local + self.in_features = in_features + self.out_features = out_features + self.use_bias = bias + + self.out_splits = out_splits + self.in_splits = in_splits + self.input_is_already_split = input_is_already_split + self.combine_out_splits = combine_out_splits + + # Build partition-lists. These are CSR-style splits [0, part0, part1, ..., features] + # For example, row_parts[p] gives the start of partition p and row_parts[p+1] + # is the exclusive end. 
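
As an illustration of the boundary-list shape that the asserts below rely on (the stand-in helper here is only a sketch; the real `partition_uniform` may place remainder elements differently):

```python
# Illustrative stand-in for deepspeed.runtime.utils.partition_uniform: a list
# of num_parts + 1 boundaries, starting at 0 and ending at num_items.
def uniform_boundaries(num_items, num_parts):
    base, rem = divmod(num_items, num_parts)
    bounds = [0]
    for p in range(num_parts):
        bounds.append(bounds[-1] + base + (1 if p < rem else 0))
    return bounds

in_parts = uniform_boundaries(num_items=10, num_parts=4)        # [0, 3, 6, 8, 10]
local_dims = [in_parts[p + 1] - in_parts[p] for p in range(4)]  # [3, 3, 2, 2]
```
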
+ self.in_parts = partition(num_items=in_features, num_parts=in_splits) + self.out_parts = partition(num_items=out_features, num_parts=out_splits) + + assert len(self.out_parts) == out_splits + 1 + assert len(self.in_parts) == in_splits + 1 + assert self.out_parts[0] == 0 + assert self.out_parts[out_splits] == out_features + assert self.in_parts[in_splits] == in_features + + self.linears = torch.nn.ModuleList() + for out_id in range(out_splits): + self.linears.append(torch.nn.ModuleList()) + + local_out_dim = self.out_parts[out_id + 1] - self.out_parts[out_id] + + for in_id in range(in_splits): + #if input_size is split, we only need one bias + local_bias = bias if in_id == (in_splits - 1) else False + + local_in_dim = self.in_parts[in_id + 1] - self.in_parts[in_id] + local = linear_cls(local_in_dim, + local_out_dim, + bias=local_bias, + **kwargs) + self.linears[out_id].append(local) + + # Optionally initialize with a known tensor + if init_linear is not None: + self.copy_params_from(init_linear) + + def forward(self, input_): + if self.in_splits > 1 and not self.input_is_already_split: + split_sizes = [ + self.in_parts[p + 1] - self.in_parts[p] for p in range(self.in_splits) + ] + inputs = self._split_global_input(input_, split_sizes) + elif self.in_splits > 1: + inputs = input_ + assert len(inputs) == self.in_splits, f"Col splits {self.in_splits} does not match input splits {len(inputs)}" + else: + # no splits + inputs = [input_] + + outputs = [None] * self.out_splits + for out_id in range(self.out_splits): + for in_id in range(self.in_splits): + local_output = self.linears[out_id][in_id](inputs[in_id]) + + outputs[out_id] = self._reduce_local_output(in_id=in_id, + out_id=out_id, + current_out=outputs[out_id], + new_out=local_output) + + if self.combine_out_splits: + return self._combine_output_splits(outputs) + + return outputs + + def _split_global_input(self, input, split_sizes): + """Partition an input tensor along the last dimension, aligned with given splits. + + Subclasses should override this method to account for new input types. + + Args: + input (List[Tensor]): The tensor to partition along the last dimension. + split_sizes (List[int]): The size of each partition. + + Returns: + List[Any]: A list of the chunks of ``input``. + """ + return split_tensor_along_last_dim(input, split_sizes) + + def _reduce_local_output(self, in_id, out_id, current_out, new_out): + """Reduce (sum) a new local result into the existing local results. + + Subclasses should override this method. + + For a given ``out_id``, this method is called ``in_id-1`` times. The first input + split is a simple assignment. + + Args: + in_id (int): The input split that produced ``new_out``. + out_id (int): The output split that produced ``new_out``. + current_out (Any): The reduced form of all previous ``out_id`` results. + new_out (Any): The local result from forward (``in_id``, ``out_id``)e + + Returns: + Any: The combined result of ``current_out`` and ``new_out``. + """ + + if current_out is None: + #this clone is necessary to preserve auto grad + #there is some issue with inplace update for outputs that are views + return new_out.clone() + else: + return current_out + new_out + + def _combine_output_splits(self, outputs): + """Join the splits of the output into a single result. + + Args: + outputs (List[Any]): The reduced outputs for each output split. + + Returns: + Any: The combined outputs. 
+ """ + assert len(outputs) == self.out_splits + return torch.cat(outputs, dim=-1) + + @torch.no_grad() + def copy_params_from(self, other): + """Copy the weight and bias data from ``other``. + + This is especially useful for reproducible initialization and testing. + + Equivalent to: + + .. code-block:: python + + with torch.no_grad(): + self.weight.copy_(other.weight) + if self.bias is not None: + self.bias.copy_(other.bias) + + .. note:: + If ZeRO-3 is enabled, this is a collective operation and the + updated parameters of data-parallel rank 0 will be visible on all + ranks. See :class:`deepspeed.zero.GatheredParameters` for more + information. + + + Args: + other (``torch.nn.Linear``): the linear layer to copy from. + """ + assert hasattr(other, 'weight') + assert other.weight.size() == (self.out_features, self.in_features) + if self.use_bias: + assert hasattr(other, 'bias') + assert other.bias is not None + assert other.bias.size() == (self.out_features, ) + else: + assert other.bias is None + + for row in range(self.out_splits): + rstart = self.out_parts[row] + rstop = self.out_parts[row + 1] + + for col in range(self.in_splits): + cstart = self.in_parts[col] + cstop = self.in_parts[col + 1] + + local = self.linears[row][col] + global_weight = other.weight[rstart:rstop, cstart:cstop] + with deepspeed.zero.GatheredParameters(local.weight, modifier_rank=0): + local.weight.copy_(global_weight) + + if local.bias is not None: + with deepspeed.zero.GatheredParameters(local.bias, modifier_rank=0): + local.bias.data.copy_(other.bias[rstart:rstop].data) + + +class TiledLinearReturnBias(TiledLinear): + """Wrapper for a Linear class that returns its own bias parameter, such as + used by Megatron-LM. + """ + def _reduce_local_output(self, in_id, out_id, current_out, new_out): + """Reduces output tensors, but not the returned bias. 
""" + if current_out is not None: + old_tensor, old_bias = current_out + else: + old_tensor, old_bias = None, None + + assert isinstance(new_out, tuple) + assert len(new_out) == 2 + + tensor, bias = new_out + assert tensor is not None + + tensor = super()._reduce_local_output(in_id=in_id, + out_id=out_id, + current_out=old_tensor, + new_out=tensor) + + if bias is None: + bias = old_bias + + return tensor, bias + + def _combine_output_splits(self, outputs): + # stack output tensors + tensors = [o[0] for o in outputs] + tensor = super()._combine_output_splits(tensors) + + # stack biases if applicable + biases = [o[1] for o in outputs if o[1] is not None] + if len(biases) > 0: + bias = super()._combine_output_splits(biases) + else: + bias = None + + return tensor, bias diff --git a/docs/_config.yml b/docs/_config.yml index 19d679042b90..a39298be04f9 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -33,15 +33,22 @@ collections: - advanced-install.md - getting-started.md - azure.md - - cifar-10.md - - bert-pretraining.md - bert-finetuning.md - - transformer_kernel.md + - bert-pretraining.md + - cifar-10.md + - flops-profiler.md + - gan.md + - lrrt.md - megatron.md - one-cycle.md - - lrrt.md + - onebit-adam.md + - onebit-lamb.md + - pipeline.md + - progressive_layer_dropping.md + - sparse-attention.md + - transformer_kernel.md + - zero-offload.md - zero.md - - flops-profiler.md defaults: - scope: diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 318cb2213404..6ab28bb84fd4 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -43,6 +43,10 @@ lnav: url: /docs/config-json/#gradient-clipping - title: "ZeRO optimizations" url: /docs/config-json/#zero-optimizations-for-fp16-training + - title: "Parameter Offloading" + url: /docs/config-json/#parameter-offloading + - title: "Optimizer Offloading" + url: /docs/config-json/#optimizer-offloading - title: "Logging" url: /docs/config-json/#logging - title: "Flops Profiler" @@ -76,6 +80,8 @@ lnav: url: /tutorials/one-cycle/ - title: "One-Bit Adam" url: /tutorials/onebit-adam/ + - title: "One-Bit LAMB" + url: /tutorials/onebit-lamb/ - title: "Pipeline Parallelism" url: /tutorials/pipeline/ - title: "Progressive Layer Dropping" diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 6c34100095cd..8d33179862ef 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -4,37 +4,40 @@ title: "DeepSpeed Configuration JSON" ### Batch Size Related Parameters -**Note:** configuring ***train\_batch\_size*** is required. +**Note:** configuring **train_batch_size** is required. {: .notice--warning} -***train\_batch\_size***: [integer] +**train_batch_size**: [integer] | Value | Example | | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| The effective training batch size. This is the amount of data samples that leads to one step of model update. ***train\_batch\_size*** is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., ***train\_step\_batch\_size***), the gradient accumulation steps (a.k.a., ***gradient\_accumulation\_steps***), and the number of GPUs. 
| `32` | -***train\_micro\_batch\_size\_per\_gpu***: [integer] +| The effective training batch size. This is the amount of data samples that leads to one step of model update. **train_batch_size** is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., **train_step_batch_size**), the gradient accumulation steps (a.k.a., **gradient_accumulation_steps**), and the number of GPUs. | `32` | + + +**train_micro_batch_size_per_gpu**: [integer] + | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------ | -| Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, ***gradient\_accumulation\_steps*** is automatically calculated using ***train\_batch\_size*** and number of GPUs. Should not be concurrently specified with ***gradient\_accumulation\_steps*** in the configuration JSON. | ***train\_batch\_size*** value | +| Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, **gradient_accumulation_steps** is automatically calculated using **train_batch_size** and number of GPUs. Should not be concurrently specified with **gradient_accumulation_steps** in the configuration JSON. | **train_batch_size** value | -***gradient\_accumulation\_steps***: [integer] +**gradient_accumulation_steps**: [integer] | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, ***train\_step\_batch\_size*** is automatically calculated using ***train\_batch\_size*** and number of GPUs. Should not be concurrently specified with ***train\_step\_batch\_size*** in the configuration JSON. | `1` | +| Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, **train_step_batch_size** is automatically calculated using **train_batch_size** and number of GPUs. Should not be concurrently specified with **train_step_batch_size** in the configuration JSON. 
| `1` | ### Optimizer Parameters -***optimizer***: [dictionary] +**optimizer**: [dictionary] | Fields | Value | Example | | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------- | -| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, and **Lamb** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | +| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, **Lamb**, and **OneBitLamb** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` | | params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` | - Example of ***optimizer*** with Adam + Example of **optimizer** with Adam ```json "optimizer": { @@ -57,7 +60,7 @@ The Adam optimizer also supports the following two params keys/values in additio | torch\_adam | Use torch's implementation of adam instead of our fused adam implementation | false | | adam\_w\_mode | Apply L2 regularization (also known as AdamW) | true | - Another example of ***optimizer*** with 1-bit Adam +Another example of **optimizer** with 1-bit Adam specific parameters is as follows. 
```json "optimizer": { @@ -79,14 +82,53 @@ The Adam optimizer also supports the following two params keys/values in additio The 1-bit Adam optimizer supports the following three params keys/values in addition to the standard Adam (learn more in our [tutorial](/tutorials/onebit-adam/)): +| "params" key | Description | Default | +| ------------------- | ---------------------------------------------------------------------------------- | ------- | +| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | +| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | +| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | + +Another example of ***optimizer*** with 1-bit LAMB + +```json +"optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 11e-3, + "weight_decay": 0.01, + "bias_correction": false, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 1000, + "cuda_aware": false, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 4.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + } +``` + +The 1-bit LAMB optimizer supports the following params keys/values in addition to the standard LAMB (learn more in our [tutorial](/tutorials/onebit-lamb/)): + | "params" key | Description | Default | | ------------- | --------------------------------------------------------------------------- | ------- | -| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | -| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | -| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | +| max\_coeff | Scaling coefficient upper bound for original LAMB algorithm and 1-bit LAMB's warmup stage | 10.0 | +| min\_coeff | Scaling coefficient lower bound for original LAMB algorithm and 1-bit LAMB's warmup stage | 0.01 | +| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 | +| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false | +| comm\_backend\_name | To indicate which backend implementation to use | "nccl" | +| coeff\_beta | Coefficient used for computing running averages of lamb coefficient | 0.9 | +| factor\_max | Maximum value of scaling factor to the frozen lamb coefficient during compression stage | 4.0 | +| factor\_min | Minimum value of scaling factor to the frozen lamb coefficient during compression stage | 0.5 | +| factor\_threshold | Threshold of how much the scaling factor can fluctuate between steps | 0.1 | ### Scheduler Parameters + +DeepSpeed calls the `step()` method of the scheduler at every training step when `model_engine.step()` is executed. + ***scheduler***: [dictionary] | Fields | Value | Example | @@ -94,7 +136,7 @@ The 1-bit Adam optimizer supports the following three params keys/values in addi | type | The scheduler name. See [here](https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. | `"WarmupLR"` | | params | Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. 
| `{"warmup_min_lr": 0, "warmup_max_lr": 0.001}` | -Example of ***scheduler*** +Example of **scheduler** ```json "scheduler": { @@ -109,25 +151,25 @@ Example of ***scheduler*** ### Communication options -***fp32\_allreduce***: [boolean] +**fp32_allreduce**: [boolean] | Description | Default | | -------------------------------------------------------------- | ------- | | During gradient averaging perform allreduce with 32 bit values | `false` | -***prescale\_gradients***: [boolean] +**prescale_gradients**: [boolean] | Description | Default | | -------------------------------------- | ------- | | Scale gradients before doing allreduce | `false` | -***gradient_predivide_factor***: [float] +**gradient_predivide_factor**: [float] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs | `1.0` | -***sparse\_gradients***: [boolean] +**sparse_gradients**: [boolean] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------ | ------- | @@ -138,7 +180,7 @@ Example of ***scheduler*** **Note:** this mode cannot be combined with the `amp` mode described below. {: .notice--warning} -***fp16***: [dictionary] +**fp16**: [dictionary] | Description | Default | | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | @@ -155,45 +197,48 @@ Example of ***scheduler*** } ``` -***fp16:enabled***: [boolean] +**fp16:enabled**: [boolean] | Description | Default | | -------------------------------------------------------------------------------------- | ------- | -| ***enabled*** is a **fp16** parameter indicating whether or not FP16 training enabled. | `false` | +| **enabled** is a **fp16** parameter indicating whether or not FP16 training enabled. | `false` | -***fp16:loss\_scale***: [float] +**fp16:loss_scale**: [float] | Description | Default | | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| ***loss\_scale*** is a ***fp16*** parameter representing the loss scaling value for FP16 training. The default value of 0.0 results in dynamic loss scaling, otherwise the value will be used for static fixed loss scaling. | `0.0` | +| **loss_scale** is a **fp16** parameter representing the loss scaling value for FP16 training. The default value of 0.0 results in dynamic loss scaling, otherwise the value will be used for static fixed loss scaling. 
| `0.0` | + +**fp16:initial_scale_power**: [integer] -***fp16:initial\_scale\_power***: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| ***initial\_scale\_power*** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2***initial\_scale\_power***. | `32` | ***fp16:loss\_scale\_window***: [integer] +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| **initial_scale_power** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2**initial_scale_power**. | `32` | + +**fp16:loss_scale_window**: [integer] | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------- | ------- | -| ***loss\_scale\_window*** is a **fp16** parameter representing the window over which to raise/lower the dynamic loss scale value. | `1000` | +| **loss_scale_window** is a **fp16** parameter representing the window over which to raise/lower the dynamic loss scale value. | `1000` | -***fp16:hysteresis***: [integer] +**fp16:hysteresis**: [integer] | Description | Default | | ---------------------------------------------------------------------------------------------- | ------- | -| ***hysteresis*** is a **fp16** parameter representing the delay shift in dynamic loss scaling. | `2` | +| **hysteresis** is a **fp16** parameter representing the delay shift in dynamic loss scaling. | `2` | -***fp16:min\_loss\_scale***: [integer] +**fp16:min_loss_scale**: [integer] | Description | Default | | -------------------------------------------------------------------------------------------------- | ------- | -| ***min\_loss\_scale*** is a **fp16** parameter representing the minimum dynamic loss scale value. | `1000` | +| **min_loss_scale** is a **fp16** parameter representing the minimum dynamic loss scale value. | `1000` | ### Automatic mixed precision (AMP) training options **Note:** this mode cannot be combined with the `fp16` mode described above. In addition this mode is not currently compatible with ZeRO. {: .notice--warning} -***amp***: [dictionary] +**amp**: [dictionary] | Description | Default | | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | @@ -208,11 +253,11 @@ Example of ***scheduler*** } ``` -***amp:enabled***: [boolean] +**amp:enabled**: [boolean] | Description | Default | | ---------------------------------------------------------------------------------------- | ------- | -| ***enabled*** is an **amp** parameter indicating whether or not AMP training is enabled. 
| `false` | +| **enabled** is an **amp** parameter indicating whether or not AMP training is enabled. | `false` | ***amp params***: [various] @@ -222,7 +267,7 @@ Example of ***scheduler*** ### Gradient Clipping -***gradient\_clipping***: [float] +**gradient_clipping**: [float] | Description | Default | | ----------------------------------- | ------- | @@ -242,124 +287,224 @@ Enabling and configuring ZeRO memory optimizations "reduce_scatter": [true|false], "reduce_bucket_size": 5e8, "contiguous_gradients" : [true|false], - "cpu_offload": [true|false], - "cpu_offload_params" : [true|false], - "cpu_offload_use_pin_memory" : [true|false], + "offload_param": { + ... + }, + "offload_optimizer": { + ... + }, "stage3_max_live_parameters" : 1e9, "stage3_max_reuse_distance" : 1e9, "stage3_prefetch_bucket_size" : 5e8, "stage3_param_persistence_threshold" : 1e6, "sub_group_size" : 1e12, - "elastic_checkpoint" : [true|false] + "elastic_checkpoint" : [true|false], + "stage3_gather_fp16_weights_on_model_save": [true|false] } ``` -***zero\_optimization***: [dictionary] +**zero_optimization**: [dictionary] | Description | Default | | --------------------------------------------------------------------------------------------------------- | ------- | | Enable ZeRO memory optimization wrapper for FP16 Training. Currently compatible only with Adam optimizer. | `false` | -***stage***: [integer] +**stage**: [integer] -| Description | Default | -| --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Chooses different stages of ZeRO Optimizer. Stage 0, 1, 2, and 3 refer to disabled, optimizer state partitioning, and optimizer+gradient state partitioning, and optimizer+gradient+parameter partitioning, respectively. | `0` | -***allgather_partitions***: [boolean] +**allgather_partitions**: [boolean] | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------ | ------- | | Chooses between allgather collective or a series of broadcast collectives to gather updated parameters from all the GPUs at the end of each step | `true` | -***allgather_bucket_size***: [boolean] +***allgather_bucket_size***: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------ | ------- | | Number of elements allgathered at a time. 
Limits the memory required for the allgather for large model sizes | `5e8` | -***overlap_comm***: [boolean] +**overlap_comm**: [boolean] | Description | Default | | ---------------------------------------------------------------------------- | ------- | | Attempts to overlap the reduction of the gradients with backward computation | `false` | -***reduce_scatter***: [boolean] +**reduce_scatter**: [boolean] | Description | Default | | ----------------------------------------------------------------------- | ------- | | Uses reduce or reduce scatter instead of allreduce to average gradients | `true` | -***reduce_bucket_size***: [boolean] +***reduce_bucket_size***: [integer] | Description | Default | | ------------------------------------------------------------------------------------------------------------------- | ------- | | Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes | `5e8` | -***contiguous_gradients***: [boolean] +**contiguous_gradients**: [boolean] | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. | `False` | -***cpu_offload***: [boolean] - -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------ | ------- | -| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. | `False` | -***cpu_offload_params***: [boolean] +***offload_param***: [dictionary] | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Enable offloading of model parameters to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. | `False` | +| Enable offloading of model parameters to CPU or NVMe. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. See [here](#parameter-offloading) for more details. | `False` | -***cpu_offload_use_pin_memory***: [boolean] +***offload_optimizer***: [dictionary] | Description | Default | | ----------------------------------------------------------------------------------------- | ------- | -| Use pinned CPU memory when offloading. Can improve performance. Valid only with stage 3. | `False` | +| Enable offloading of optimizer state to CPU or NVMe, and optimizer computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 3. See [here](#optimizer-offloading) for more details. | `False` | ***stage3_max_live_parameters***: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ----------------------------------------------------------------------------------------------------------------------------------- | ------- | | The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. 
| `1e9` | ***stage3_max_reuse_distance***: [integer] -| Description | Default | -| ---------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| ---------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but perform more communication. | `1e9` | ***stage3_prefetch_bucket_size***: [integer] -| Description | Default | -| ------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Description | Default | +| -------------------------------------------------------------------------------------------------------------------------------------- | ------- | | The size of the fixed buffer for prefetching parameters. Smaller values use less memory, but can increase stalls due to communication. | `5e8` | ***stage3_param_persistence_threshold***: [integer] + | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | `1e6` | +***stage3_gather_fp16_weights_on_model_save***: [boolean] + +| Description | Default | +| -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Consolidate the weights before saving the model by `save_fp16_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gather the weights when this option is enabled and then saves the fp16 model weights. | `False` | + +***cpu_offload***: [boolean] + +**Deprecated:** **cpu_offload** is disabled and will be removed in future, please use `offload_optimizer` instead. +{: .notice--warning} + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------ | ------- | +| Enable offloading of optimizer memory and computation to CPU. This frees up GPU memory for larger models or batch sizes. Valid only with stage 2.| `False` | + + +### Parameter offloading +Enabling and configuring ZeRO optimization of parameter offloading to CPU/NVMe. Available only with ZeRO stage 3. +```json + "offload_param": { + "device": "[none|cpu|nvme]", + "nvme_path": "/local_nvme", + "buffer_count": 5, + "buffer_size": 1e8, + "max_in_cpu": 1e9 + } +``` +***device***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Device memory to offload model parameters. Supported options are `cpu` and `nvme`. | `cpu` | + +***nvme_path***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Filesystem path for NVMe device for parameter offloading. 
| `/local_nvme` | + +***buffer_count***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Number of buffers in buffer pool for parameter offloading to NVMe. | 5 | + + +***buffer_size***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Size of buffers in buffer pool for parameter offloading to NVMe. | 1e8 | + +***max_in_cpu***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Number of parameter elements to maintain in CPU memory when offloading to NVMe is enabled. | 1e9 | + +### Optimizer offloading +Enabling and configuring ZeRO optimization of offloading optimizer computation to CPU and state to CPU/NVMe. CPU offloading is available with ZeRO stage 2 or 3. NVMe offloading is available only with ZeRO stage 3. +```json + "offload_optimizer": { + "device": "[none|cpu|nvme]", + "nvme_path": "/local_nvme", + "buffer_count": 4, + "pin_memory": [true|false], + "fast_init": false + } +``` +***device***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Device memory to offload optimizer state. Supported options are `cpu` and `nvme`. Optimizer computation is offload to CPU regardless of device option. | `cpu` | + +***nvme_path***: [string] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Filesystem path for NVMe device for optimizer state offloading. | `/local_nvme` | + +***buffer_count***: [integer] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Number of buffers in buffer pool for optimizer state offloading to NVMe. This should be at least the number of states maintained per parameter by the optimizer. For example, Adam optimizer has 4 states (parameter, gradient, momentum, and variance). | 4 | + + +***pin_memory***: [boolean] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Offload to page-locked CPU memory. This could boost throughput at the cost of extra memory overhead. | `false` | + +***fast_init***: [boolean] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------- | ------- | +| Enable fast optimizer initialization when offloading to NVMe. 
| `false` | + ### Logging -***steps\_per\_print***: [integer] +**steps_per_print**: [integer] | Description | Default | | ------------------------------ | ------- | | Print train loss every N steps | `10` | -***wall\_clock\_breakdown***: [boolean] +**wall_clock_breakdown**: [boolean] | Description | Default | | ----------------------------------------------------------------------- | ------- | | Enable timing of the latency of forward/backward/update training phases | `false` | -***dump_state***: [boolean] +**dump_state**: [boolean] | Description | Default | | -------------------------------------------------------------------- | ------- | @@ -377,31 +522,31 @@ Enabling and configuring ZeRO memory optimizations } } ``` -***enabled***: [boolean] +**enabled**: [boolean] | Description | Default | | --------------------------- | ------- | | Enables the flops profiler. | `false` | -***profile\_step***: [integer] +**profile_step**: [integer] | Description | Default | | --------------------------------------------------------------------------------------------------------------- | ------- | | The global training step at which to profile. Note that warm up steps are needed for accurate time measurement. | `1` | -***module\_depth***: [integer] +**module_depth**: [integer] | Description | Default | | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | The depth of the model at which to print the aggregated module information. When set to `-1`, it prints information on the innermost modules (with the maximum depth). | `-1` | -***top\_modules***: [integer] +**top_modules**: [integer] | Description | Default | | ---------------------------------------------------------------------------- | ------- | | Limits the aggregated profile output to the number of top modules specified. 
| `3` | -***detailed***: [boolean] +**detailed**: [boolean] | Description | Default | | -------------------------------------------- | ------- | @@ -418,39 +563,39 @@ Enabling and configuring ZeRO memory optimizations "profile": false } ``` -***partition\_activations***: [boolean] +**partition_activations**: [boolean] | Description | Default | | ------------------------------------------------------------- | ------- | | Enables partition activation when used with model parallelism | `false` | -***cpu\_checkpointing***: [boolean] +**cpu_checkpointing**: [boolean] | Description | Default | | --------------------------------------------------------------------------- | ------- | | Offloads partitioned activations to CPU if partition_activations is enabled | `false` | -***contiguous\_memory\_optimization***: [boolean] +**contiguous_memory_optimization**: [boolean] | Description | Default | | -------------------------------------------------------------------- | ------- | | Copies partitioned activations so that they are contiguous in memory | `false` | -***number_checkpoints***: [integer] +**number_checkpoints**: [integer] | Description | Default | | -------------------------------------------------------------------------------------------------------- | ------- | | Total number of activation checkpoints used to allocate memory buffer for contiguous_memoty_optimization | `None` | -***synchronize\_checkpoint\_boundary***: [boolean] +**synchronize_checkpoint_boundary**: [boolean] | Description | Default | | ------------------------------------------------------------- | ------- | | Inserts torch.cuda.synchronize() at each checkpoint boundary. | `false` | -***profile***: [boolean] +**profile**: [boolean] | Description | Default | | --------------------------------------------------------------- | ------- | @@ -458,7 +603,7 @@ Enabling and configuring ZeRO memory optimizations ### Sparse Attention -***sparse\_attention***: [dictionary] +**sparse_attention**: [dictionary] | Fields | Value | Example | | -------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------- | @@ -476,7 +621,7 @@ Enabling and configuring ZeRO memory optimizations | global\_block\_end\_indices | A list of integers determining end indices of global window blocks. By default this is not used. But if it is set, it must have the same size of global\_block\_indices parameter, and combining this two parameters, for each index i, blocks from global\_block\_indices[i] to global\_block\_end\_indices[i], exclusive, are considered as global attention; used in `"variable"` and `"bslongformer"` modes. | None | | num\_sliding\_window\_blocks | An integer determining the number of blocks in sliding local attention window; used in `"bigbird"` and `"bslongformer"` modes. 
| 3 | - Example of ***sparse\_attention*** + Example of **sparse_attention** ```json "sparse_attention": { diff --git a/docs/_pages/features.md b/docs/_pages/features.md index ba955fd574db..9b0b89d0a64b 100755 --- a/docs/_pages/features.md +++ b/docs/_pages/features.md @@ -172,15 +172,17 @@ Please see the [core API doc](https://deepspeed.readthedocs.io/) for more detail ## Training Optimizers -### 1-bit Adam optimizer with up to 5x less communication +### 1-bit Adam and 1-bit LAMB optimizers with up to 5x less communication -DeepSpeed has an efficient implementation of a novel algorithm called 1-bit Adam. -It offers the same convergence as Adam, incurs up to 5x less communication that enables +DeepSpeed has two communication-efficient optimizers called 1-bit Adam and 1-bit LAMB. +They offer the same convergence as Adam/LAMB, incur up to 5x less communication that enables up to 3.5x higher throughput for BERT-Large pretraining and up to 2.7x higher throughput for SQuAD fine-tuning on bandwidth-limited clusters. For more details on usage and performance, -please refer to the detailed [tutorial](https://www.deepspeed.ai/tutorials/onebit-adam) and -[blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), respectively. - +please refer to the [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam), +[1-bit Adam blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), +and [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). For technical details, +please refer to the [1-bit Adam paper](https://arxiv.org/abs/2102.02888) and +[1-bit LAMB paper](https://arxiv.org/abs/2104.06069). ### Fused Adam optimizer and arbitrary torch.optim.Optimizer With DeepSpeed, the user can choose to use a high performance implementation of ADAM from diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index 18f60e864039..3b9449d368fc 100644 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -73,6 +73,18 @@ DS_BUILD_OPS=1 pip install deepspeed --global-option="build_ext" --global-option This should complete the full build 2-3 times faster. You can adjust `-j` to specify how many cpu-cores are to be used during the build. In the example it is set to 8 cores. +You can also build a binary wheel and install it on multiple machines that have the same type of GPUs and the same software environment (CUDA toolkit, pytorch, python, etc.) + +```bash +DS_BUILD_OPS=1 python setup.py build_ext -j8 bdist_wheel +``` + +This will create a pypi binary wheel under `dist`, e.g., ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` and then you can install it directly on multiple machines, in our example: + +```bash +pip install dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl +``` + ## Install DeepSpeed from source @@ -91,6 +103,23 @@ script in the repo. This will build a python wheel locally and copy it to all the nodes listed in your hostfile (either given via --hostfile, or defaults to /job/hostfile). +When the code using DeepSpeed is used for the first time it'll automatically build only the CUDA +extensions, required for the run, and by default it'll place them under +`~/.cache/torch_extensions/`. The next time the same program is executed these now precompiled +extensions will be loaded form that directory. 
+ +If you use multiple virtual environments this could be a problem, since by default there is only one +extensions directory, but different virtual environments may use different setups (e.g. different +python or cuda versions) and then the loading of a CUDA extension built by another environment will +fail. Therefore, if you need to you can override the default location with the help of the + `TORCH_EXTENSIONS_DIR` environment variable. So in each virtual environment you can point it to a + unique directory and DeepSpeed will use it to save and load CUDA extensions. + + You can also change it just for a specific run with: + +```bash + TORCH_EXTENSIONS_DIR=./torch-extensions deepspeed ... +``` ## Building for the correct architectures diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index ecd3159df8c9..16fb5035cc97 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -1,7 +1,7 @@ --- -title: "Getting Started" +title: 'Getting Started' permalink: /getting-started/ -excerpt: "First steps with DeepSpeed" +excerpt: 'First steps with DeepSpeed' date: 2020-05-15 --- @@ -13,12 +13,14 @@ date: 2020-05-15 * If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies. ## Writing DeepSpeed Models + DeepSpeed model training is accomplished using the DeepSpeed engine. The engine can wrap any arbitrary model of type `torch.nn.module` and has a minimal set of APIs for training and checkpointing the model. Please see the tutorials for detailed examples. To initialize the DeepSpeed engine: + ```python model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, @@ -27,10 +29,10 @@ model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, `deepspeed.initialize` ensures that all of the necessary setup required for distributed data parallel or mixed precision training are done -appropriately under the hood. In addition to wrapping the model, DeepSpeed can +appropriately under the hood. In addition to wrapping the model, DeepSpeed can construct and manage the training optimizer, data loader, and the learning rate scheduler based on the parameters passed to `deepspeed.initialize` and the -DeepSpeed [configuration file](#deepspeed-configuration). +DeepSpeed [configuration file](#deepspeed-configuration). Note that DeepSpeed automatically executes the learning rate schedule at every training step. If you already have a distributed environment setup, you'd need to replace: @@ -48,7 +50,6 @@ The default is to use the NCCL backend, which DeepSpeed has been thoroughly test But if you don't need the distributed environment setup until after `deepspeed.initialize()` you don't have to use this function, as DeepSpeed will automatically initialize the distributed environment during its `initialize`. Regardless, you will need to remove `torch.distributed.init_process_group` if you already had it in place. 
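A minimal sketch of how this replacement fits together is shown below. It is illustrative only and relies on APIs referenced in these docs (`deepspeed.init_distributed` and `deepspeed.initialize`) plus `deepspeed.add_config_arguments` for argument parsing; the tiny `torch.nn.Linear` model is a stand-in for your own network, and the script is assumed to be launched with the `deepspeed` launcher and a `--deepspeed_config` flag.

```python
import argparse
import torch
import deepspeed

# Parse user arguments and let DeepSpeed add its own flags
# (e.g. --deepspeed, --deepspeed_config).
parser = argparse.ArgumentParser()
parser = deepspeed.add_config_arguments(parser)
cmd_args = parser.parse_args()

# Stand-in model for illustration; replace with your own network.
model = torch.nn.Linear(10, 10)
params = [p for p in model.parameters() if p.requires_grad]

# If your script previously called
#   torch.distributed.init_process_group(backend="nccl")
# replace that call with the DeepSpeed equivalent (or drop it entirely if
# you only need the distributed environment after deepspeed.initialize()):
deepspeed.init_distributed()  # NCCL backend by default

# Hand the model over to the DeepSpeed engine as usual.
model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args,
                                                     model=model,
                                                     model_parameters=params)
```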
- ### Training Once the DeepSpeed engine has been initialized, it can be used to train the @@ -67,32 +68,31 @@ for step, batch in enumerate(data_loader): model_engine.step() ``` - Under the hood, DeepSpeed automatically performs the necessary operations required for distributed data parallel training, in mixed precision, with a -pre-defined learning rate schedule: +pre-defined learning rate scheduler: -* **Gradient Averaging**: in distributed data parallel training, `backward` +- **Gradient Averaging**: in distributed data parallel training, `backward` ensures that gradients are averaged across data parallel processes after training on an `train_batch_size`. -* **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed +- **Loss Scaling**: in FP16/mixed precision training, the DeepSpeed engine automatically handles scaling the loss to avoid precision loss in the gradients. -* **Learning Rate Schedule**: if using DeepSpeed's learning rate - schedule, then DeepSpeed automatically handles any updates to the learning - rate when `step` is executed. - - +- **Learning Rate Scheduler**: when using a DeepSpeed's learning rate scheduler (specified in the `ds_config.json` file), DeepSpeed calls the `step()` method of the scheduler at every training step (when `model_engine.step()` is executed). When not using a DeepSpeed's learning rate scheduler: + - if the schedule is supposed to execute at every training step, then the user can pass the scheduler to `deepspeed.initialize` when initializing the DeepSpeed engine and let DeepSpeed manage it for update or save/restore. + - if the schedule is supposed to execute at any other interval (e.g., training epochs), then the user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly. ### Model Checkpointing + Saving and loading the training state is handled via the `save_checkpoint` and `load_checkpoint` API in DeepSpeed which takes two arguments to uniquely identify a checkpoint: - * `ckpt_dir`: the directory where checkpoints will be saved. - * `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory. - In the following code snippet, we use the loss value as the checkpoint identifier. + +- `ckpt_dir`: the directory where checkpoints will be saved. +- `ckpt_id`: an identifier that uniquely identifies a checkpoint in the directory. + In the following code snippet, we use the loss value as the checkpoint identifier. ```python #load checkpoint @@ -133,6 +133,7 @@ each process needs to save its master weights and scheduler+optimizer states. Th waiting to synchronize with other processes if it's called just for the process with rank 0. ## DeepSpeed Configuration + DeepSpeed features can be enabled, disabled, or configured using a config JSON file that should be specified as `args.deepspeed_config`. A sample config file is shown below. For a full set of features see [ API @@ -156,6 +157,7 @@ doc](/docs/config-json/). ``` # Launching DeepSpeed Training + DeepSpeed installs the entry point `deepspeed` to launch distributed training. We illustrate an example usage of DeepSpeed with the following assumptions: @@ -164,18 +166,20 @@ We illustrate an example usage of DeepSpeed with the following assumptions: 3. `client args` is the `argparse` command line arguments 4. 
`ds_config.json` is the configuration file for DeepSpeed - ## Resource Configuration (multi-node) + DeepSpeed configures multi-node compute resources with hostfiles that are compatible with [OpenMPI](https://www.open-mpi.org/) and [Horovod](https://github.com/horovod/horovod). -A hostfile is a list of *hostnames* (or SSH aliases), which are machines accessible via passwordless -SSH, and *slot counts*, which specify the number of GPUs available on the system. For +A hostfile is a list of _hostnames_ (or SSH aliases), which are machines accessible via passwordless +SSH, and _slot counts_, which specify the number of GPUs available on the system. For example, + ``` worker-1 slots=4 worker-2 slots=4 ``` -specifies that two machines named *worker-1* and *worker-2* each have four GPUs to use + +specifies that two machines named _worker-1_ and _worker-2_ each have four GPUs to use for training. Hostfiles are specified with the `--hostfile` command line option. If no hostfile is @@ -183,9 +187,9 @@ specified, DeepSpeed searches for `/job/hostfile`. If no hostfile is specified o DeepSpeed queries the number of GPUs on the local machine to discover the number of local slots available. - The following command launches a PyTorch training job across all available nodes and GPUs specified in `myhostfile`: + ```bash deepspeed --hostfile=myhostfile \ --deepspeed --deepspeed_config ds_config.json @@ -195,20 +199,25 @@ Alternatively, DeepSpeed allows you to restrict distributed training of your mod subset of the available nodes and GPUs. This feature is enabled through two command line arguments: `--num_nodes` and `--num_gpus`. For example, distributed training can be restricted to use only two nodes with the following command: + ```bash deepspeed --num_nodes=2 \ \ --deepspeed --deepspeed_config ds_config.json ``` + You can instead include or exclude specific resources using the `--include` and `--exclude` flags. For example, to use all available resources **except** GPU 0 on node -*worker-2* and GPUs 0 and 1 on *worker-3*: +_worker-2_ and GPUs 0 and 1 on _worker-3_: + ```bash deepspeed --exclude="worker-2:0@worker-3:0,1" \ \ --deepspeed --deepspeed_config ds_config.json ``` -Similarly, you can use **only** GPUs 0 and 1 on *worker-2*: + +Similarly, you can use **only** GPUs 0 and 1 on _worker-2_: + ```bash deepspeed --include="worker-2:0,1" \ \ @@ -228,24 +237,26 @@ executing from and also in your home directory (`~/`). As a concrete example, some clusters require special NCCL variables to set prior to training. The user can simply add these variables to a `.deepspeed_env` file in their home directory that looks like this: + ``` NCCL_IB_DISABLE=1 NCCL_SOCKET_IFNAME=eth0 ``` + DeepSpeed will then make sure that these environment variables are set when launching each process on every node across their training job. - ### MPI and AzureML Compatibility + As described above, DeepSpeed provides its own parallel launcher to help launch multi-node/multi-gpu training jobs. If you prefer to launch your training job using MPI (e.g., mpirun), we provide support for this. It should be noted that -DeepSpeed will still use the torch distributed NCCL backend and *not* the MPI +DeepSpeed will still use the torch distributed NCCL backend and _not_ the MPI backend. To launch your training job with mpirun + DeepSpeed or with AzureML (which uses mpirun as a launcher backend) you simply need to install the -[mpi4py](https://pypi.org/project/mpi4py/) python package. 
DeepSpeed will use +[mpi4py](https://pypi.org/project/mpi4py/) python package. DeepSpeed will use this to discover the MPI environment and pass the necessary state (e.g., world size, rank) to the torch distributed backend. @@ -259,8 +270,9 @@ deepspeed.init_distributed() ``` ## Resource Configuration (single-node) + In the case that we are only running on a single node (with one or more GPUs) -DeepSpeed *does not* require a hostfile as described above. If a hostfile is +DeepSpeed _does not_ require a hostfile as described above. If a hostfile is not detected or passed in then DeepSpeed will query the number of GPUs on the local machine to discover the number of slots available. The `--include` and `--exclude` arguments work as normal, but the user should specify 'localhost' @@ -269,6 +281,7 @@ as the hostname. Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control which devices should be used. For example, to use only gpu1 of the current node, do: + ```bash deepspeed --include localhost:1 ... ``` diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 1a15000135c9..feef71682513 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -7,7 +7,7 @@ This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes inc {: .notice--info} **Watch out!** -1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. +1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently the MPI-based implementation is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. {: .notice--warning} In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. @@ -23,7 +23,7 @@ For more details on these tasks, please refer to the tutorial posts on [BingBert ### 1.1 Pre-requisites for installing DeepSpeed -If you don't already have a copy of the DeepSpeed repository, please clone in +If you don't already have a copy of the DeepSpeed repository, please clone it now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples. 
```shell @@ -106,7 +106,7 @@ Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_ Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. **Watch out!** -1-bit Adam replies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. +1-bit Adam relies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. {: .notice--warning} ## 2. BingBertSQuAD Fine-tuning with 1-bit Adam @@ -191,7 +191,7 @@ Table 1. Fine-tuning configuration ### 2.3 Performance Results for BingBertSQuAD Fine-tuning -***Accuracy:*** +**Accuracy:** The results are summarized in the table below. 
The total batch size is set to 96 and training is conducted on 32 GPUs for 2 epochs. A set of parameters (seeds and learning rates) were tried and the best ones were selected. We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scores we achieved that are on-par or better than the [HuggingFace results](https://github.com/huggingface/transformers/tree/master/examples/question-answering). diff --git a/docs/_tutorials/onebit-lamb.md b/docs/_tutorials/onebit-lamb.md new file mode 100644 index 000000000000..f6d9341d9095 --- /dev/null +++ b/docs/_tutorials/onebit-lamb.md @@ -0,0 +1,130 @@ +--- +title: "1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed" +--- + +**Watch out!** +1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit LAMB is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently the MPI-based implementation is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit LAMB's convergence. See details below. +{: .notice--warning} + +In this tutorial, we introduce DeepSpeed's 1-bit LAMB optimizer which enables communication-efficient large-scale large-batch training with LAMB's convergence speed. 1-bit LAMB can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 4.6x. We also have a [paper](https://arxiv.org/abs/2104.06069) which provides the technical details including algorithm, system implementation, and evaluations. + +To illustrate the benefits and usage of 1-bit LAMB optimizer, we use the BERT Pre-training task as example. For more details on this task, please refer to the [tutorial](/tutorials/bert-pretraining/). + +## 1. Overview + +### 1.1 Pre-requisites for installing DeepSpeed + +If you don't already have a copy of the DeepSpeed repository, please clone it +now and checkout the DeepSpeedExamples submodule that contains the BERT Pre-training example. + +```shell +git clone https://github.com/microsoft/DeepSpeed +cd DeepSpeed +git submodule update --init --recursive +cd DeepSpeedExamples/ +``` + +### 1.2 Pre-requisites for 1-bit LAMB + +#### 1.2.1 NCCL-based implementation + +In DeepSpeed, we introduce a system implementation for compressed communication using the NCCL backend of PyTorch distributed. This implementation provides better performance and usability than the MPI-based implementation below. Thus we highly recommend users to choose this implementation. + +**Watch out!** +This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is by hacking in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working you can see the version it uses in the NCCL logs if you have `NCCL_DEBUG=INFO`, it should say: NCCL version 2.8.3+cuda11.0. 
+{: .notice--warning} + +#### 1.2.2 MPI-based implementation + +For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives. + +We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run: + +```shell +pip install deepspeed[1bit_adam] +``` + +We have tested CUDA-Aware MPI communication using the [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) library. However, any CUDA-Aware communication library including [OpenMPI](https://www.open-mpi.org/) should work fine with these examples. + +An example launch command for 1-bit LAMB using the `deepspeed` launcher is as follows: + +```shell +deepspeed --launcher=[mvapich|openmpi] script.py +``` + +Please note that for MPI-based implementation of 1-bit LAMB, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher. + +Alternatively, the standard mpirun launcher can also be used as follows: + +```shell +mpirun -np [num processes] -ppn [num GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py] +``` + +### 1.3 1-bit LAMB Algorithm + +The detailed description of the 1-bit LAMB algorithm can be seen from our [paper](https://arxiv.org/abs/2104.06069). + +### 1.4 Configuration of 1-bit LAMB +The 1-bit LAMB feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below. + +```json +{ + "train_batch_size": 65536, + "train_micro_batch_size_per_gpu": 64, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 11e-3, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 1000, + "cuda_aware": false, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 4.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "initial_scale_power": 16 + } +} +``` +Please note the new parameters `freeze_step`, `cuda_aware`, `comm_backend_name`, `coeff_beta`, `factor_max`, `factor_min`, and `factor_threshold` that have been added to support the 1-bit LAMB feature: + +`freeze_step` is the number of warm up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm up steps, one strategy is to set 15-25% of the total training steps for a given model (This is related to LAMB's variance/second moment term and scaling coefficient. See detailed analysis in our [paper](https://arxiv.org/abs/2104.06069)). If it provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In future, we plan to introduce a threshold that can automatically search and decide for the number of warm up steps for different models. The examples below have been tuned for the number of warm up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts. + +`cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. 
However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. + +`comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" or "mpi". When using the NCCL-based implementation, there is no need to set `cuda_aware`. + +`coeff_beta` is used when calculating a moving average of the LAMB scaling coefficient during the warmup stage. This moving average is then used as the frozen base scaling coefficient during the compression stage. + +`factor_max`, `factor_min`, and `factor_threshold` are used to regularize the adaptive scaling of the frozen base scaling coefficient during the compression stage. `factor_max` and `factor_min` are the scaling factor upper/lower bounds. `factor_threshold` defines the threshold of how much the scaling factor can fluctuate between steps. + +#### 1.4.1 Momentum masks for parameters with constant zero gradients +Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for rows 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit LAMB we added support for a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use the momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. + +**Watch out!** +1-bit LAMB relies on a compression error compensation mechanism to maintain the convergence speed during the compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in the current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the errors, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading, which could break the error compensation mechanism and thus affect the convergence. +{: .notice--warning} + +## 2. BERT Pre-training with 1-bit LAMB +For data downloading and pre-processing, please refer to the [BERT Pre-training tutorial](/tutorials/bert-pretraining/). + +### 2.1 Running Pre-training with DeepSpeed and 1-bit LAMB + +We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_lamb/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_lamb).
+ +## 2. BERT Pre-training with 1-bit LAMB +For data downloading and pre-processing, please refer to the [BERT Pre-training tutorial](/tutorials/bert-pretraining/). + +### 2.1 Running Pre-training with DeepSpeed and 1-bit LAMB + +We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_lamb/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_lamb). There are three sets of scripts, corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun. + +### 2.2 Configuration for BERT Pre-training with DeepSpeed and 1-bit LAMB enabled + +The `deepspeed_bsz64k_onebitlamb_config_seq128_*.json` and `deepspeed_bsz32k_onebitlamb_config_seq512_*.json` files give the user the ability to specify DeepSpeed +options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters. In these files we include the tuned hyperparameters to reproduce the experiments in our [paper](https://arxiv.org/abs/2104.06069). + +### 2.3 Performance Results for BERT Pre-training + +Performance results can be seen in our [paper](https://arxiv.org/abs/2104.06069). diff --git a/docs/_tutorials/zero-offload.md b/docs/_tutorials/zero-offload.md index 31c89bd5934e..a70a3674e90b 100644 --- a/docs/_tutorials/zero-offload.md +++ b/docs/_tutorials/zero-offload.md @@ -1,6 +1,8 @@ --- title: "ZeRO-Offload" --- +ZeRO-3 Offload consists of a subset of features in our newly released ZeRO-Infinity. Read our [ZeRO-Infinity blog](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) to learn more! + We recommend that you read the tutorials on [Getting Started](/getting-started/) and [ZeRO](/tutorials/zero/) before stepping through this tutorial. ZeRO-Offload is a ZeRO optimization that offloads the optimizer memory and computation from the GPU to the host CPU. ZeRO-Offload enables large models with up to 13 billion parameters to be efficiently trained on a single GPU. In this tutorial we will use ZeRO-Offload to train a 10-billion parameter GPT-2 model in DeepSpeed. Furthermore, *using ZeRO-Offload in a DeepSpeed model is quick and easy because all you need is to change a few configurations in the DeepSpeed configuration json*. No code changes are needed. diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md index ad6e222707e0..217160400e29 100644 --- a/docs/_tutorials/zero.md +++ b/docs/_tutorials/zero.md @@ -12,7 +12,9 @@ ZeRO leverages the aggregate computation and memory resources of data parallelis * **Stage 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states. -* **Stage 3**: The 16-bit model parameters are partitioned across the processes. ZeRO will automatically collect and partition them during the forward and backward passes. +* **Stage 3**: The 16-bit model parameters are partitioned across the processes. ZeRO-3 will automatically collect and partition them during the forward and backward passes. + +In addition, ZeRO-3 includes the *infinity offload engine* to form ZeRO-Infinity ([paper](https://arxiv.org/abs/2104.07857)), which can offload to both CPU and NVMe memory for huge memory savings. ## Training environment We use the DeepSpeed [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM) GPT-2 code for this exercise. You can step through the Megatron-LM [tutorial](/tutorials/megatron/) to familiarize yourself with the code. We will train the models in this tutorial on [NVIDIA Tesla V100-SXM3 Tensor Core GPUs](https://www.nvidia.com/en-us/data-center/v100/) with 32GB RAM.
@@ -106,121 +108,51 @@ Here is a screenshot of nvidia-smi showing GPU activity during training: -### Training trillion-scale models with ZeRO-3 Offload +### Training trillion-scale models with ZeRO-Infinity -Stage 3 can be enabled in the JSON configuration. A full description of these -configurations is available [here](/docs/config-json/#zero-optimizations-for-fp16-training). +ZeRO-3, the third stage of ZeRO, partitions the full model state (i.e., +weights, gradients, and optimizer states) to scale memory savings linearly +with the degree of data parallelism. ZeRO-3 can be enabled in the JSON +configuration. A full description of these configurations is available +[here](/docs/config-json/#zero-optimizations-for-fp16-training). -```json -{ + +#### Offloading to CPU and NVMe with ZeRO-Infinity + +ZeRO-Infinity uses DeepSpeed's infinity offload engine to offload the full +model state to CPU or NVMe memory, allowing for even larger model sizes. Offloading +can be enabled inside the DeepSpeed configuration: + +```diff +@@ -6,5 +6,11 @@ "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "overlap_comm": true, "contiguous_gradients": true, - "stage3_max_live_parameters": 6000000, - "stage3_max_reuse_distance": 100000000, - "stage3_prefetch_bucket_size": 200000, - "stage3_param_persitance_threshold": 100000, - "reduce_bucket_size": 3000000, - "sub_group_size": 1e6 + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_prefetch_bucket_size": 1e7, + "stage3_param_persistence_threshold": 1e5, + "reduce_bucket_size": 1e7, +- "sub_group_size": 1e9 ++ "sub_group_size": 1e9, ++ "offload_optimizer": { ++ "device": "cpu" ++ }, ++ "offload_param": { ++ "device": "cpu" ++ } } -} ``` +**ZeRO-Infinity vs ZeRO-Offload:** +DeepSpeed first included offloading capabilities with ZeRO-Offload, +a system for offloading optimizer and gradient states to CPU memory +within ZeRO-2. ZeRO-Infinity is the next generation of offloading +capabilities accessible to ZeRO-3. ZeRO-Infinity is able to offload +more data than ZeRO-Offload and has more effective bandwidth utilization +and overlapping of computation and communication. +{: .notice--info} -ZeRO-3 will automatically collect and partition the parameters as they are -needed during the forward and backward passes. However, in some cases a -parameter may be used outside of its module's forward pass. We call these -*external parameters*. ZeRO-3 can coordinate these parameters if they are -registered. Please see our [ZeRO-3 docs](https://deepspeed.readthedocs.io/en/latest/zero3.html) for more -information and examples of external parameters. - -The Megatron-LM model has three external parameters that must be registered -with ZeRO-3. External parameters are those that are accessed outside of the -owning module's forward pass. - -1. `megatron/model/gpt2_model.py:GPT2Model`: register the word embedding for both uses in forward. - -```python - class GPT2Model(MegatronModule): - def __init__(self, num_tokentypes=0, parallel_output=True): - ... 
- deepspeed.zero.register_external_parameter(self, - self.language_model.embedding.word_embeddings.weight) - - - def forward(self, input_ids, position_ids, attention_mask, labels=None, - tokentype_ids=None, layer_past=None, get_key_value=False, - forward_method_parallel_output=None): - # self.embeddings will compute its forward pass here - lm_output = self.language_model(input_ids, - position_ids, - attention_mask, - tokentype_ids=tokentype_ids, - layer_past=layer_past, - get_key_value=get_key_value) - ... - - # Accesses word_embeddings.weight outside of the embedding's forward pass. - output = parallel_lm_logits( - lm_output, - self.language_model.embedding.word_embeddings.weight, - parallel_output) -``` - -2. `megatron/model/transformer.py:ParallelMLP`: register a bias that is -returned from a submodule forward and used in this forward. - -```python -class ParallelMLP(MegatronModule): - def __init__(self, init_method, output_layer_init_method): - ... - if self.dense_h_to_4h.bias is not None: - deepspeed.zero.register_external_parameter(self, self.dense_h_to_4h.bias) - - def forward(self, hidden_states): - - # bias_parallel is a parameter of dense_h_to_4h - - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - ... -``` - -3. `megatron/model/transformer.py:ParallelTransformerLayer`: register two biases that -are returned from submodules and used in forward. - -```python -class ParallelTransformerLayer(MegatronModule): - ... - def __init__(self, attention_mask_func, init_method, - output_layer_init_method, layer_number): - ... - if self.attention.dense.bias is not None: - deepspeed.zero.register_external_parameter(self, self.attention.dense.bias) - if self.mlp.dense_4h_to_h.bias is not None: - deepspeed.zero.register_external_parameter(self, self.mlp.dense_4h_to_h.bias) - - def forward(self, hidden_states, attention_mask, layer_past=None, - get_key_value=False): - ... - # attention_bias is a parameter returned from attention - - # Self attention. - attention_output, attention_bias = \ - self.attention(layernorm_output, - attention_mask, - layer_past=layer_past, - get_key_value=get_key_value) - - ... - - # mlp_bias is a parameter returned from mlp - mlp_output, mlp_bias = self.mlp(layernorm_output) - ... -``` @@ -231,7 +163,7 @@ that exceed *local* system memory, but not *total* system memory. 1. Allocate the model in a memory-scalable fashion. The model parameters will be allocated and immediately partitioned across the data parallel group. If -`remote_device="cpu"`, the model will also be allocated in CPU memory +`remote_device` is `"cpu"` or `"nvme"`, the model will also be allocated in CPU/NVMe memory instead of GPU memory. Please see the full [ZeRO-3 Init docs](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init) for more details. @@ -243,7 +175,7 @@ for more details. model = GPT2Model(num_tokentypes=0, parallel_output=True) ``` -2. Gather the position embeddings weight for initialization. DeepSpeed will automatically +2. Gather the embeddings weight for initialization. DeepSpeed will automatically gather a module's parameters during its constructor and for its forward and backward pass. However, additional accesses must coordinate with DeepSpeed to ensure that parameter data is gathered and subsequently partitioned. If the tensor is modified, the `modifier_rank` @@ -258,7 +190,96 @@ for more details. modifier_rank=0): # Initialize the position embeddings. self.init_method(self.position_embeddings.weight) + + ... 
+ + self.tokentype_embeddings = torch.nn.Embedding(...) + with deepspeed.zero.GatheredParameters(self.tokentype_embeddings.weight, + modifier_rank=0): + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) ``` +#### Memory-centric tiling +ZeRO-Infinity includes a replacement for `Linear` layers that further reduces memory. +We optionally tile the model parallel linear layers found in each Transformer layer. Note +that model parallelism and tiling can be combined by specifying the corresponding +base class when building the layer. +The `deepspeed.zero.TiledLinear` module exploits the data fetch and release +pattern of ZeRO-3 to reduce the working memory requirements by breaking down +a large operator into smaller tiles that can be executed sequentially. + +We include the changes for one example from Megatron-LM's [ParallelMLP](https://github.com/microsoft/DeepSpeedExamples/blob/bdf8e59aede8c8e0577e8d4d557298ca8515268f/Megatron-LM-v1.1.5-ZeRO3/megatron/model/transformer.py#L82). Three more +model-parallel layers in `transformer.py` proceed similarly. + +The model parallel layers of Megatron-LM have a special form in which the +additive `bias` of the layer is delayed and instead returned from `forward()` +to be fused with a later operator. DeepSpeed's +`deepspeed.zero.TiledLinearReturnBias` subclass of `TiledLinear` simply also +forwards the returned `bias` parameter without accumulating it. + +```diff +@@ -1,6 +1,9 @@ +-self.dense_h_to_4h = mpu.ColumnParallelLinear( ++self.dense_h_to_4h = deepspeed.zero.TiledLinearReturnBias( + args.hidden_size, + 4 * args.hidden_size, ++ in_splits=args.tile_factor, ++ out_splits=4*args.tile_factor, ++ linear_cls=mpu.ColumnParallelLinear, + gather_output=False, + init_method=init_method, + skip_bias_add=True) +``` + +Note that we scale `in_splits` and `out_splits` proportionally with `input_size` and `output_size`. This +results in tiles of fixed size `[hidden/tile_factor, hidden/tile_factor]`. + +#### Registering external parameters + +**Deprecated:** +DeepSpeed version `0.3.15` introduced automatic external parameter +registration and this step is no longer needed. +{: .notice--info} + + +## Extracting weights + +If you need to take the pretrained weights out of DeepSpeed, here is how you can get the fp16 weights: + +- under ZeRO-2, `state_dict` contains the fp16 model weights and these can be saved normally with `torch.save`. +- under ZeRO-3, `state_dict` contains just the placeholders, since the model weights are partitioned across multiple GPUs. If you want to get these weights, enable: + +```json + "zero_optimization": { + "stage3_gather_fp16_weights_on_model_save": true + }, +``` +And then save the model using: + +```python + if self.deepspeed: + self.deepspeed.save_fp16_model(output_dir, output_file) +``` + +Because it requires consolidation of the weights on one GPU, it can be slow and memory demanding, so only use this feature when needed. + +Note that if `stage3_gather_fp16_weights_on_model_save` is `False`, no weights will be saved (again, because `state_dict` doesn't have them). +You can use this method to save ZeRO-2 weights as well.
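The file written this way should be an ordinary PyTorch `state_dict`, so it can be loaded back into a plain, non-DeepSpeed copy of the model, for example for inference or export. Below is a minimal sketch; the model class, output directory, and file name are hypothetical placeholders for whatever was saved with `save_fp16_model` above.

```python
import torch

# Illustrative stand-in for the real architecture that was trained.
class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(10, 10)

# Load the consolidated fp16 weights into a bare (non-DeepSpeed) model instance.
model = MyModel()
state_dict = torch.load("output_dir/pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)
model = model.half().eval()  # the saved weights are fp16
```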
+ +If you'd like to get the fp32 weights, we supply a special script that can do offline consolidation. It requires no configuration files or GPUs. Here is an example of its usage: + +``` bash +$ cd /path/to/checkpoints_dir +$ ./zero_to_fp32.py global_step1 pytorch_model.bin +Processing zero checkpoint at global_step1 +Detected checkpoint of type zero stage 3, world_size: 2 +Saving fp32 state dict to pytorch_model.bin (total_numel=60506624) +``` + +The `zero_to_fp32.py` script gets created automatically when you save a checkpoint. + +Note: currently this script uses about 2x the size of the final checkpoint in general RAM. + Congratulations! You have completed the ZeRO tutorial. diff --git a/docs/code-docs/source/optimizers.rst b/docs/code-docs/source/optimizers.rst index d7b338561b96..fda69e0677eb 100755 --- a/docs/code-docs/source/optimizers.rst +++ b/docs/code-docs/source/optimizers.rst @@ -1,20 +1,24 @@ -Optimizers -=================== - -DeepSpeed offers high-performance implementations of ``Adam`` optimizer on CPU; ``FusedAdam``, ``FusedAdam``, ``OneBitAdam`` optimizers on GPU. - -Adam (CPU) ----------------------------- -.. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam - -FusedAdam (GPU) ----------------------------- -.. autoclass:: deepspeed.ops.adam.FusedAdam - -FusedLamb (GPU) ----------------------------- -.. autoclass:: deepspeed.ops.lamb.FusedLamb - -OneBitAdam (GPU) ----------------------------- -.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam +Optimizers +=================== + +DeepSpeed offers high-performance implementations of the ``Adam`` optimizer on CPU, and of the ``FusedAdam``, ``FusedLamb``, ``OnebitAdam``, and ``OnebitLamb`` optimizers on GPU. + +Adam (CPU) +---------------------------- +.. autoclass:: deepspeed.ops.adam.DeepSpeedCPUAdam + +FusedAdam (GPU) +---------------------------- +.. autoclass:: deepspeed.ops.adam.FusedAdam + +FusedLamb (GPU) +---------------------------- +.. autoclass:: deepspeed.ops.lamb.FusedLamb + +OneBitAdam (GPU) +---------------------------- +.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OnebitAdam + +OnebitLamb (GPU) +---------------------------- +.. autoclass:: deepspeed.runtime.fp16.onebit.lamb.OnebitLamb diff --git a/docs/code-docs/source/schedulers.rst b/docs/code-docs/source/schedulers.rst index 6be3112164ef..c7b67cbb20d8 100755 --- a/docs/code-docs/source/schedulers.rst +++ b/docs/code-docs/source/schedulers.rst @@ -1,8 +1,9 @@ Learning Rate Schedulers =================== -DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedulers. - +DeepSpeed offers implementations of the ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, and ``WarmupDecayLR`` learning rate schedulers. When using one of DeepSpeed's learning rate schedulers (specified in the `ds_config.json` file), DeepSpeed calls the `step()` method of the scheduler at every training step (when `model_engine.step()` is executed). When not using a DeepSpeed learning rate scheduler: + * if the schedule is supposed to execute at every training step, then the user can pass the scheduler to `deepspeed.initialize` when initializing the DeepSpeed engine and let DeepSpeed manage its updates and save/restore. + * if the schedule is supposed to execute at any other interval (e.g., training epochs), then the user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly.
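A minimal sketch of the two cases described above follows; the stand-in model, data loader, and scheduler choices are illustrative placeholders, and a real script would use only one of the two patterns.

.. code-block:: python

    import torch
    import deepspeed

    # `args`, `num_epochs`, and `train_loader` are assumed to exist in the
    # surrounding script; the model is a stand-in for the real network.
    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # Case 1: a per-step schedule. Pass it to deepspeed.initialize so DeepSpeed
    # calls its step() on every model_engine.step() and handles save/restore.
    per_step_sched = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda step: min(1.0, (step + 1) / 1000.0))
    model_engine, optimizer, _, per_step_sched = deepspeed.initialize(
        args=args, model=model, optimizer=optimizer, lr_scheduler=per_step_sched)

    # Case 2: a per-epoch schedule. Keep it outside DeepSpeed, do not pass it to
    # deepspeed.initialize, and step it manually at the end of every epoch.
    per_epoch_sched = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
    for epoch in range(num_epochs):
        for batch in train_loader:              # train_loader yields input tensors
            loss = model_engine(batch).sum()    # toy forward + loss (schematic)
            model_engine.backward(loss)
            model_engine.step()
        per_epoch_sched.step()                  # stepped once per epoch by the user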
LRRangeTest --------------------------- diff --git a/docs/code-docs/source/training.rst b/docs/code-docs/source/training.rst index d88d755f39cb..52e124fc3b40 100644 --- a/docs/code-docs/source/training.rst +++ b/docs/code-docs/source/training.rst @@ -31,3 +31,11 @@ Optimizer Step Gradient Accumulation --------------------- .. autofunction:: deepspeed.DeepSpeedEngine.is_gradient_accumulation_boundary + + +Model Saving +------------ +.. autofunction:: deepspeed.DeepSpeedEngine.save_fp16_model + + +Additionally, when a DeepSpeed checkpoint is created, a script ``zero_to_fp32.py`` is added there, which can be used to reconstruct fp32 master weights into a single PyTorch ``state_dict`` file. diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index c986990444f3..daced77d9093 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -1,5 +1,5 @@ -ZeRO-3 Offload -############## +ZeRO +#### The Zero Redundancy Optimizer (ZeRO) removes the memory redundancies across data-parallel processes by partitioning the three model states (optimizer @@ -8,33 +8,53 @@ replicating them. By doing this, it boosts memory efficiency compared to classic data-parallelism while retaining its computational granularity and communication efficiency. -ZeRO-Offload further increases memory efficiency by offloading the -optimizer's states and computations to the CPU. The model parameters can also -be offloaded for even more memory savings! +#. **ZeRO Stage 1**: The optimizer states (e.g., for `Adam optimizer `_, 32-bit weights, and the first and second moment estimates) are partitioned across the processes, so that each process updates only its partition. + +#. **ZeRO Stage 2**: The reduced 32-bit gradients for updating the model weights are also partitioned such that each process retains only the gradients corresponding to its portion of the optimizer states. + +#. **ZeRO Stage 3**: The 16-bit model parameters are partitioned across the processes. ZeRO-3 will automatically collect and partition them during the forward and backward passes. + +In addition, ZeRO-3 includes the *infinity offload engine* to form +ZeRO-Infinity (`paper <https://arxiv.org/abs/2104.07857>`_), which can offload +all model states to both CPU and NVMe memory for huge memory savings. + + +For a deep dive into our algorithms, please see our `papers `_ on `ZeRO +`_, `ZeRO-Offload +`_, +and `ZeRO-Infinity `_. + +.. note:: + DeepSpeed first included offloading capabilities with **ZeRO-Offload**, a + system for offloading optimizer and gradient states to CPU memory within + ZeRO-2. **ZeRO-Infinity** is the next generation of offloading + capabilities, accessible to ZeRO-3. ZeRO-Infinity has all of the savings + of ZeRO-Offload, plus it is able to offload more of the model weights and has + more effective bandwidth utilization and overlapping of computation and + communication. + + -For more information on our algorithms, please see our papers on `ZeRO -`_ and `ZeRO-Offload -`_. Getting Started --------------- If you are new to DeepSpeed, check out our `Getting Started `_ page. -Once you are training with DeepSpeed, enabling ZeRO-3 Offload is as simple as enabling it +Once you are training with DeepSpeed, enabling ZeRO-3 offload is as simple as enabling it in your DeepSpeed configuration! Below are a few examples of ZeRO-3 configurations. Please see our `config guide `_ for a complete list of options for configuration and performance tuning. ..
note:: - ZeRO-3 Offload works best with our heavily optimized + ZeRO-Infinity and ZeRO-Offload work best with our heavily optimized :class:`deepspeed.ops.adam.DeepSpeedCPUAdam` optimizer. We recommend using our `optimizer config `_ to instruct :meth:`deepspeed.initialize` to build the optimizer for you. -Example ZeRO-3 Offload Configurations -===================================== + +Example ZeRO-3 Configurations +============================= #. Use ZeRO to partition the optimizer states (stage 1), gradients (stage 2), and parameters (stage 3). @@ -45,7 +65,6 @@ Example ZeRO-3 Offload Configurations { "zero_optimization": { "stage": 3, - "overlap_comm": true }, "fp16": { "enabled": true @@ -66,16 +85,16 @@ Example ZeRO-3 Offload Configurations } -#. Additionally offload the optimizer states and computations to the CPU. +#. Additionally offload the optimizer states and computations to the CPU with ZeRO-Infinity. .. code-block:: python - :emphasize-lines: 4 { "zero_optimization": { "stage": 3, - "cpu_offload": true, - "overlap_comm": true + "offload_optimizer": { + "device": "cpu" + } }, ... } @@ -84,14 +103,36 @@ Example ZeRO-3 Offload Configurations #. Save even more memory by offloading parameters to the CPU memory. .. code-block:: python - :emphasize-lines: 5 { "zero_optimization": { "stage": 3, - "cpu_offload": true, - "cpu_offload_params": true, - "overlap_comm": true + "offload_optimizer": { + "device": "cpu" + }, + "offload_param": { + "device": "cpu" + } + }, + ... + } + + +#. Save even MORE memory by offloading to NVMe (if available on your system): + + .. code-block:: python + + { + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "nvme", + "nvme_path": "/nvme_data" + }, + "offload_param": { + "device": "nvme", + "nvme_path": "/nvme_data" + } }, ... } @@ -107,6 +148,9 @@ granularity of (sub)module ``forward()`` methods. The backward pass is handled similarly. This strategy has two underlying assumptions: #. The forward and backward passes of submodules must individually fit in device memory. + If this is not the case, :class:`deepspeed.zero.TiledLinear` implements + **memory-centric tiling** and works with ZeRO-3 to break linear layers + into a sequence of smaller submodules that can fit in memory. #. A module's parameters are only accessed within its own ``__init__`` and ``forward()`` methods. Otherwise, DeepSpeed must be instructed to collect and re-partition the parameter. @@ -126,7 +170,6 @@ you can simply allocate your model in our context: model = MyLargeModel() - .. autoclass:: deepspeed.zero.Init :members: @@ -160,25 +203,75 @@ parameters are accessed outside of the module that created them. To do so, use Registering External Parameters =============================== -Consider the following pattern common in language models such as GPT: +ZeRO-3 will automatically collect and partition the model parameters as they +are needed during the forward and backward passes. However, in some cases a +parameter may be used outside of its module's forward pass. We call these +*external* parameters. ZeRO-3 can coordinate these parameters if they are +registered either automatically or manually. -.. code-block:: python - class LanguageModel(torch.nn.Module): - ... - def forward(self, inputs): - embeds = self.embeddings(inputs) - ... - logits = compute_logits(output, self.embeddings.weight) - ... +.. note:: + DeepSpeed version ``0.3.15`` includes automatic external parameter + discovery and registration to support the most common cases. Parameters + can still be manually registered if they cannot be automatically + detected.
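For cases that escape automatic detection, a parameter can still be registered manually with :func:`deepspeed.zero.register_external_parameter` (documented below). The following is a minimal illustrative sketch: the module is hypothetical, and this particular tied-embedding pattern would normally be picked up by the automatic detection described next, so the explicit call is shown only to demonstrate the API.

.. code-block:: python

    import torch
    import deepspeed

    class TiedLanguageModel(torch.nn.Module):
        """Illustrative module that reuses its embedding weight outside of
        the embedding's own forward pass (a classic external parameter)."""
        def __init__(self, vocab_size=1000, hidden=64):
            super().__init__()
            self.embeddings = torch.nn.Embedding(vocab_size, hidden)
            # Tell ZeRO-3 that this module accesses embeddings.weight directly.
            deepspeed.zero.register_external_parameter(
                self, self.embeddings.weight)

        def forward(self, inputs):
            hidden_states = self.embeddings(inputs)
            # The embedding weight is used again here, outside embeddings.forward().
            logits = hidden_states @ self.embeddings.weight.t()
            return logits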
+ + +DeepSpeed can automatically detect the following external parameter scenarios: + + +#. Parameter access: consider the following pattern common in language models such as GPT: + + The tensor ``embeddings.weight`` is used in both ``embeddings.forward()`` and + ``compute_logits()``. We call ``embeddings.weight`` an *external* parameter + because it is used in the training loop outside of its owning module's + forward pass. + + + .. code-block:: python + + class LanguageModel(torch.nn.Module): + ... + def forward(self, inputs): + embeds = self.embeddings(inputs) + ... + logits = compute_logits(output, self.embeddings.weight) + ... + + +#. Returning a parameter: + + ``CustomLinear`` returns both an output and its own ``bias`` parameter. DeepSpeed + will detect the external ``bias`` parameter and register it with submodules that + use ``CustomLinear``. + + .. code-block:: python + + class CustomLinear(torch.nn.Linear): + def forward(self, *input): + output = super().forward(*input) + return output, self.bias -The tensor ``embeddings.weight`` is used in both ``embeddings.forward()`` and -``compute_logits()``. We call ``embeddings.weight`` an *external* parameter -because it is used in the training loop outside of its owning module's -forward pass. DeepSpeed will coordinate external parameters if they are -registered prior to the first forward pass. .. autofunction:: deepspeed.zero.register_external_parameter .. autofunction:: deepspeed.zero.unregister_external_parameter + + +Memory-Centric Tiling +--------------------- + +To reduce the working memory requirements of DL training for large models, +ZeRO-Infinity includes a technique called *memory-centric tiling* that exploits +the data fetch and release pattern of ZeRO-3 to reduce the working memory +requirements by breaking down a large operator into smaller tiles that can be +executed sequentially. When combined with ZeRO-3, the parameters and gradients +of each tile can be fetched and released one at a time, reducing the working +memory in proportion to the number of tiles. Therefore, ZeRO-Infinity can +support operators of arbitrary sizes, without refactoring for model +parallelism to fit them in limited GPU memory. + + +.. autoclass:: deepspeed.zero.TiledLinear + :members: diff --git a/docs/index.md b/docs/index.md index 497f88bab5c3..9d60ed6e1298 100755 --- a/docs/index.md +++ b/docs/index.md @@ -17,7 +17,7 @@ DeepSpeed delivers extreme-scale model training for everyone, from data scientis * Extreme scale: Using current generation of GPU clusters with hundreds of devices, 3D parallelism of DeepSpeed can efficiently train deep learning models with trillions of parameters. * Extremely memory efficient: With just a single GPU, ZeRO-Offload of DeepSpeed can train models with over 10B parameters, 10x bigger than the state of arts, democratizing multi-billion-parameter model training such that many deep learning scientists can explore bigger and better models. * Extremely long sequence length: Sparse attention of DeepSpeed powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution comparing with dense transformers. -* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth.
1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. +* Extremely communication efficient: 3D parallelism improves communication efficiency allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 1-bit Adam/1-bit LAMB reduce communication volume by up to 5x while achieving similar convergence efficiency to Adam/LAMB, allowing for scaling to different types of GPU clusters and networks. Early adopters of DeepSpeed have already produced a language model (LM) with over 17B parameters called @@ -30,6 +30,9 @@ initiative to enable next-generation AI capabilities at scale, where you can fin information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). # What's New? +* [2021/04/20] [1-bit LAMB: up to 4.6x less communication and 2.8x faster training, together with LAMB's convergence speed at large batch sizes](https://www.deepspeed.ai/tutorials/onebit-lamb/) +* [2021/04/19] [ZeRO-Infinity unlocks unprecedented model scale for deep learning training](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) + * [Tutorial on how to use different stages of ZeRO](https://www.deepspeed.ai/tutorials/zero/) * [2021/04/02] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed) * [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59) * [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/) @@ -38,10 +41,6 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) * [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation) * [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html) * [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }}) - * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html) - * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html) - * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html) - * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html) # Why DeepSpeed? Training advanced deep learning models is challenging. Beyond model design, @@ -136,7 +135,7 @@ combinations, which we call 3D parallelism. Pipeline parallelism of DeepSpeed reduce communication volume during distributed training, which allows users to train multi-billion-parameter models 2–7x faster on clusters with limited network bandwidth. 
![Low-bandwidth GPT-2 Performance](/assets/images/pp-lowbw-gpt2.png) -1-bit Adam reduces communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. [Read more here](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). +1-bit Adam and 1-bit LAMB reduce communication volume by up to 5x while achieving similar convergence efficiency to Adam, allowing for scaling to different types of GPU clusters and networks. [1-bit Adam blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html), [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam/), [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). ## Supporting long sequence length DeepSpeed offers sparse attention kernels—an instrumental technology to support long sequences of model inputs, whether for text, image, or sound. Compared with the classic dense Transformers, it powers **an order-of-magnitude longer input sequence** and obtains up to 6x faster execution with comparable accuracy. It also outperforms state-of-the-art sparse implementations with 1.5–3x faster execution. Furthermore, our sparse kernels support efficient execution of flexible sparse format and empower users to innovate on their custom sparse structures. [Read more here](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html). @@ -180,7 +179,7 @@ Below we provide a brief feature list, see our detailed [feature overview](https * Memory- and compute-efficient sparse kernels * Support 10x long sequences than dense * Flexible support to different sparse structures -* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) +* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) and [1-bit LAMB](https://www.deepspeed.ai/tutorials/onebit-lamb/) * Custom communication collective * Up to 5x communication volume saving * [Additional Memory and Bandwidth Optimizations](https://www.deepspeed.ai/features/#additional-memory-and-bandwidth-optimizations) @@ -236,6 +235,8 @@ comments. 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840). 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888). +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857). +7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069). # Videos 1. 
DeepSpeed KDD 2020 Tutorial diff --git a/op_builder/__init__.py b/op_builder/__init__.py index 38f27a9897ce..72e09b293811 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -8,6 +8,7 @@ from .transformer import TransformerBuilder from .stochastic_transformer import StochasticTransformerBuilder from .utils import UtilsBuilder +from .async_io import AsyncIOBuilder from .builder import get_default_compute_capatabilities # TODO: infer this list instead of hard coded @@ -19,6 +20,7 @@ SparseAttnBuilder(), TransformerBuilder(), StochasticTransformerBuilder(), - UtilsBuilder() + UtilsBuilder(), + AsyncIOBuilder() ] ALL_OPS = {op.name: op for op in __op_builders__} diff --git a/op_builder/async_io.py b/op_builder/async_io.py new file mode 100644 index 000000000000..749ac8725906 --- /dev/null +++ b/op_builder/async_io.py @@ -0,0 +1,56 @@ +""" +Copyright 2020 The Microsoft DeepSpeed Team +""" +from .builder import OpBuilder + + +class AsyncIOBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def sources(self): + return [ + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp' + ] + + def include_paths(self): + return ['csrc/aio/py_lib', 'csrc/aio/common'] + + def cxx_args(self): + return [ + '-g', + '-Wall', + '-O0', + '-std=c++14', + '-shared', + '-fPIC', + '-Wno-reorder', + '-march=native', + '-fopenmp', + '-laio', + self.simd_width() + ] + + def extra_ldflags(self): + return ['-laio'] + + def is_compatible(self): + aio_libraries = ['libaio-dev'] + aio_compatible = self.libraries_installed(aio_libraries) + if not aio_compatible: + self.warning( + f"{self.NAME} requires the libraries: {aio_libraries} but are missing.") + return super().is_compatible() and aio_compatible diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py new file mode 100755 index 000000000000..b3ed3c601492 --- /dev/null +++ b/tests/benchmarks/flatten_bench.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) +# +# usage: +# ./flatten_bench.py -t +# ./flatten_bench.py -c +# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof + +import argparse + +import gc + +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.ops.op_builder import UtilsBuilder + +from apex_C import flatten as flatten_apex + +util_ops = UtilsBuilder().load() +flatten = util_ops.flatten +unflatten = util_ops.unflatten + +torch.manual_seed(0) +# emulate a small typical model weights +x = [ + torch.rand((512, + 512)).cuda(), + torch.rand((512, + 1024)).cuda(), + torch.rand((512, + 30000)).cuda() +] +t = x * 30 + +# warm up and check that the same output is produced +flat_py = _flatten_dense_tensors(t) +flat_cpp = flatten(t) +flat_apex = flatten_apex(t) +#numel = flat_cpp.numel() +assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor" +assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor" + +TIMES = 1000 + + +# the programs being tested +def py(): + for i in range(TIMES): + flat = 
_flatten_dense_tensors(t) + + +def cpp(): + for i in range(TIMES): + flat = flatten(t) + + +def apex(): + for i in range(TIMES): + flat = flatten_apex(t) + + +#### cProfile #### + +import cProfile + + +def cprofileme(): + print("--------------- cProfile -----------------") + print("py") + cProfile.run("py()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("cpp") + cProfile.run("cpp()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("apex") + cProfile.run("apex()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + + +#### timeit #### + +import timeit + + +def timeme(): + print("--------------- timeit -----------------") + print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + + +#### line_profiler #### +# this one requires a special way to be called +# pip install line_profiler +# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof + + +def line_profileme(): + print("--------------- line_profier -----------------") + print("py") + profile(py)() + gc.collect() + torch.cuda.empty_cache() + print("cpp") + profile(cpp)() + gc.collect() + torch.cuda.empty_cache() + print("apex") + profile(apex)() + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-l", action='store_true') + parser.add_argument("-c", action='store_true') + parser.add_argument("-t", action='store_true') + args = parser.parse_args() + if args.l: + line_profileme() + elif args.c: + cprofileme() + elif args.t: + timeme() diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py new file mode 100755 index 000000000000..85baf751ad9c --- /dev/null +++ b/tests/benchmarks/unflatten_bench.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python + +# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) +# +# usage: +# ./unflatten_bench.py -t +# ./unflatten_bench.py -c +# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof + +import argparse +import gc +import torch +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from deepspeed.ops.op_builder import UtilsBuilder + +from apex_C import flatten as flatten_apex +from apex_C import unflatten as unflatten_apex + +util_ops = UtilsBuilder().load() +flatten = util_ops.flatten +unflatten = util_ops.unflatten + +torch.manual_seed(0) +# emulate a small typical model weights +x = [ + torch.rand((512, + 512)).cuda(), + torch.rand((512, + 1024)).cuda(), + torch.rand((512, + 30000)).cuda() +] +unflat_t = x * 30 + +# warm up and check that the same output is produced +flat_py = _flatten_dense_tensors(unflat_t) +flat_cpp = flatten(unflat_t) +flat_apex = flatten_apex(unflat_t) +#numel = flat_cpp.numel() +assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor" +assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor" + +flat_t = flat_py +unflat_py = _unflatten_dense_tensors(flat_py, unflat_t) +for i in range(len(unflat_t)): + assert torch.eq(unflat_t[i], unflat_py[i]).all() +unflat_cpp = _unflatten_dense_tensors(flat_cpp, unflat_t) +for i in range(len(unflat_t)): + assert torch.eq(unflat_t[i], unflat_cpp[i]).all() +unflat_apex = 
_unflatten_dense_tensors(flat_apex, unflat_t) +for i in range(len(unflat_t)): + assert torch.eq(unflat_t[i], unflat_apex[i]).all() + + +# the programs being tested +def py(): + for i in range(1000): + unflat = _unflatten_dense_tensors(flat_t, unflat_t) + + +def cpp(): + for i in range(1000): + unflat = unflatten(flat_t, unflat_t) + + +def apex(): + for i in range(1000): + unflat = unflatten_apex(flat_t, unflat_t) + + +#### cProfile #### + +import cProfile + + +def cprofileme(): + print("--------------- cProfile -----------------") + print("py") + cProfile.run("py()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("cpp") + cProfile.run("cpp()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + print("apex") + cProfile.run("apex()", sort=-1) + gc.collect() + torch.cuda.empty_cache() + + +#### timeit #### + +import timeit + + +def timeme(): + print("--------------- timeit -----------------") + print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}') + gc.collect() + torch.cuda.empty_cache() + + +#### line_profiler #### +# this one requires a special way to be called +# pip install line_profiler +# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof + + +def line_profileme(): + print("--------------- line_profier -----------------") + print("py") + profile(py)() + gc.collect() + torch.cuda.empty_cache() + print("cpp") + profile(cpp)() + gc.collect() + torch.cuda.empty_cache() + print("apex") + profile(apex)() + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-l", action='store_true') + parser.add_argument("-c", action='store_true') + parser.add_argument("-t", action='store_true') + args = parser.parse_args() + if args.l: + line_profileme() + elif args.c: + cprofileme() + elif args.t: + timeme() diff --git a/tests/onebit/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py index c3f138b221d5..6cb89a9638cd 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -8,7 +8,7 @@ from deepspeed.runtime.fp16.onebit.onebitadam import OnebitAdam ======= import os ->>>>>>> ab5534fc4c0f8ca21ada321f9730d723aa31288b:tests/onebit/test_nccl_backend.py +>>>>>>> 669028f0fd5067c9247120cb21fd6e9bea4820a9:tests/onebit/test_nccl_backend.py from deepspeed.runtime.comm.nccl import NcclBackend @@ -93,7 +93,7 @@ def torch_sim(a): if test_correctness: if torch.sum(diff_server_mask) == 0: print('Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) ->>>>>>> ab5534fc4c0f8ca21ada321f9730d723aa31288b:tests/onebit/test_nccl_backend.py +>>>>>>> 669028f0fd5067c9247120cb21fd6e9bea4820a9:tests/onebit/test_nccl_backend.py else: check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold if torch.sum(check_mag_mask) == 0: diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index 8fcae8bcca18..015e8c508cee 100755 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -132,18 +132,15 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): return model -@torch.jit.script def f_gelu(x): return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) -@torch.jit.script def bias_gelu(bias, y): x = bias + y return x * 0.5 * (1.0 + torch.erf(x / 1.41421)) -@torch.jit.script 
def bias_tanh(bias, y): x = bias + y return torch.tanh(x) diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py index 2c7e07aa8b31..e05cb1190dde 100755 --- a/tests/unit/test_cuda_backward.py +++ b/tests/unit/test_cuda_backward.py @@ -17,9 +17,9 @@ import sys #if not deepspeed.ops.__installed_ops__['transformer']: -pytest.skip( - "transformer kernels are temporarily disabled because of unexplained failures", - allow_module_level=True) +#pytest.skip( +# "transformer kernels are temporarily disabled because of unexplained failures", +# allow_module_level=True) def check_equal(first, second, atol=1e-2, verbose=False): @@ -258,6 +258,9 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): # 3-128-54-2-24-False-True-0.2 @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', [ + (8,1600,128,25,3,True,True, 0.05), + (8,160,128,2,3,True,True, 0.1), + (8,1600,128,2,3,True,True, 0.05), (3,1024,119,16,24,True,False, 0.05), (3,1024,115,16,24,True,True, 0.05), (1024,128,10,2,2,False,False, 0.1), @@ -291,7 +294,7 @@ def test_backward(batch_size, ds_config.initializer_range = 0.02 ds_config.fp16 = use_fp16 - run_backward(ds_config, seq_len, atol=atol) + run_backward(ds_config, seq_len, atol=atol, verbose=False) #@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py index 5add5e152a91..73e847aa3ac4 100755 --- a/tests/unit/test_cuda_forward.py +++ b/tests/unit/test_cuda_forward.py @@ -199,7 +199,11 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): # FP16 test cases can only run on the devices support FP16. @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (8,256,53,4,3,True,False), + (8,160,128,2,3,True,True), + (8,160,128,2,3,False,True), + (8,1600,128,2,3,True,True), + (8,1600,128,25,3,True,True), + (8,1600,128,25,3,False,True), (8,256,52,4,3,True,True), (3,1024,51,16,3,True,False), (3,1024,54,16,3,True,True), @@ -259,10 +263,10 @@ def test_forward(batch_size, @pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (8,3,1024,512,16,3,True,False), - (8,7,1024,512,16,3,True,True), - (8,3,1024,512,16,3,False,False), - (8,7,1024,512,16,3,False,True), + #(8,3,1024,512,16,3,True,False), + #(8,7,1024,512,16,3,True,True), + #(8,3,1024,512,16,3,False,False), + #(8,7,1024,512,16,3,False,True), ]) # yapf: disable def test_forward_with_small_bsz(batch_size, small_bsz, @@ -294,10 +298,10 @@ def test_forward_with_small_bsz(batch_size, @pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', [ - (64,1024,128,16,3,True,False), - (64,1024,128,16,3,True,True), - (64,1024,128,16,3,False,False), - (64,1024,128,16,3,False,True), + #(64,1024,128,16,3,True,False), + #(64,1024,128,16,3,True,True), + #(64,1024,128,16,3,False,False), + #(64,1024,128,16,3,False,True), ]) # yapf: disable def test_forward_stochastic(batch_size, hidden_size, diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py index 8e0056be0cff..9796a70953f8 100644 --- a/tests/unit/test_onebit.py +++ b/tests/unit/test_onebit.py @@ -1,14 +1,22 @@ import torch +import torch.nn as nn +import torch.nn.functional as F import torch.distributed as dist import deepspeed import argparse import pytest +import copy import json import os import numpy as np import 
time + +from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology +PipeTopo = PipeDataParallelTopology +from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec from common import distributed_test from simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args +from test_pipe import AlexNetPipe, train_cifar TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) @@ -241,9 +249,7 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" save_folder = os.path.join(tmpdir, 'saved_checkpoint') - # optimizer_1.optimizer.gather_compression_errors() model_1.save_checkpoint(save_folder, tag=None) - time.sleep(5) assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" @@ -297,6 +303,552 @@ def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): hidden_dim=hidden_dim) +def test_onebitadam_checkpointing_overflow(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[2]) + def _test_onebitadam_checkpointing_overflow(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=100, + hidden_dim=hidden_dim, + device=model.device) + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0 and n >= 10: + loss = loss * 1000000.0 + model.backward(loss) + dist.barrier() + model.step() + dist.barrier() + model.save_checkpoint(save_folder, tag=None) + + _test_onebitadam_checkpointing_overflow(args=args, + model=model, + hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('topo', + [ + PipeTopo(num_pp=1, + num_dp=4), + PipeTopo(num_pp=2, + num_dp=2), + PipeTopo(num_pp=4, + num_dp=1), + ]) +def test_onebitadam_fp16_pipeline(topo, tmpdir): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "OneBitAdam", + "params": { + "lr": 0.00001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + "freeze_step": 200, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + } + } + args = args_from_dict(tmpdir, config_dict) + + # Allocate model for consistent initial weights. 
+ init_net = AlexNetPipe() + + @distributed_test(world_size=4) + def _helper(topo, tmpdir, steps=500): + assert steps >= 100 + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar(test_model, + args, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + _helper(topo, tmpdir) + + +def test_onebitlamb_fp16_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitlamb_fp16_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitlamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitlamb_fp32_basic(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[1, 2]) + def _test_onebitlamb_fp32_basic(args, model, hidden_dim): + model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + _test_onebitlamb_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitlamb_exp_avg_mask(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + optimizer_grouped_parameters = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 
'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitlamb_exp_avg_mask(args, model, hidden_dim): + model, optimizer, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters) + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + # Test whether the momentum mask works + for v in optimizer.state.values(): + if v['exp_avg'].size() == mask1.size(): + assert torch.allclose(v['exp_avg'], v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)), atol=1e-07), f"Momentum mask is not working properly" + + _test_onebitlamb_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) + + +def test_onebitlamb_checkpointing(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + param_optimizer = list(model.named_parameters()) + mask1 = torch.zeros_like(param_optimizer[0][1].data) + mask2 = torch.zeros_like(param_optimizer[0][1].data) + for col in range(mask1.size()[1]): + mask1[0][col] += 1 + mask2[1][col] += 1 + + optimizer_grouped_parameters_1 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask1 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_2 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01, + 'exp_avg_mask': mask2 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + optimizer_grouped_parameters_3 = [{ + 'params': [param_optimizer[0][1]], + 'weight_decay': 0.01 + }, + { + 'params': [param_optimizer[1][1]], + 'weight_decay': 0.01 + }] + + @distributed_test(world_size=[2]) + def _test_onebitlamb_checkpointing(mask1, mask2, args, model, hidden_dim): + model_1, optimizer_1, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_1) + data_loader = random_dataloader(model=model_1, + total_samples=10, + hidden_dim=hidden_dim, + device=model_1.device) + for n, batch in enumerate(data_loader): + loss = model_1(batch[0], batch[1]) + model_1.backward(loss) + model_1.step() + # Test whether momentum mask still exist after saving checkpoint + assert optimizer_1.optimizer.lamb_freeze_key is True + mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" + scaling_coeff_1 = [] + for v in optimizer_1.state.values(): + assert 'scaling_coeff' in v, f"Incorrect scaling_coeff" + scaling_coeff_1.append(v['scaling_coeff']) + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + model_1.save_checkpoint(save_folder, tag=None) + assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving 
checkpoint" + + + model_2, optimizer_2, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_2) + # Test whether momentum mask stays the same after loading checkpoint + mask2 = mask2.to(device=optimizer_2.param_groups[0]['exp_avg_mask'].device) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Incorrect momentum mask" + model_2.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" + assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" + # Test whether scaling_coeffs is loaded correctly + scaling_coeff_2 = [] + for v in optimizer_2.state.values(): + assert 'scaling_coeff' in v, f"Incorrect scaling_coeff" + scaling_coeff_2.append(v['scaling_coeff']) + assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs" + assert optimizer_2.optimizer.lamb_freeze_key is True + + model_3, optimizer_3, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=optimizer_grouped_parameters_3) + optimizer_3.optimizer.freeze_step = 20 + data_loader = random_dataloader(model=model_3, + total_samples=50, + hidden_dim=hidden_dim, + device=model_3.device) + for n, batch in enumerate(data_loader): + loss = model_3(batch[0], batch[1]) + model_3.backward(loss) + model_3.step() + assert optimizer_3.optimizer.lamb_freeze_key is True + # Test whether momentum mask stays the same after loading checkpoint + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Incorrect momentum mask" + model_3.load_checkpoint(save_folder, + tag=None, + load_optimizer_states=True, + load_lr_scheduler_states=True) + assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" + # Test whether worker&server error is resetted + assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" + assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" + # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor are resetted + for v in optimizer_3.state.values(): + assert v['lamb_coeff_freeze'] == 0.0, f"Incorrect lamb_coeff_freeze" + assert v['last_factor'] == 1.0, f"Incorrect last_factor" + assert 'scaling_coeff' not in v, f"Incorrect scaling_coeff" + assert optimizer_3.optimizer.lamb_freeze_key is False + + _test_onebitlamb_checkpointing(mask1, + mask2, + args=args, + model=model, + hidden_dim=hidden_dim) + + +def test_onebitlamb_checkpointing_overflow(tmpdir): + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00015, + "weight_decay": 0.01, + "max_coeff": 0.3, + "min_coeff": 0.01, + "freeze_step": 2, + "cuda_aware": False, + "comm_backend_name": "nccl", + "coeff_beta": 0.9, + "factor_max": 1.0, + "factor_min": 0.5, + "factor_threshold": 0.1 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + } + } + args = args_from_dict(tmpdir, config_dict) + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + @distributed_test(world_size=[2]) + def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim): 
+ model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, + total_samples=100, + hidden_dim=hidden_dim, + device=model.device) + save_folder = os.path.join(tmpdir, 'saved_checkpoint') + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0 and n >= 10: + loss = loss * 1000000.0 + model.backward(loss) + dist.barrier() + model.step() + dist.barrier() + model.save_checkpoint(save_folder, tag=None) + + _test_onebitlamb_checkpointing_overflow(args=args, + model=model, + hidden_dim=hidden_dim) + + +@pytest.mark.parametrize('topo', + [ + PipeTopo(num_pp=1, + num_dp=4), + PipeTopo(num_pp=2, + num_dp=2), + PipeTopo(num_pp=4, + num_dp=1), + ]) +def test_onebitlamb_fp16_pipeline(topo, tmpdir): + config_dict = { + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, + "steps_per_print": 20, + "optimizer": { + "type": "OneBitLamb", + "params": { + "lr": 0.00001, + "betas": [0.9, + 0.999], + "eps": 1e-8, + "weight_decay": 3e-7, + "freeze_step": 200, + "cuda_aware": False, + "comm_backend_name": "nccl" + } + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "fp16": { + "enabled": True, + "loss_scale": 0, + "initial_scale_power": 16 + }, + "pipeline": { + "seed_layers": True, + "activation_checkpoint_interval": 1 + } + } + args = args_from_dict(tmpdir, config_dict) + + # Allocate model for consistent initial weights. + init_net = AlexNetPipe() + + @distributed_test(world_size=4) + def _helper(topo, tmpdir, steps=500): + assert steps >= 100 + + test_net = copy.deepcopy(init_net) + test_model = PipelineModule(layers=test_net.to_layers(), + topology=topo, + loss_fn=nn.CrossEntropyLoss()) + + test_losses = train_cifar(test_model, + args, + num_steps=steps, + fp16=config_dict['fp16']['enabled']) + + _helper(topo, tmpdir) + + def test_compressed_allreduce_basic(tmpdir): @distributed_test(world_size=[1, 2]) def _test_compressed_allreduce_basic(): diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py index 61f07a196971..a29d22a2a954 100644 --- a/tests/unit/test_pipe_module.py +++ b/tests/unit/test_pipe_module.py @@ -96,6 +96,6 @@ def _helper(): base_output = base_output.to('cpu') pipe_output = pipe_output.to('cpu') - assert torch.allclose(base_output, pipe_output) + assert torch.allclose(base_output, pipe_output, atol=1e-4) _helper() diff --git a/tests/unit/test_zero_context.py b/tests/unit/test_zero_context.py index 0e5b2e0696e6..5ccccb5c18a0 100644 --- a/tests/unit/test_zero_context.py +++ b/tests/unit/test_zero_context.py @@ -1,9 +1,12 @@ import os +import sys +from types import SimpleNamespace + import torch import pytest import deepspeed -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape from common import distributed_test @@ -29,7 +32,7 @@ def test_scatter_gather(): with deepspeed.zero.Init(): l = torch.nn.Linear(6, 3) assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - assert l.weight.numel() == 1 + assert l.weight.shape == torch.Size(partitioned_param_data_shape) # Ensure there is no impact outside the context l2 = torch.nn.Linear(6, 3) @@ -62,55 +65,59 @@ def test_gather_update(): assert torch.equal(l.weight, torch.zeros_like(l.weight)) -@pytest.mark.skip('WIP') -def test_external_param(): +config_dict = { + "train_batch_size": 1, + "steps_per_print": 1, + "optimizer": { + 
"type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True, + "loss_scale": 138. + }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": 1, + } +} + + +def test_ext_param_getattr(): setup_serial_env() - print() - class ExtLinear(torch.nn.Module): - def __init__(self, dim=10, copycat=None): + def __init__(self, dim=16): super().__init__() self.dim = dim - self.linear = torch.nn.Linear(dim, dim) - if copycat is not None: - with deepspeed.zero.GatheredParameters(self.linear.weight, - modifier_rank=0), \ - torch.no_grad(): - self.linear.weight.copy_(copycat.linear.weight) - - if hasattr(self.linear.weight, 'ds_id'): - print('registering') - super().ds_register_external_parameter('samyam', self.linear.weight) + self.linear1 = torch.nn.Linear(dim, dim) + self.linear2 = torch.nn.Linear(dim, dim) def forward(self, input): - yamsam = self.linear(input) - if hasattr(self.linear.weight, 'ds_status'): - assert self.linear.weight.ds_status == ZeroParamStatus.AVAILABLE - jeff = torch.nn.functional.linear(yamsam, self.linear.weight) - return jeff + A = self.linear1(input) + B = self.linear2(A) - l1_base = ExtLinear().half().cuda() - l2_base = ExtLinear().half().cuda() + # external use of self.linear1.weight + C = torch.nn.functional.linear(B, self.linear1.weight) + return C.sum() - input = torch.rand(10).half().cuda() + net = ExtLinear() - l1_base_out = l1_base(input.clone().detach()) - l2_base_out = l2_base(input.clone().detach()) + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) - with deepspeed.zero.Init(): - l1_test = ExtLinear(copycat=l1_base).cuda() - #l2_test = ExtLinear(copycat=l2_base).cuda() - assert l1_test.linear.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - - # XXX l1 and l2 share their external parameter (l2.linear.weight) - - assert l1_test.linear.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - l1_test_out = l1_test(input.clone().detach()) - #assert torch.allclose(l1_base_out, l1_test_out) + with deepspeed.zero.GatheredParameters(net.linear1.weight): + assert net.linear1.weight.numel() == net.dim**2 - #l2_test_out = l2_test(input.clone().detach()) - #assert torch.allclose(l2_base_out, l2_test_out) + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() def test_scatter_halftype(): @@ -122,3 +129,117 @@ def test_scatter_halftype(): y = torch.LongTensor([3, 3]) assert y.dtype == torch.long + + +class DanglingBias(torch.nn.Linear): + def forward(self, *inputs): + out = super().forward(*inputs) + # return the bias to trigger a dangling external param + return out, self.bias + + +class DataClass: + """Just wraps data in an object. 
""" + def __init__(self, out=None, bias=None): + self.out = out + self.bias = bias + + +class DanglingBiasClass(DanglingBias): + def forward(self, *inputs): + out, bias = super().forward(*inputs) + return DataClass(out=out, bias=bias) + + +class DanglingAttention(torch.nn.Linear): + def __init__(self, dim=16, return_obj=False): + super().__init__(dim, dim) + self.dim = dim + self.return_obj = return_obj + if return_obj: + self.d_linear = DanglingBiasClass(dim, dim) + else: + self.d_linear = DanglingBias(dim, dim) + + def forward(self, input): + out = super().forward(input) + if self.return_obj: + out_obj = self.d_linear(out) + assert out_obj.bias.ds_status == ZeroParamStatus.AVAILABLE + # forward the external param + return out_obj.out, out_obj.bias + else: + out, bias = self.d_linear(out) + assert bias.ds_status == ZeroParamStatus.AVAILABLE + return out, bias + + +class ModelContainer(torch.nn.Module): + def __init__(self, dim=16, return_obj=False): + super().__init__() + self.dim = dim + self.linear1 = torch.nn.Linear(dim, dim) + self.dangler = DanglingAttention(dim, return_obj=return_obj) + + def forward(self, input): + act1 = self.linear1(input) + # bias is actually dangler.d_linear1.bias + act2, bias = self.dangler(act1) + assert bias.ds_status == ZeroParamStatus.AVAILABLE + return (act2 + bias).sum() + + +class DanglingExt(torch.nn.Module): + def __init__(self, dim=16): + super().__init__() + self.dim = dim + self.container = ModelContainer(dim) + + def forward(self, input): + out = self.container(input) + + # Make sure it's at the right level of the stack + assert len(self._external_params) == 0 + assert len(self.container._external_params) == 1 + assert len(self.container.dangler._external_params) == 0 + return out + + +def test_ext_param_return(): + setup_serial_env() + + net = DanglingExt() + + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + engine.backward(loss) + engine.step() + + +@pytest.mark.skip('WIP') +def test_ext_param_returnobj(): + setup_serial_env() + print() + + net = ModelContainer(return_obj=True) + + args = SimpleNamespace(local_rank=0) + engine, optim, _, _ = deepspeed.initialize(args=args, + model=net, + model_parameters=net.parameters(), + config_params=config_dict) + + for _ in range(5): + input = torch.rand(net.dim).to(engine.device).half() + loss = engine(input) + assert len(net._external_params) == 1 + assert len(net.dangler._external_params) == 0 + engine.backward(loss) + engine.step() diff --git a/tests/unit/test_zero_tiled.py b/tests/unit/test_zero_tiled.py new file mode 100644 index 000000000000..a8b63b11d32a --- /dev/null +++ b/tests/unit/test_zero_tiled.py @@ -0,0 +1,169 @@ +import copy + +import torch +import deepspeed +from deepspeed.runtime.zero.tiling import TiledLinear, TiledLinearReturnBias + +import pytest + + +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2), (5, 5), (32, 32)]) +def test_tiled_init(in_splits, out_splits): + in_f = 32 + out_f = 40 + base = torch.nn.Linear(in_f, out_f, bias=True) + l = TiledLinear(in_f, + out_f, + bias=True, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + for out_id in range(out_splits): + for in_id in range(in_splits): + local_l = l.linears[out_id][in_id] + assert isinstance(local_l, torch.nn.Linear) + + rstart = l.out_parts[out_id] + 
rstop = l.out_parts[out_id + 1] + cstart = l.in_parts[in_id] + cstop = l.in_parts[in_id + 1] + + local_out = rstop - rstart + local_in = cstop - cstart + assert local_l.weight.size()[1] == local_in, f'local[{out_id}][{in_id}].size {local_l.weight.size()}' + assert local_l.weight.size()[0] == local_out + + test = base.weight[rstart:rstop, cstart:cstop] + + assert local_l.weight.size() == test.size() + assert torch.equal(local_l.weight.data, test.data) + + if in_id == in_splits - 1: + assert local_l.bias is not None + assert local_l.bias.size()[0] == local_out + else: + assert local_l.bias is None + + +@pytest.mark.parametrize('in_splits,out_splits', [(0, 0), (33, 33)]) +def test_tiled_baddim(in_splits, out_splits): + dim = 32 + with pytest.raises(RuntimeError): + l = TiledLinear(dim, dim, out_splits=out_splits, in_splits=in_splits) + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_forward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + + assert torch.allclose(base_out, test_out, rtol=1e-4) + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_backward(in_splits, out_splits, bias, in_f, out_f): + base = torch.nn.Linear(in_f, out_f, bias=bias) + test = TiledLinear(in_f, + out_f, + bias=bias, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out = base(copy.deepcopy(inp)) + test_out = test(copy.deepcopy(inp)) + assert torch.allclose(base_out, test_out, rtol=1e-4) + + base_out.sum().backward() + test_out.sum().backward() + + # compare grads + for row in range(out_splits): + rstart = test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) + + +class LinearWrapper(torch.nn.Linear): + """Returns its own bias to simulate Megatron-LM's behavior. + + Megatron-LM optionally delays the bias addition to fuse with a subsequent kernel.
+ """ + def forward(self, input): + out = super().forward(input) + return out, self.bias + + +@pytest.mark.parametrize('bias', [False, True]) +@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) +@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) +def test_tiled_returnbias_backward(in_splits, out_splits, bias, in_f, out_f): + base = LinearWrapper(in_f, out_f, bias=bias) + test = TiledLinearReturnBias(in_f, + out_f, + bias=bias, + linear_cls=LinearWrapper, + init_linear=copy.deepcopy(base), + out_splits=out_splits, + in_splits=in_splits) + + inp = torch.rand(in_f) + + base_out_t, base_out_b = base(copy.deepcopy(inp)) + test_out_t, test_out_b = test(copy.deepcopy(inp)) + assert torch.allclose(base_out_t, test_out_t, rtol=1e-4) + if base_out_b is None: + assert test_out_b is None + base_out_b = torch.zeros_like(base_out_t) + test_out_b = torch.zeros_like(test_out_t) + else: + assert test_out_b is not None + assert torch.allclose(base_out_b, test_out_b, rtol=1e-4) + + (base_out_t + base_out_b).sum().backward() + (test_out_t + test_out_b).sum().backward() + + # compare grads + for row in range(out_splits): + rstart = test.out_parts[row] + rstop = test.out_parts[row + 1] + + for col in range(in_splits): + cstart = test.in_parts[col] + cstop = test.in_parts[col + 1] + + local = test.linears[row][col] + base_grad = base.weight.grad[rstart:rstop, cstart:cstop] + assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) + + if local.bias is not None: + base_grad = base.bias.grad[rstart:rstop] + assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) diff --git a/version.txt b/version.txt index e4737652ca5a..9e29e106195a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.3.13 +0.3.15