From f1c8c3ba7f0e138e201b7a61d4fcdbb459269e98 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:20:30 +0800 Subject: [PATCH 01/18] Fix kunlunxin-glm training. (#242) * Fix kunlunxin GLM training configs * Relocate xacc install logic * Modify max_steps for config 1x1 and 2x8 --- training/kunlunxin/docker_image/pytorch/pytorch_install.sh | 2 ++ training/kunlunxin/glm-pytorch/config/config_R300x1x1.py | 2 +- training/kunlunxin/glm-pytorch/config/config_R300x2x8.py | 2 +- .../kunlunxin/glm-pytorch/config/environment_variables.sh | 2 ++ training/kunlunxin/glm-pytorch/config/requirements.txt | 1 + training/kunlunxin/glm-pytorch/extern/trainer_adapter.py | 5 +++-- 6 files changed, 10 insertions(+), 4 deletions(-) diff --git a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh index 850a304b4..2a96fe267 100644 --- a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh +++ b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh @@ -4,3 +4,5 @@ set -xe pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl + +python -m xacc.install diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index 8a4c96915..cd9afdd40 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -14,4 +14,4 @@ lr_decay_iters = 4338 log_freq = 1 seed = 4096 -max_samples_termination = 925510 +max_samples_termination = 4000 diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index 840259660..fa79a403e 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -14,4 +14,4 @@ lr_decay_iters = 4338 log_freq = 1 seed = 4096 -max_samples_termination = 2776540 +max_samples_termination = 20000 diff --git a/training/kunlunxin/glm-pytorch/config/environment_variables.sh b/training/kunlunxin/glm-pytorch/config/environment_variables.sh index 9c9f20b8e..b527e7873 100755 --- a/training/kunlunxin/glm-pytorch/config/environment_variables.sh +++ b/training/kunlunxin/glm-pytorch/config/environment_variables.sh @@ -17,3 +17,5 @@ export ALLREDUCE_FUSION=0 export XMLIR_F_XPU_FC_GEMM_MODE=float export XMLIR_F_FAST_INDEX_PUT=true + +export XACC_ENABLE=1 diff --git a/training/kunlunxin/glm-pytorch/config/requirements.txt b/training/kunlunxin/glm-pytorch/config/requirements.txt index 8bac0066c..46109702b 100644 --- a/training/kunlunxin/glm-pytorch/config/requirements.txt +++ b/training/kunlunxin/glm-pytorch/config/requirements.txt @@ -2,3 +2,4 @@ h5sparse boto3 h5py numpy>=1.15.4 +psutil diff --git a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py index 3ac97e41a..719532ede 100644 --- a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py +++ b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py @@ -1,13 +1,13 @@ import torch -import config - from torch import nn import torch.distributed as dist +import config from optimizers import get_optimizer_param_groups from optimizers.loss_scaler import DynamicLossScaler from driver.dist_pytorch import main_proc_print +import torch_xmlir 
import torch_xmlir.core.xpu_model as xm from torch_xmlir.optimizer import AdamW as Adam from torch_xmlir.nn.clip_grad import clip_grad_norm @@ -79,4 +79,5 @@ def _clip_grad(): if DynamicLossScaler._has_inf_or_nan(reduced_loss): main_proc_print("Found NaN loss, skip backward") + torch_xmlir.xpu.empty_cache() return reduced_loss From 9603550e72c037bbe60b5a641c53dc879920f2c6 Mon Sep 17 00:00:00 2001 From: Zhou Yu Date: Thu, 14 Sep 2023 11:40:06 +0800 Subject: [PATCH 02/18] glm: fix dataset url (#248) Co-authored-by: zhouyu --- training/benchmarks/glm/README.md | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/training/benchmarks/glm/README.md b/training/benchmarks/glm/README.md index 0481948e9..2dfa685bd 100644 --- a/training/benchmarks/glm/README.md +++ b/training/benchmarks/glm/README.md @@ -13,20 +13,38 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of ### 数据集 -- 数据集下载地址 -> `https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip` +- 数据集及checkpoint下载地址 +>`https://model.baai.ac.cn/model-detail/100097` +> 文件名:`glm_train_dataset.zip` - 预处理 -> 无需预处理 +- 无需预处理,解压缩数据集即可。 +```bash +unzip glm_train_dataset.zip +``` +解压后的目录结构 +```bash +├── ReCoRD +│ └── glm_train_eval_hdf5_sparse +│ ├── eval_hdf5 +│ │ └── eval_sparse.hdf5 +│ └── train_hdf5 +│ └── train_sparse.hdf5 +├── blocklm-large-blank +│ ├── 200000 +│ │ └── mp_rank_00_model_states.pt +│ └── latest_checkpointed_iteration.txt +``` + + -### 模型checkpoint -> `https://cloud.tsinghua.edu.cn/d/13f5b03da9594e5490c4/files/?p=%2Fglm-large-blank.tar.bz2` ### 框架与芯片支持情况 | | Pytorch |Paddle|TensorFlow2| | ---- | ---- | ---- | ---- | | Nvidia GPU | ✅ |N/A |N/A| | 昆仑芯 XPU | ✅ |N/A |N/A| +| 天数智芯 GPU | ✅ |N/A |N/A| From 4cd6d53d7adf7d9f8cef7d80cb9413dfa0261705 Mon Sep 17 00:00:00 2001 From: jinxiangshi <44688400+jinxiangshi@users.noreply.github.com> Date: Thu, 14 Sep 2023 11:41:15 +0800 Subject: [PATCH 03/18] kunlunxin berfLarge inference configs && results (#212) * kunlunxin inference : add bertLarge * Revert "kunlunxin inference : add bertLarge" This reverts commit cd9127c79de9c46f26c90edf09a6e5c65fe93054. * kunlunxin inference : add bertLarge * kunlunxin : remove re-install transformers * adjust env for bertlarge * kunlunxin: update bertLarge performance * Update BertLarge performance --------- Co-authored-by: zhaoyixuan02 Co-authored-by: Shi Jinxiang --- inference/benchmarks/bertLarge/README.md | 20 +++++++++- .../kunlunxin_configurations.yaml | 3 ++ inference/inference_engine/kunlunxin/xtcl.py | 40 +++++++++++++------ 3 files changed, 49 insertions(+), 14 deletions(-) create mode 100644 inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md index f84a474eb..349240525 100644 --- a/inference/benchmarks/bertLarge/README.md +++ b/inference/benchmarks/bertLarge/README.md @@ -40,6 +40,24 @@ bert_reference_results_text_md5.txt - TensorRT 8.5.1.7 +#### 2.2 昆仑芯R200 + +- ##### 硬件环境 + - 机器、加速卡型号: R200 + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.15.0-56-generic + - 加速卡驱动版本:4.0 + - Docker 版本:20.10.21 + - 依赖软件版本: + - pytorch: 1.13.0+cpu + - onnx: 1.14.0 + +- 推理工具包 + + - XTCL 2.1 + ### 4. 
运行情况(BERT-Large) * 指标列表 @@ -64,5 +82,5 @@ bert_reference_results_text_md5.txt | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 | | tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 | - +| kunlunxin_xtcl| W32A16 | 32 |3867.6 | None | None | 93.8 | 124.9 | None | 0.638/0.638| None| diff --git a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml new file mode 100644 index 000000000..c29b9c46b --- /dev/null +++ b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml @@ -0,0 +1,3 @@ +compiler: xtcl +no_validation: true +exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py index 396cc3ae9..2643f51d5 100755 --- a/inference/inference_engine/kunlunxin/xtcl.py +++ b/inference/inference_engine/kunlunxin/xtcl.py @@ -3,7 +3,8 @@ import tvm.relay as relay from tvm.contrib.download import download_testdata from tvm.relay import param_dict -from tvm.contrib import xpu_config +from tvm.contrib import graph_executor, xpu_config +from tvm.runtime.vm import VirtualMachine import torch import os import subprocess @@ -11,8 +12,10 @@ import numpy as np import time +USE_VM_COMPILE = False + class InferModel: - + def __init__(self, config , onnx_path, model): self.input_names = [] self.engine = self.build_engine(config, onnx_path) @@ -27,7 +30,7 @@ def build_engine(self, config, onnx_path): input_name = input.name #'inputs:0' self.input_names.append(input_name) shape_dict[input_name] = input_shape - + mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}' @@ -44,21 +47,32 @@ def build_engine(self, config, onnx_path): config_var_dtype_map=input_fp16, ).value() else: ## fp32 - os.environ['XTCL_USE_FP16'] = '0' - os.environ['XTCL_QUANTIZE_WEIGHT'] = '0' + os.environ['XTCL_USE_FP16'] = '1' + os.environ['XTCL_QUANTIZE_WEIGHT'] = '1' with tvm.transform.PassContext(opt_level=3, config=build_config): - vm_exec = relay.backend.vm.compile(mod, - target=target_host, - target_host=target_host, - params=params) - from tvm.runtime.vm import VirtualMachine - vm = VirtualMachine(vm_exec, ctx) - return vm + if USE_VM_COMPILE: + vm_exec = relay.backend.vm.compile(mod, + target=target_host, + target_host=target_host, + params=params) + + vm = VirtualMachine(vm_exec, ctx) + return vm + else: + graph, lib, params = relay.build(mod, + target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2", + params=params) + m = graph_executor.create(graph, lib, ctx) + m.set_input(**params) + return m def __call__(self, model_inputs: list): for index, input_name in enumerate(self.input_names): - self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index])) + if USE_VM_COMPILE: + self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index])) + else: + self.engine.set_input(input_name, tvm.nd.array(model_inputs[index])) self.engine.run() output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())] foo_time_start = time.time() From e8c406d6ebf2d86d85bc564a2d87c19f45fedb08 Mon Sep 17 00:00:00 2001 From: Zhou Yu Date: Thu, 14 Sep 2023 
11:41:59 +0800 Subject: [PATCH 04/18] update cpm 1x1 running stats (#238) Co-authored-by: zhouyu --- training/nvidia/cpm-pytorch/README.md | 2 +- training/nvidia/cpm-pytorch/config/config_A100x1x1.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/training/nvidia/cpm-pytorch/README.md b/training/nvidia/cpm-pytorch/README.md index 7f26bfccc..e16f10928 100644 --- a/training/nvidia/cpm-pytorch/README.md +++ b/training/nvidia/cpm-pytorch/README.md @@ -48,5 +48,5 @@ | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | | A100单机8卡(1x8) | fp16 | / | 1641 | 587 | 835 | 1059 | 0.92 | 12.9/40.0 | | A100单机8卡(1x8) | fp16 | bs=128,lr=0.002 | 5469 | 771 | 1090 | 1292 | 0.918 | 23.1/40.0 | -| A100单机单卡(1x1) | fp16 | bs=192,lr=0.0005 | | 78.4 | 111.9 | 127.2 | | 34.8/40.0 | +| A100单机单卡(1x1) | fp16 | bs=192,lr=0.0005 | | 98.8 | 143.8 | 168.8 | | 39.5/40.0 | | A100两机8卡(2x8) | fp16 | bs=192,lr=0.0005 | | 1583 | 2221 | 2583.8 | | 29.9/40.0 | \ No newline at end of file diff --git a/training/nvidia/cpm-pytorch/config/config_A100x1x1.py b/training/nvidia/cpm-pytorch/config/config_A100x1x1.py index ea439af5e..85374eb0e 100644 --- a/training/nvidia/cpm-pytorch/config/config_A100x1x1.py +++ b/training/nvidia/cpm-pytorch/config/config_A100x1x1.py @@ -3,13 +3,13 @@ fp16 = True dist_backend = "nccl" -target_embedding_average = 0.8 +target_embedding_average = 0.92 gradient_accumulation_steps = 1 train_batch_size = 32 eval_batch_size = train_batch_size -max_steps = 60000 +max_steps = 3000 max_samples_termination = 439126000 warmup = 0.2 From c9d87bb087a913307a63926ed39a73fd0bd62ec9 Mon Sep 17 00:00:00 2001 From: Zhou Yu Date: Thu, 14 Sep 2023 11:44:43 +0800 Subject: [PATCH 05/18] update data_dir for test_conf (#247) Co-authored-by: zhouyu --- training/run_benchmarks/config/test_conf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index fef579ed2..771b5cbbf 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -53,9 +53,9 @@ "model:framework:hardwareID:nnodes:nproc:repeat": "dataset path"} ''' CASES = { - "bert:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/bert/train/", - "glm:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/glm/train/", - "cpm:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/cpm/train/", + "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", + "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", + "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", # "mobilenetv2:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "vit:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", @@ -72,6 +72,6 @@ # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", - # "transformer:pytorch_1.13:A100:1:8:1": "/home/datasets_ckpt/transformer/train/", + # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", } From e9ee7c7e3f6dceda757c088336aa775e3d961861 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Fri, 15 Sep 2023 15:32:27 +0800 Subject: [PATCH 06/18] Add DistilBERT model (#249) * Add DistilBert with training logic under developing * DistilBert for 1x1 GPU training * DistilBert for 1x8 GPU training * Add README and externel configs * Remove 
non-necessary files * Restore environment_varaibles.sh from kunlunxin-cpm * Update training configurations in _base.py update max_epoch and target_acc * Update README.md * Add nvidia pytorch1.12 docker * Update README.md * Add 1x1 2x8 cases * Add p_core unit name * Add p_core unit name * Update README.md --------- Co-authored-by: wangyakai --- training/benchmarks/distilbert/README.md | 49 +++++++ .../distilbert/pytorch/config/__init__.py | 2 + .../distilbert/pytorch/config/_base.py | 54 +++++++ .../pytorch/config/mutable_params.py | 6 + .../create_train_eval_data.py | 40 ++++++ .../pytorch/dataloaders/__init__.py | 1 + .../pytorch/dataloaders/dataloader.py | 134 ++++++++++++++++++ .../distilbert/pytorch/model/__init__.py | 22 +++ .../distilbert/pytorch/optimizers/__init__.py | 27 ++++ .../distilbert/pytorch/run_pretraining.py | 128 +++++++++++++++++ .../distilbert/pytorch/schedulers/__init__.py | 11 ++ .../distilbert/pytorch/train/__init__.py | 0 .../distilbert/pytorch/train/evaluator.py | 44 ++++++ .../distilbert/pytorch/train/trainer.py | 125 ++++++++++++++++ .../pytorch/train/trainer_adapter.py | 34 +++++ .../pytorch/train/training_state.py | 78 ++++++++++ training/nvidia/distilbert-pytorch/README.md | 45 ++++++ .../config/config_A100x1x1.py | 2 + .../config/config_A100x1x8.py | 1 + .../config/config_A100x2x8.py | 1 + .../config/requirements.txt | 2 + .../nvidia/distilbert-pytorch/extern/.gitkeep | 0 .../docker_image/pytorch_1.12/Dockerfile | 4 + .../pytorch_1.12/pytorch1.12_install.sh | 1 + training/run_benchmarks/config/test_conf.py | 1 + 25 files changed, 812 insertions(+) create mode 100644 training/benchmarks/distilbert/README.md create mode 100644 training/benchmarks/distilbert/pytorch/config/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/config/_base.py create mode 100644 training/benchmarks/distilbert/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py create mode 100644 training/benchmarks/distilbert/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/distilbert/pytorch/model/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/optimizers/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/run_pretraining.py create mode 100644 training/benchmarks/distilbert/pytorch/schedulers/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/train/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/train/evaluator.py create mode 100644 training/benchmarks/distilbert/pytorch/train/trainer.py create mode 100644 training/benchmarks/distilbert/pytorch/train/trainer_adapter.py create mode 100644 training/benchmarks/distilbert/pytorch/train/training_state.py create mode 100644 training/nvidia/distilbert-pytorch/README.md create mode 100644 training/nvidia/distilbert-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/distilbert-pytorch/config/config_A100x1x8.py create mode 100644 training/nvidia/distilbert-pytorch/config/config_A100x2x8.py create mode 100644 training/nvidia/distilbert-pytorch/config/requirements.txt create mode 100644 training/nvidia/distilbert-pytorch/extern/.gitkeep create mode 100644 training/nvidia/docker_image/pytorch_1.12/Dockerfile create mode 100644 training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh diff --git a/training/benchmarks/distilbert/README.md 
b/training/benchmarks/distilbert/README.md new file mode 100644 index 000000000..fa2aeb4ef --- /dev/null +++ b/training/benchmarks/distilbert/README.md @@ -0,0 +1,49 @@ +## Model Introduction +### DistilBERT base model (uncased) + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-uncased). It was +introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found +[here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation). This model is uncased: it does +not make a difference between english and English. + +## Model and Training Scripts source code +Pytorch case: +This repository includes software from https://github.com/huggingface/transformers/tree/v4.33.0 +licensed under the Apache License 2.0. + +Some of the files in this directory were modified by BAAI in 2023 to support FlagPerf. + +## Dataset and Model Checkpoints + +> Dataset website:https://huggingface.co/datasets/sst2 +https://huggingface.co/distilbert-base-uncased +> Model checkpoint website: https://huggingface.co/distilbert-base-uncased + +We have already preprocessed the dataset and the model checkpoint files(The preprocessing script is `training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py`). +The preprocessed can be downloaded directly from https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/distilbert_train.tar. +No additional preprocessing steps need to be conducted. + +After decompressing, the dataset and model checkpoint files are organized as the following: + +``` +distilbert +├── dataset # dataset files +│ ├── eval_dataset.npz +│ └── train_dataset.npz +└── model # model checkpoint and config files + ├── config.json + ├── pytorch_model.bin + ├── special_tokens_map.json + ├── tokenizer_config.json + └── vocab.txt +``` + +## Benchmark Task and Target Accuracy +This experiment is to finetune a text classification task on SST-2 dataset with DistilBERT-base-uncased pretrained checkpoints. +After finetuning 10 epoches, the DistilBERT-base-uncased model is able to achieve accuracy score of 90+, which matches the evaluation result on the [report](https://huggingface.co/distilbert-base-uncased). 
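For orientation, here is a minimal, self-contained sketch of the same fine-tuning objective (SST-2 sequence classification with `distilbert-base-uncased`). It is illustrative only: it is not the FlagPerf entry point, and the batch size, device handling, and epoch count are simplified assumptions drawn from the configs elsewhere in this patch.

```python
import torch
from datasets import load_dataset
from transformers import (DistilBertForSequenceClassification,
                          DistilBertTokenizer, get_scheduler)

# Tokenize SST-2 the same way the preprocessing script in this patch does.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
raw = load_dataset("sst2")
encoded = raw.map(lambda ex: tokenizer(ex["sentence"], padding="max_length",
                                       truncation=True), batched=True)
encoded = encoded.remove_columns(["idx", "sentence"]).rename_column("label", "labels")
encoded.set_format("torch")

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

loader = torch.utils.data.DataLoader(encoded["train"], batch_size=32, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 10  # assumed; matches max_epoch in config/_base.py below
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0,
                          num_training_steps=num_epochs * len(loader))

for _ in range(num_epochs):
    model.train()
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # HF sequence-classification models return the cross-entropy loss
        # when "labels" are passed in the batch.
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

Note that the benchmark itself does not tokenize on the fly: it loads the preprocessed `train_dataset.npz`/`eval_dataset.npz` files and drives training through the FlagPerf `Trainer`/`trainer_adapter` defined later in this patch.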
+ +## AI Frameworks && Accelerators supports + +| | Pytorch | Paddle | TensorFlow2 | +| ---------- | ------- | ------ | ----------- | +| Nvidia GPU | [✅](../../nvidia/distilbert-pytorch/README.md) | N/A | N/A | diff --git a/training/benchmarks/distilbert/pytorch/config/__init__.py b/training/benchmarks/distilbert/pytorch/config/__init__.py new file mode 100644 index 000000000..d877a8e37 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/config/_base.py b/training/benchmarks/distilbert/pytorch/config/_base.py new file mode 100644 index 000000000..799d342db --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/config/_base.py @@ -0,0 +1,54 @@ +# DO NOT MODIFY THESE REQUIRED PARAMETERS + +# Required parameters +vendor: str = None +data_dir: str = None +name: str = "distilbert" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +# Optional parameters + +# ========================================================= +# loss scale +# ========================================================= +lr: float = 5e-5 +weight_decay = 0.0 + +# ========================================================= +# train && evaluate +# ========================================================= +train_batch_size: int = 4 +eval_batch_size: int = 4 + +max_epoch: int = 10 +target_acc: float = 0.91 + +do_train = True +distributed: bool = True + + +# ========================================================= +# utils +# ========================================================= +seed: int = 0 +dist_backend: str = 'nccl' +device: str = None + +# ========================================================= +# datasets +# ========================================================= +dataloader_drop_last: bool = False +dataloader_num_workers: int = 8 + +# ========================================================= +# for driver +# ========================================================= +local_rank: int = -1 +use_env: bool = True +log_freq: int = 1000 +print_freq: int = 1000 +n_device: int = 1 +sync_bn: bool = False +gradient_accumulation_steps: int = 1 + diff --git a/training/benchmarks/distilbert/pytorch/config/mutable_params.py b/training/benchmarks/distilbert/pytorch/config/mutable_params.py new file mode 100644 index 000000000..6a1879263 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/config/mutable_params.py @@ -0,0 +1,6 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'weight_decay', 'train_batch_size', + 'gradient_accumulation_steps', 'eval_batch_size', 'do_train', + 'distributed', 'dist_backend', 'device', 'cudnn_benchmark', + 'cudnn_deterministic' +] diff --git a/training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py b/training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py new file mode 100644 index 000000000..db5f0b327 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py @@ -0,0 +1,40 @@ +import os + +import numpy as np +from datasets import load_dataset +from transformers import DistilBertTokenizer + + +def save_dataset(ds, save_path): + np.savez(save_path, + idx=ds['idx'], + sentence=ds['sentence'], + label=ds['label'], + input_ids=ds['input_ids'], + attention_mask=ds['attention_mask'],) + + +def main(): + data_prefix = 'distilbert/dataset' + os.makedirs(data_prefix, exist_ok=True) + train_datapath = 
os.path.join(data_prefix, 'train_dataset.npz') + eval_datapath = os.path.join(data_prefix, 'eval_dataset.npz') + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + + raw_datasets = load_dataset("sst2") + + def tokenize_function(examples): + return tokenizer(examples["sentence"], padding="max_length", truncation=True) + + tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) + + train_dataset = tokenized_datasets["train"].with_format('numpy') + save_dataset(train_dataset, train_datapath) + + eval_dataset = tokenized_datasets["validation"].with_format('numpy') + save_dataset(eval_dataset, eval_datapath) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/dataloaders/__init__.py b/training/benchmarks/distilbert/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..83fa73435 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/dataloaders/__init__.py @@ -0,0 +1 @@ +from .dataloader import build_train_dataloader, build_eval_dataloader diff --git a/training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py b/training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py new file mode 100644 index 000000000..843176f80 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py @@ -0,0 +1,134 @@ +import os +import numpy as np +import torch +from torch.utils.data import Dataset +from torch.utils.data import DataLoader +from torch.utils.data.dataloader import default_collate +from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union +from collections.abc import Mapping +InputDataClass = NewType("InputDataClass", Any) + +class DistilBertDataset(Dataset): + def __init__(self, filepath): + origin_data = np.load(filepath) + self.idx = origin_data['idx'] + self.sentence = origin_data['sentence'] + self.label = origin_data['label'] + self.input_ids = origin_data['input_ids'] + self.attention_mask = origin_data['attention_mask'] + + def __len__(self): + return len(self.idx) + + def __getitem__(self, idx): + sample = { + 'sentence': self.sentence[idx], + 'label': self.label[idx], + 'input_ids': self.input_ids[idx], + 'attention_mask': self.attention_mask[idx], + } + return sample + + +def default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: + """ + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/data/data_collator.py#L105 + """ + if not isinstance(features[0], Mapping): + features = [vars(f) for f in features] + first = features[0] + batch = {} + + # Special handling for labels. + # Ensure that tensor is created with the correct type + # (it should be automatically the case, but let's make sure of it.) + if "label" in first and first["label"] is not None: + batch["labels"] = torch.tensor([f["label"] for f in features], dtype=torch.long) + elif "label_ids" in first and first["label_ids"] is not None: + if isinstance(first["label_ids"], torch.Tensor): + batch["labels"] = torch.stack([f["label_ids"] for f in features]) + else: + dtype = torch.long if type(first["label_ids"][0]) is int else torch.float + batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype) + + # Handling of all other possible keys. + # Again, we will use the first element to figure out which key/values are not None for this model. 
+ for k, v in first.items(): + if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): + if isinstance(v, torch.Tensor): + batch[k] = torch.stack([f[k] for f in features]) + elif isinstance(v, np.ndarray): + batch[k] = torch.tensor(np.stack([f[k] for f in features])) + else: + batch[k] = torch.tensor([f[k] for f in features]) + + return batch + + +def build_train_sampler(config, dataset): + if torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, seed = config.seed) + else: + generator = torch.Generator() + generator.manual_seed(config.seed) + sampler = torch.utils.data.RandomSampler(dataset, generator=generator) + return sampler + + +def build_train_dataloader(config): + train_dataset = DistilBertDataset( + os.path.join(config.data_dir, 'dataset', 'train_dataset.npz')) + + train_sampler = build_train_sampler(config, train_dataset) + data_loader = DataLoader( + train_dataset, + sampler=train_sampler, + batch_size=config.train_batch_size, + collate_fn=default_data_collator, + drop_last=config.dataloader_drop_last, + num_workers=config.dataloader_num_workers, + ) + return data_loader + + +def build_eval_sampler(dataset): + if torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank) + else: + sampler = None + return sampler + + +def build_eval_dataloader(config): + eval_dataset = DistilBertDataset( + os.path.join(config.data_dir, 'dataset', 'eval_dataset.npz')) + + eval_sampler = build_eval_sampler(eval_dataset) + data_loader = DataLoader( + eval_dataset, + sampler=eval_sampler, + batch_size=config.eval_batch_size, + collate_fn=default_data_collator, + drop_last=config.dataloader_drop_last, + num_workers=config.dataloader_num_workers, + ) + + return data_loader + + +if __name__ == '__main__': + from collections import namedtuple + Config = namedtuple( + 'Config', + ['data_dir', 'distributed', 'train_batch_size', 'eval_batch_size', 'dataloader_drop_last', 'dataloader_num_workers', 'seed']) + config = Config('distilbert', False, 4, 4, False, 8, 1234) + train_dataloader = build_train_dataloader(config) + for i, batch in enumerate(train_dataloader): + print(batch.keys()) + break \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/model/__init__.py b/training/benchmarks/distilbert/pytorch/model/__init__.py new file mode 100644 index 000000000..0b2938d19 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/model/__init__.py @@ -0,0 +1,22 @@ +import os +from collections import namedtuple + +from transformers import DistilBertConfig, DistilBertTokenizer +from transformers import DistilBertForSequenceClassification + + +def create_model(config): + model_path = os.path.join(config.data_dir, 'model') + hfconfig = DistilBertConfig.from_pretrained(model_path) + model = DistilBertForSequenceClassification.from_pretrained(model_path, + config=hfconfig) + tokenizer = DistilBertTokenizer.from_pretrained(model_path) + return model, hfconfig, tokenizer + + +if __name__ == '__main__': + + Config = namedtuple('Config', ['data_dir']) + config = Config('distilbert') + model, model_config, tokenizer = create_model(config) + import pdb; pdb.set_trace() \ No newline at end of file diff --git 
a/training/benchmarks/distilbert/pytorch/optimizers/__init__.py b/training/benchmarks/distilbert/pytorch/optimizers/__init__.py new file mode 100644 index 000000000..30d50d86f --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/optimizers/__init__.py @@ -0,0 +1,27 @@ +import torch + + +def create_optimizer(model, args): + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": + args.weight_decay, + }, + { + "params": [ + p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": + 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr) + return optimizer \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/run_pretraining.py b/training/benchmarks/distilbert/pytorch/run_pretraining.py new file mode 100644 index 000000000..ae32d9476 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/run_pretraining.py @@ -0,0 +1,128 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import os +import sys +import time +from typing import Any, Tuple + +# benchmarks目录 append到sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, + "../../"))) # benchmarks目录 +import config +from driver import Event, dist_pytorch +from driver.helper import InitHelper + +from train import trainer_adapter +from train.evaluator import Evaluator +from train.trainer import Trainer +from train.training_state import TrainingState +from dataloaders.dataloader import build_train_dataloader, build_eval_dataloader + +logger = None + + +def main() -> Tuple[Any, Any]: + global logger + global config + + # init + init_helper = InitHelper(config) + model_driver = init_helper.init_driver(globals(), locals()) + config = model_driver.config + dist_pytorch.init_dist_training_env(config) + dist_pytorch.barrier(config.vendor) + model_driver.event(Event.INIT_START) + + config.distributed = dist_pytorch.get_world_size() > 1 + # logger + logger = model_driver.logger + + train_dataloader = build_train_dataloader(config) + eval_dataloader = build_eval_dataloader(config) + + seed = config.seed + + init_helper.set_seed(seed, model_driver.config.vendor) + + # 创建TrainingState对象 + training_state = TrainingState() + + # 构建 trainer:依赖 evaluator、TrainingState对象 + evaluator = Evaluator(config, eval_dataloader) + trainer = Trainer(driver=model_driver, + adapter=trainer_adapter, + evaluator=evaluator, + training_state=training_state, + device=config.device, + config=config) + training_state._trainer = trainer + + # 设置分布式环境, trainer init() + dist_pytorch.barrier(config.vendor) + trainer.init(train_dataloader) + dist_pytorch.barrier(config.vendor) + + # evaluation统计 + init_evaluation_start = time.time() # evaluation起始时间,单位为秒 + + training_state.acc = evaluator.evaluate(trainer) + + init_evaluation_end = time.time() # evaluation结束时间,单位为秒 + + init_evaluation_info = dict(time=init_evaluation_end - + init_evaluation_start) + + model_driver.event(Event.INIT_EVALUATION, init_evaluation_info) + + if not config.do_train: + return config, training_state + + model_driver.event(Event.INIT_END) + + # TRAIN_START + dist_pytorch.barrier(config.vendor) + 
model_driver.event(Event.TRAIN_START) + train_start_time = time.time() + + # 训练过程 + epoch = 1 + while not training_state.end_training: + training_state.epoch = epoch + trainer.train_one_epoch(train_dataloader) + epoch += 1 + + # TRAIN_END事件 + training_state.train_time = time.time() - train_start_time + model_driver.event(Event.TRAIN_END) + + return config, training_state + + +if __name__ == "__main__": + start = time.time() + config_update, state = main() + if not dist_pytorch.is_main_process(): + sys.exit(0) + + # 训练信息写日志 + e2e_time = time.time() - start + if config_update.do_train: + + finished_info = { + "e2e_time": e2e_time, + "train_time": state.train_time, + "train_no_eval_time": state.no_eval_time, + "pure_training_computing_time": state.pure_compute_time, + "throughput(ips)_raw": state.num_trained_samples / state.train_time, + "throughput(ips)_no_eval": + state.num_trained_samples / state.no_eval_time, + "throughput(ips)_pure_compute": + state.num_trained_samples / state.pure_compute_time, + "converged": state.converged, + "acc": state.acc, + } + else: + finished_info = {"e2e_time": e2e_time} + logger.log(Event.FINISHED, message=finished_info, stacklevel=0) diff --git a/training/benchmarks/distilbert/pytorch/schedulers/__init__.py b/training/benchmarks/distilbert/pytorch/schedulers/__init__.py new file mode 100644 index 000000000..7ba78bbd8 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/schedulers/__init__.py @@ -0,0 +1,11 @@ +from transformers import get_scheduler + + +def create_scheduler(optimizer, train_dataloader, args): + lr_scheduler = get_scheduler( + name='linear', + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=len(train_dataloader) * args.max_epoch, + ) + return lr_scheduler \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/train/__init__.py b/training/benchmarks/distilbert/pytorch/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/distilbert/pytorch/train/evaluator.py b/training/benchmarks/distilbert/pytorch/train/evaluator.py new file mode 100644 index 000000000..27ae1c796 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/evaluator.py @@ -0,0 +1,44 @@ +import os + +import torch +from torch.types import Device + +from driver import dist_pytorch + +class Evaluator: + """Evaluator""" + def __init__(self, config, dataloader): + self.config = config + self.eval_dataloader = dataloader + self.device = config.device + + def process_batch(self, batch, device: Device): + """Process batch and produce inputs for the model.""" + for k, v in batch.items(): + batch[k] = v.to(device, non_blocking=True) + return batch + + def evaluate(self, trainer): + model = trainer.model + model.eval() + + total_output = 0.0 + num_examples = len(self.eval_dataloader.dataset) + with torch.no_grad(): + # For all the batches in the dataset. + for step, inputs in enumerate(self.eval_dataloader): + # Forward pass through the model. + inputs = self.process_batch(inputs, self.device) + output = model(**inputs) + # For accuracy, return the number of correctly predicted samples. + outputs = torch.argmax(output['logits'], -1) + correct = (outputs == inputs['labels']).float() + output = correct.sum() + + # Reduce across processes. 
+ if dist_pytorch.is_dist_avail_and_initialized(): + torch.distributed.all_reduce(output) + + total_output += output + acc = total_output / num_examples + return acc.item() \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/train/trainer.py b/training/benchmarks/distilbert/pytorch/train/trainer.py new file mode 100644 index 000000000..e3209b35d --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/trainer.py @@ -0,0 +1,125 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import time +import os +import sys +import math + +import torch +import torch.utils.data +from torch.types import Device + +from model import create_model +from optimizers import create_optimizer +from schedulers import create_scheduler +from train.evaluator import Evaluator +from train.training_state import TrainingState + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, Event, dist_pytorch + + +class Trainer: + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, config): + super(Trainer, self).__init__() + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.device = device + self.config = config + self.evaluator = evaluator + + def init(self, train_dataloader): + self.model, self.model_config, self.tokenizer = create_model( + self.config) + self.model.to(self.device) + + self.model = self.adapter.convert_model(self.model) + self.model = self.adapter.model_to_ddp(self.config, self.model) + + self.optimizer = create_optimizer(self.model, self.config) + self.lr_scheduler = create_scheduler(self.optimizer, train_dataloader, + self.config) + + + def process_batch(self, batch, device: Device): + """Process batch and produce inputs for the model.""" + for k, v in batch.items(): + batch[k] = v.to(device, non_blocking=True) + return batch + + + def train_one_epoch(self, dataloader): + state = self.training_state + driver = self.driver + driver.event(Event.EPOCH_BEGIN, state.epoch) + + no_eval_start = time.time() + for _, data in enumerate(dataloader): + data = self.process_batch(data, self.device) + + pure_compute_start = time.time() + state.global_steps += 1 + state.num_trained_samples = state.global_steps * dist_pytorch.global_batch_size( + self.config) + + driver.event(Event.STEP_BEGIN, step=state.global_steps) + self.train_one_step(data) + + train_end = time.time() + state.pure_compute_time += train_end - pure_compute_start + state.no_eval_time += train_end - no_eval_start + + other_state = dict() + if state.global_steps % self.config.gradient_accumulation_steps == 0: + sequences_per_second = state.num_trained_samples / state.no_eval_time + other_state["seq/s"] = sequences_per_second + + step_info = state.to_dict(**other_state) + driver.event(Event.STEP_END, + message=step_info, + step=state.global_steps, + loss=state.loss) + + no_eval_start = time.time() + + driver.event(Event.EPOCH_END, state.epoch) + eval_start = time.time() + state.acc = self.evaluator.evaluate(self) + eval_result = dict( + global_steps=state.global_steps, + acc=state.acc, + time=time.time() - eval_start) + driver.event(Event.EVALUATE, eval_result) + self.detect_training_status(state) + + + def train_one_step(self, data): + + state = self.training_state + self.model.train() + + outputs = self.model(**data) + #loss 为标量 + loss = 
outputs["loss"].item() + state.loss = loss + self.adapter.backward(self.config, state.global_steps, outputs["loss"], + self.optimizer) + self.lr_scheduler.step() + self.driver.event(Event.BACKWARD, state.global_steps, state.loss, + self.optimizer) + + + def detect_training_status(self, state: TrainingState): + if state.acc >= self.config.target_acc: + state.converged_success() + state.end_training = True + + if state.epoch >= self.config.max_epoch: + state.end_training = True + + return state.end_training + diff --git a/training/benchmarks/distilbert/pytorch/train/trainer_adapter.py b/training/benchmarks/distilbert/pytorch/train/trainer_adapter.py new file mode 100644 index 000000000..1dc597460 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/trainer_adapter.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import torch +from torch import nn +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel + +import config + +def convert_model(model: nn.Module) -> nn.Module: + """convert_model""" + return model + + +def model_to_ddp(config, model: nn.Module) -> nn.Module: + use_ddp = dist.is_initialized() + if use_ddp: + model = DistributedDataParallel( + model, + device_ids=[config.local_rank]) + + return model + + +def backward(config, step: int, loss: torch.Tensor, optimizer, **kwarg): + if config.gradient_accumulation_steps > 1: + loss = loss / config.gradient_accumulation_steps + + loss.backward() + + if step % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/train/training_state.py b/training/benchmarks/distilbert/pytorch/train/training_state.py new file mode 100644 index 000000000..e97ead272 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/training_state.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +from dataclasses import dataclass + +import torch +import inspect + +@dataclass +class TrainingState: + """TrainingState dataclass""" + _trainer = None + _status = 'aborted' # later set to 'success' if termination criteria met + + global_steps = 0 + + loss: float = 0.0 + acc: float = 0.0 + + epoch: int = 1 + + end_training: bool = False + converged: bool = False + + train_time = 0.0 + no_eval_time = 0.0 + pure_compute_time = 0.0 + + num_trained_samples = 0 + + def status(self): + """get status""" + if self.converged: + self._status = "success" + return self._status + + def converged_success(self): + """converged success""" + self.end_training = True + self.converged = True + + def _is_property(self, value): + status = [ + not callable(value), not inspect.isclass(value), + not inspect.ismodule(value), not inspect.ismethod(value), + not inspect.isfunction(value), not inspect.isbuiltin(value), + "classmethod object" not in str(value) + ] + return all(status) + + + def to_dict(self, **kwargs): + state_dict = dict() + + for var_name, value in self.__dict__.items(): + if not var_name.startswith("_") and self._is_property(value): + state_dict[var_name] = value + + lr = self._trainer.lr_scheduler.get_lr() + if isinstance(lr, (tuple, list)): + lr = lr[0] + state_dict["learning_rate"] = lr + + exclude = [ + "acc", "skipped_steps", + "converged", "init_time", "raw_train_time" + ] + for exkey in exclude: + if exkey in state_dict: + state_dict.pop(exkey) + + state_dict.update(kwargs) + + for k in state_dict.keys(): + if torch.is_tensor(state_dict[k]): + state_dict[k] = state_dict[k].item() + + return state_dict diff --git a/training/nvidia/distilbert-pytorch/README.md b/training/nvidia/distilbert-pytorch/README.md new file mode 100644 index 000000000..0ec6b5e89 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/README.md @@ -0,0 +1,45 @@ +### 1. 下载数据集和模型 +[下载链接](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/distilbert_train.tar) + +### 2. 设置test_conf.py + +为了使得`training/nvidia/distilbert-pytorch/config/requirements.txt`里的依赖库均能被下载,需要将`training/run_benchmarks/config/test_conf.py`里的`PIP_SOURCE`的值修改为`https://pypi.tuna.tsinghua.edu.cn/simple` + +### 3. 
Nvidia GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-1.12.0a0+bd13bc6 + - 依赖软件版本: + - cuda: 11.6 + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------- | +| 任务类别 | Summarization | | +| 模型 | distilbert | | +| 数据集 | SST-2 | | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际样本数数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=512) | +| 训练结果 | acc,见“性能指标” | 分类准确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| A100单机8卡(1x8) | fp32 | / | 361 | 1764.0 | 1861.9 | 1942.6 | 0.915 | 13.9 /40.0 | diff --git a/training/nvidia/distilbert-pytorch/config/config_A100x1x1.py b/training/nvidia/distilbert-pytorch/config/config_A100x1x1.py new file mode 100644 index 000000000..9d1621307 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/config_A100x1x1.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +gradient_accumulation_steps = 8 diff --git a/training/nvidia/distilbert-pytorch/config/config_A100x1x8.py b/training/nvidia/distilbert-pytorch/config/config_A100x1x8.py new file mode 100644 index 000000000..f85ba4108 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/config_A100x1x8.py @@ -0,0 +1 @@ +train_batch_size = 32 diff --git a/training/nvidia/distilbert-pytorch/config/config_A100x2x8.py b/training/nvidia/distilbert-pytorch/config/config_A100x2x8.py new file mode 100644 index 000000000..ed3891623 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/config_A100x2x8.py @@ -0,0 +1 @@ +train_batch_size = 16 diff --git a/training/nvidia/distilbert-pytorch/config/requirements.txt b/training/nvidia/distilbert-pytorch/config/requirements.txt new file mode 100644 index 000000000..a772ae8fa --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/requirements.txt @@ -0,0 +1,2 @@ +datasets==2.14.4 +transformers==4.33.0 \ No newline at end of file diff --git a/training/nvidia/distilbert-pytorch/extern/.gitkeep b/training/nvidia/distilbert-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/nvidia/docker_image/pytorch_1.12/Dockerfile b/training/nvidia/docker_image/pytorch_1.12/Dockerfile new file mode 100644 index 000000000..e57eaa828 --- /dev/null +++ b/training/nvidia/docker_image/pytorch_1.12/Dockerfile @@ -0,0 +1,4 @@ +FROM nvcr.io/nvidia/pytorch:22.04-py3 +RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python diff --git a/training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh b/training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh new file mode 100644 index 000000000..cc1f786e8 --- /dev/null +++ b/training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh @@ -0,0 +1 @@ +#!/bin/bash \ No newline at end of file diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 771b5cbbf..81bf848d5 100644 
--- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -71,6 +71,7 @@ # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", + # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", From c845525c015c6f0ea6122ae8f5d4215c0481cad6 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Fri, 15 Sep 2023 17:50:08 +0800 Subject: [PATCH 07/18] GPT2 (#205) * Add gpt2 model * Add gpt2 test case in test_conf.py * refine README and python files * Remove redundant codes and re-organize denpendency * remove redundancy files * refine gpt_dataset * "Refine traing job" * Refine README * fix typo in README.md * Update README.md * Add config for 1x1 2x8 * Update README.md 1x1 config * Update README.md --- training/benchmarks/gpt2/README.md | 49 + .../gpt2/pytorch/config/__init__.py | 2 + .../benchmarks/gpt2/pytorch/config/_base.py | 122 ++ .../gpt2/pytorch/config/mutable_params.py | 6 + .../gpt2/pytorch/dataloaders/__init__.py | 3 + .../gpt2/pytorch/dataloaders/dataloader.py | 33 + .../pytorch/dataloaders/gpt2_tokenization.py | 263 +++++ .../gpt2/pytorch/dataloaders/gpt_dataset.py | 302 +++++ .../pytorch/dataloaders/indexed_dataset.py | 344 ++++++ .../gpt2/pytorch/dataloaders/tokenizer.py | 137 +++ .../benchmarks/gpt2/pytorch/model/__init__.py | 13 + .../gpt2/pytorch/model/layers/__init__.py | 0 .../pytorch/model/layers/fused_bias_gelu.py | 43 + .../pytorch/model/layers/fused_softmax.py | 212 ++++ .../gpt2/pytorch/model/layers/layers.py | 288 +++++ .../gpt2/pytorch/model/layers/transformer.py | 985 ++++++++++++++++ .../gpt2/pytorch/model/layers/utils.py | 37 + .../gpt2/pytorch/model/losses/__init__.py | 0 .../pytorch/model/losses/cross_entropy.py | 32 + .../gpt2/pytorch/model/models/__init__.py | 20 + .../gpt2/pytorch/model/models/enums.py | 25 + .../gpt2/pytorch/model/models/gpt_model.py | 123 ++ .../pytorch/model/models/language_model.py | 502 ++++++++ .../gpt2/pytorch/model/models/module.py | 125 ++ .../gpt2/pytorch/model/models/utils.py | 54 + .../benchmarks/gpt2/pytorch/mpu/__init__.py | 11 + .../gpt2/pytorch/optimizer/__init__.py | 138 +++ .../gpt2/pytorch/optimizer/clip_grads.py | 86 ++ .../pytorch/optimizer/distrib_optimizer.py | 1011 +++++++++++++++++ .../gpt2/pytorch/optimizer/grad_scaler.py | 119 ++ .../gpt2/pytorch/optimizer/optimizer.py | 645 +++++++++++ .../gpt2/pytorch/run_pretraining.py | 144 +++ .../gpt2/pytorch/schedulers/__init__.py | 1 + .../gpt2/pytorch/schedulers/factory.py | 34 + .../schedulers/optimizer_param_scheduler.py | 214 ++++ .../benchmarks/gpt2/pytorch/train/__init__.py | 0 .../gpt2/pytorch/train/evaluator.py | 46 + .../benchmarks/gpt2/pytorch/train/trainer.py | 190 ++++ .../gpt2/pytorch/train/trainer_adapter.py | 57 + .../gpt2/pytorch/train/training_state.py | 79 ++ .../benchmarks/gpt2/pytorch/train/utils.py | 101 ++ training/nvidia/gpt2-pytorch/README.md | 42 + .../gpt2-pytorch/config/config_A100x1x1.py | 4 + .../gpt2-pytorch/config/config_A100x1x8.py | 1 + .../gpt2-pytorch/config/config_A100x2x8.py | 1 + .../gpt2-pytorch/config/config_common.py | 6 + training/nvidia/gpt2-pytorch/extern/.gitkeep | 0 training/run_benchmarks/config/test_conf.py | 1 + 48 files changed, 6651 insertions(+) create mode 100644 
training/benchmarks/gpt2/README.md create mode 100644 training/benchmarks/gpt2/pytorch/config/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/config/_base.py create mode 100644 training/benchmarks/gpt2/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/tokenizer.py create mode 100644 training/benchmarks/gpt2/pytorch/model/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/layers.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/transformer.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/utils.py create mode 100755 training/benchmarks/gpt2/pytorch/model/losses/__init__.py create mode 100755 training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/enums.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/gpt_model.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/language_model.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/module.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/utils.py create mode 100644 training/benchmarks/gpt2/pytorch/mpu/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/optimizer.py create mode 100644 training/benchmarks/gpt2/pytorch/run_pretraining.py create mode 100755 training/benchmarks/gpt2/pytorch/schedulers/__init__.py create mode 100755 training/benchmarks/gpt2/pytorch/schedulers/factory.py create mode 100644 training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py create mode 100644 training/benchmarks/gpt2/pytorch/train/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/train/evaluator.py create mode 100644 training/benchmarks/gpt2/pytorch/train/trainer.py create mode 100644 training/benchmarks/gpt2/pytorch/train/trainer_adapter.py create mode 100644 training/benchmarks/gpt2/pytorch/train/training_state.py create mode 100644 training/benchmarks/gpt2/pytorch/train/utils.py create mode 100644 training/nvidia/gpt2-pytorch/README.md create mode 100755 training/nvidia/gpt2-pytorch/config/config_A100x1x1.py create mode 100755 training/nvidia/gpt2-pytorch/config/config_A100x1x8.py create mode 100755 training/nvidia/gpt2-pytorch/config/config_A100x2x8.py create mode 100755 training/nvidia/gpt2-pytorch/config/config_common.py create mode 100644 training/nvidia/gpt2-pytorch/extern/.gitkeep diff --git a/training/benchmarks/gpt2/README.md 
b/training/benchmarks/gpt2/README.md new file mode 100644 index 000000000..f720d14a8 --- /dev/null +++ b/training/benchmarks/gpt2/README.md @@ -0,0 +1,49 @@ +### 模型信息 +- 模型介绍 + +GPT-2 Medium is the 345M parameter version of Megatron-GPT2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective. + +>[Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) + +- 模型代码来源 + +This case includes code from open source project at https://github.com/NVIDIA/Megatron-LM/tree/v3.0/megatron + +Some of the files in this directory were modified by BAAI in 2023 to support FlagPerf. + + +### 数据集 +- 数据集下载地址 +> Dataset website:https://huggingface.co/datasets/lambada + +> The training data should be downloaded from huggingface. First, download training data in a loose json format, with one json containing a text sample per line. For example in python interpreter: + +``` +from datasets import load_dataset + +train_data = load_dataset('lambada', split='train') +train_data.to_json("lambada.train.json", lines=True) +``` + +- 预处理 +> The training data requires preprocessing. +The loose json is then processed into a binary format for training. To convert the json into mmap format use preprocess_data.py. An example script to prepare data for GPT2 training is: + +``` bash +python tools/preprocess_data.py \ + --input lambada.train.json \ + --output-prefix lambada \ + --vocab gpt2-vocab.json \ + --dataset-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --merge-file gpt2-merges.txt \ + --append-eod \ + --workers 32 \ + --chunk-size 25 \ +``` + + +### 框架与芯片支持情况 +| | Pytorch |Paddle|TensorFlow2| +| ---- | ---- | ---- | ---- | +| Nvidia GPU | ✅ |N/A |N/A| diff --git a/training/benchmarks/gpt2/pytorch/config/__init__.py b/training/benchmarks/gpt2/pytorch/config/__init__.py new file mode 100644 index 000000000..96e0aae70 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params diff --git a/training/benchmarks/gpt2/pytorch/config/_base.py b/training/benchmarks/gpt2/pytorch/config/_base.py new file mode 100644 index 000000000..ae9a91b57 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/config/_base.py @@ -0,0 +1,122 @@ +# Required parameters + +vendor: str = None +data_dir: str = None +name: str = "GPT2" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +use_env: bool = True +log_freq: int = 1 +device: str = None + +# ========================================================= +# train config +# ========================================================= + +seed: int = 1234 +gradient_accumulation_steps: int = 1 + +max_steps: int = 23070 +train_batch_size: int = 4 + +eval_iter_start_samples: int = 3200 +eval_interval_samples: int = 3200 + +target_acc: float = 0.60 + +# ========================================================= +# data +# ========================================================= + +train_data_prefix: str = "lambada_train_text_document" +test_data_prefix: str = "lambada_test.json" +vocab_file: str = "gpt2-vocab.json" +merge_file: str = "gpt2-merges.txt" + +# ========================================================= +# loss scale +# ========================================================= +clip_grad: float = 1.0 + +# ========================================================= +# 
optimizer & lr scheduler & weight decay +# ========================================================= +optimizer: str = "adam" +adam_beta1: float = 0.9 +adam_beta2: float = 0.999 +adam_eps: float = 1e-8 + +lr: float = 0.00015 +min_lr: float = 1e-05 +lr_warmup_fraction: float = 0.01 +lr_warmup_iters: int = 0 +lr_warmup_samples: int = 0 +lr_decay_style: str = "cosine" +lr_decay_samples: int=None + +weight_decay: float = 0.01 +start_weight_decay: float = 0.01 +end_weight_decay: float = 0.01 +weight_decay_incr_style: str = "constant" + +use_distributed_optimizer: bool = False +barrier_with_L1_time: bool = True + +# ========================================================= +# transformer +# ========================================================= + +num_layers: int = 24 +encoder_num_layers: str = 24 + +num_attention_heads: int = 16 +hidden_size: int = 1024 +ffn_hidden_size: int = 4096 +kv_channels: int = 64 +seq_length: int = 1024 +attention_dropout: float = 0.1 +hidden_dropout: float = 0.1 +transformer_impl: str = "local" +use_flash_attn: bool = False + +layernorm_epsilon: float = 1e-05 + +fp16: bool = False +bf16: bool = False + +init_method_std: float = 0.02 +import torch +params_dtype = torch.float32 +masked_softmax_fusion: bool = True +bias_gelu_fusion: bool = True +bias_dropout_fusion: bool = True +apply_residual_connection_post_layernorm: bool = False +apply_query_key_layer_scaling: bool = True +fp16_lm_cross_entropy: bool = False +fp32_residual_connection: bool = False +attention_softmax_in_fp32: bool = False + +# ========================================================= +# dataset +# ========================================================= + +tokenizer_type: str = "GPT2BPETokenizer" +num_workers: int = 2 +mmap_warmup: bool = False +padded_vocab_size: int = 0 +make_vocab_size_divisible_by: int = 128 +max_position_embeddings: int = 1024 + +reset_position_ids: bool = False +reset_attention_mask: bool = False +eod_mask_loss: bool = False + +# ========================================================= +# distributed parallel +# ========================================================= + +dist_backend: str = None +DDP_impl: str = "native" +gradient_accumulation_fusion: bool = False +use_contiguous_buffers_in_local_ddp: bool = False diff --git a/training/benchmarks/gpt2/pytorch/config/mutable_params.py b/training/benchmarks/gpt2/pytorch/config/mutable_params.py new file mode 100644 index 000000000..ecab0dd70 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/config/mutable_params.py @@ -0,0 +1,6 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'weight_decay', + "gradient_accumulation_steps", "max_steps", + "train_batch_size", "eval_iter_start_samples", "eval_interval_samples", + 'dist_backend', 'device', +] \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/__init__.py b/training/benchmarks/gpt2/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..3731c0321 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from .tokenizer import get_tokenizer diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py b/training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py new file mode 100644 index 000000000..c4ddef76f --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
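# Illustrative sketch, not part of the patch: a back-of-the-envelope parameter
# count for the GPT-2 medium configuration defined in config/_base.py above
# (hidden_size=1024, num_layers=24, ffn_hidden_size=4096, seq_length=1024).
# The padded vocab size of 50304 (50257 rounded up to a multiple of 128) is an
# assumption; the real value is computed at runtime from the vocab file.
hidden, layers, ffn, vocab, positions = 1024, 24, 4096, 50304, 1024

embeddings = vocab * hidden + positions * hidden      # token + position tables
per_layer = (
    hidden * 3 * hidden + 3 * hidden                  # fused QKV projection
    + hidden * hidden + hidden                        # attention output projection
    + hidden * ffn + ffn + ffn * hidden + hidden      # MLP h -> 4h -> h
    + 4 * hidden                                      # two LayerNorms (gamma + beta)
)
total = embeddings + layers * per_layer + 2 * hidden  # plus the final LayerNorm
print(f"approx. {total / 1e6:.0f}M parameters")       # ~355M, i.e. the "GPT-2 345M" scale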
+ +"""Dataloaders.""" + +import torch + +from mpu import get_data_parallel_rank, get_data_parallel_world_size + + +def build_data_loader(dataset, train_batch_size, num_workers, drop_last, + task_collate_fn=None): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + + # Sampler. + if torch.distributed.is_initialized(): + world_size = get_data_parallel_world_size() + rank = get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank) + else: + sampler = None + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader(dataset, + batch_size=train_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=task_collate_fn) + + return data_loader diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py b/training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py new file mode 100644 index 000000000..c164f5cdf --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py @@ -0,0 +1,263 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for OpenAI GPT.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE + # tokenizer on python 2 right now. + def lru_cache(): + return lambda func: func + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \ + list(range(ord("®"), ord("ÿ") + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + + def __init__(self, vocab_file, merges_file, errors='replace', + special_tokens=None, max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for + # capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. + """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) + for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except BaseException: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. 
""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). Running this" + " sequence through the model will result in indexing errors".format( + len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py b/training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py new file mode 100644 index 000000000..4206b43e8 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py @@ -0,0 +1,302 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. + +"""GPT style dataset.""" + +import json + +import numpy as np +import torch + +from dataloaders.indexed_dataset import make_dataset as make_indexed_dataset +from dataloaders.dataloader import build_data_loader +from dataloaders import get_tokenizer + +import config + +def build_train_test_datasets(train_num_samples, + seq_length, seed, skip_warmup, + train_data_prefix=None, + test_data_prefix=None): + """Build train, valid, and test datasets.""" + # get the tokenizer + tokenizer = get_tokenizer() + + train_dataset, test_dataset = None, None + # Single dataset. + assert train_data_prefix is not None + train_dataset = build_dataset("train", train_data_prefix, + train_num_samples, seq_length, seed, + skip_warmup) + assert test_data_prefix is not None + test_dataset = _LambadaDataset(test_data_prefix, tokenizer.eod, tokenizer, + seq_length) + return (train_dataset, test_dataset) + + +def build_dataset(dataset_name, data_prefix, num_samples, seq_length, seed, skip_warmup): + """ + Build dataset. This method is called when individual + train, valid, test datasets are provided + """ + dataset = None + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + + documents = np.arange(start=0, stop=total_num_of_documents, + step=1, dtype=np.int32) + + dataset = GPTDataset(dataset_name, data_prefix, + documents, indexed_dataset, + num_samples, seq_length, seed) + + return dataset + + +def get_indexed_dataset_(data_prefix, skip_warmup): + """Build indexed dataset.""" + indexed_dataset = make_indexed_dataset(data_prefix, + "mmap", + skip_warmup) + return indexed_dataset + + +class GPTDataset(torch.utils.data.Dataset): + + def __init__(self, name, data_prefix, documents, indexed_dataset, + num_samples, seq_length, seed): + + self.name = name + self.indexed_dataset = indexed_dataset + + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + documents, self.indexed_dataset.sizes, seq_length, seed) + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def __getitem__(self, idx): + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. 
+ sample_list.append(self.indexed_dataset.get( + self.doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + + return {'text': np.array(sample, dtype=np.int64)} + + +def _build_index_mappings(documents, sizes, seq_length, seed): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # doc-idx. + doc_idx = _build_doc_idx(documents, np_rng) + # sample-idx. + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, + tokens_per_epoch) + # shuffle-idx. + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, + sample_idx.shape[0] - 1, np_rng) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _build_doc_idx(documents, np_rng): + """Build an array with length = number-of-epochs * number-of-dcuments. + Each index is mapped to a corresponding document.""" + doc_idx = np.mgrid[0:1, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + + +def _build_sample_idx(sizes, doc_idx, seq_length, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) + + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Begining offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += (remaining_seq_length + doc_length - 1) + remaining_seq_length = 0 + else: + # Otherwise, start from the begining of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. 
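# Illustrative sketch, not part of the patch: what the (document index, offset)
# pairs built by _build_sample_idx above represent. With seq_length = 4 every
# training sample needs seq_length + 1 consecutive tokens, so a sample can start
# in one document and finish in the next. This is a simplified walk; the real
# code additionally overlaps consecutive samples by one token.
doc_sizes = [5, 3, 8]                                # tokens per shuffled document
seq_length = 4
num_samples = (sum(doc_sizes) - 1) // seq_length     # 3 samples for 16 tokens

sample_idx = [(0, 0)]                                # (doc index, offset) per sample start
doc, offset = 0, 0
for _ in range(num_samples):
    remaining = seq_length + 1
    while remaining > 0:
        available = doc_sizes[doc] - offset
        if available > remaining:
            offset += remaining                      # sample ends inside this document
            remaining = 0
        else:
            remaining -= available                   # consume the rest, move to next doc
            doc, offset = doc + 1, 0
    sample_idx.append((doc, offset))
print(sample_idx)                                    # [(0, 0), (1, 0), (2, 2), (2, 7)]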
+ sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + + +def build_train_test_data_dataloaders( + build_train_test_datasets_provider): + """XXX""" + + (train_dataloader, test_dataloader) = (None, None) + + # Number of train/valid/test samples. + train_samples = config.max_steps* config.global_batch_size + + # Build the datasets. + train_ds, test_ds = build_train_test_datasets_provider( + train_num_samples=train_samples) + + # Build dataloders. + train_dataloader = build_data_loader(train_ds, config.train_batch_size, + config.num_workers, drop_last=False) + + test_dataloader = build_data_loader(test_ds, config.train_batch_size, + config.num_workers, drop_last=False) + + # Flags to know if we need to do training/validation/testing. + config.do_train = train_dataloader is not None and config.max_steps> 0 + + return train_dataloader, test_dataloader + + +class _LambadaDataset(torch.utils.data.Dataset): + + def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False): + self.seq_len = seq_len + self.pad_idx = pad_idx + self.tokenizer = tokenizer + self.strict = strict + + self.tokens = [] + self.labels = [] + with open(path, 'r') as f: + for line in f.readlines(): + text = json.loads(line)['text'] + tokens, labels = self.get_tokens(text) + self.tokens.append(tokens) + self.labels.append(labels) + + def get_tokens(self, text): + if not self.strict: + tokens = self.tokenizer.tokenize(text) + return tokens[:-1], [tokens[-1]] + last_token = text.split()[-1] + start_idx = text.rfind(last_token) + beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip()) + last_token = self.tokenizer.tokenize(' ' + last_token) + return beginning_tokens, last_token + + def __len__(self): + return len(self.tokens) + + def __getitem__(self, idx): + tokens = self.tokens[idx] + num_tokens = len(tokens) + pad_mask = [0] * num_tokens + labels = self.labels[idx] + pad_mask += [1] * len(labels) + tokens = tokens + labels + num_tokens = len(tokens) + if num_tokens < self.seq_len + 1: + num_pad = (self.seq_len + 1 - num_tokens) + pad_mask += [0] * (num_pad) + tokens += [self.pad_idx] * num_pad + pad_mask = np.array(pad_mask[1:]) + + return {'text': np.array(tokens), 'pad_mask': pad_mask} + diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py b/training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py new file mode 100644 index 000000000..661022a39 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py @@ -0,0 +1,344 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
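# Illustrative sketch, not part of the patch: how the strict branch of
# _LambadaDataset.get_tokens() above splits a LAMBADA passage into context
# tokens and the single target word for last-word (cloze) accuracy. A plain
# whitespace tokenizer stands in for the real GPT2BPETokenizer, so the
# "token ids" here are just strings.
class WhitespaceTokenizer:
    def tokenize(self, text):
        return text.split()

def get_tokens_strict(text, tokenizer):
    last_word = text.split()[-1]
    start_idx = text.rfind(last_word)
    context = tokenizer.tokenize(text[:start_idx].strip())
    target = tokenizer.tokenize(" " + last_word)
    return context, target

ctx, tgt = get_tokens_strict("the cat sat on the mat", WhitespaceTokenizer())
print(ctx, tgt)   # ['the', 'cat', 'sat', 'on', 'the'] ['mat']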
+ + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. + +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) + + +def make_dataset(path, impl='mmap', skip_warmup=False): + if not MMapIndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: float, + 7: np.double, + 8: np.uint16 +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +def _warmup_mmap_file(path): + with open(path, 'rb') as stream: + while stream.read(100 * 1024 * 1024): + pass + + +class MMapIndexedDataset(torch.utils.data.Dataset): + class Index(object): + _HDR_MAGIC = b'MMIDIDX\x00\x00' + + @classmethod + def writer(cls, path, dtype): + class _Writer(object): + def __enter__(self): + self._file = open(path, 'wb') + + self._file.write(cls._HDR_MAGIC) + self._file.write(struct.pack(''] + + @property + def vocab_size(self): + return len(self.tokenizer.encoder) + + @property + def vocab(self): + return self.tokenizer.encoder + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + diff --git a/training/benchmarks/gpt2/pytorch/model/__init__.py b/training/benchmarks/gpt2/pytorch/model/__init__.py new file mode 100644 index 000000000..8a4b8c43c --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/__init__.py @@ -0,0 +1,13 @@ +from model.models.gpt_model import GPTModel + +def create_model(args): + # config.resume_step = 0 + + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + ) + + return None, model, diff --git a/training/benchmarks/gpt2/pytorch/model/layers/__init__.py b/training/benchmarks/gpt2/pytorch/model/layers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py b/training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py new file mode 100644 index 000000000..29222db02 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + +@torch.jit.script +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + +bias_gelu_impl = GeLUFunction.apply diff --git a/training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py b/training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py new file mode 100644 index 000000000..2f359d166 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py @@ -0,0 +1,212 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch +import torch.nn as nn + +from model.models.enums import AttnMaskType + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_upper_triang_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( + inputs, scale_t[0] + ) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_upper_triang_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. 
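# Illustrative sketch, not part of the patch: numerically comparing the tanh
# form used by bias_gelu() above with the exact erf-based GELU it approximates.
# The two agree to within a few 1e-4 over typical activation ranges, which is
# why the fused kernel can use the cheaper tanh expression.
import torch

x = torch.linspace(-6, 6, steps=1001)
gelu_exact = x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
gelu_tanh = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
print(f"max abs difference: {(gelu_exact - gelu_tanh).abs().max():.2e}")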
+ """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + import scaled_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class ScaledSoftmax(torch.autograd.Function): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_softmax_cuda.forward( + inputs, scale_t[0] + ) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + + Arguments: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. + """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super(FusedScaleMaskSoftmax, self).__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + assert not ( + self.input_in_fp16 and self.input_in_bf16 + ), "both fp16 and bf16 flags cannot be active at the same time." 
+ self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert ( + self.scale is None or softmax_in_fp32 + ), "softmax should be in fp32 when scaled" + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + # if self.is_kernel_available(mask, *input.size()): + # return self.forward_fused_softmax(input, mask) + # else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and 16 < sk <= 4096 # sk must be 16 ~ 2048 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 4096: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + + if self.attn_mask_type == AttnMaskType.causal: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + if mask is not None: + return ScaledMaskedSoftmax.apply(input, mask, scale) + else: + return ScaledSoftmax.apply(input, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + import scaled_masked_softmax_cuda + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/training/benchmarks/gpt2/pytorch/model/layers/layers.py b/training/benchmarks/gpt2/pytorch/model/layers/layers.py new file mode 100644 index 000000000..cc7cf6194 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/layers.py @@ -0,0 +1,288 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch.nn.parameter import Parameter + +from model.layers.utils import divide + + +def _initialize_affine_weight_cpu(weight, output_size, input_size, + per_partition_size, partition_dim, + init_method, stride=1, + return_master_weight=False, + *, params_dtype=torch.float32): + """Initialize affine weight for model parallel. 
+ + Build the master weight on all processes and scatter + the relevant chunk.""" + + # Initialize master weight + master_weight = torch.empty(output_size, input_size, + dtype=torch.float, + requires_grad=False) + init_method(master_weight) + master_weight = master_weight.to(dtype=params_dtype) + + # Split and copy + per_partition_per_stride_size = divide(per_partition_size, stride) + weight_list = torch.split(master_weight, per_partition_per_stride_size, + dim=partition_dim) + my_weight_list = weight_list + + with torch.no_grad(): + torch.cat(my_weight_list, dim=partition_dim, out=weight) + if return_master_weight: + return master_weight + return None + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + + Keyword Arguments: + init_method: method to initialize weights. + params_dtype + use_cpu_initialization + perform_initialization + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, *, + init_method=init.xavier_normal_, + params_dtype: torch.dtype=torch.float32, + use_cpu_initialization: bool=False, + perform_initialization: bool=True): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set the detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the vocaburaly dimension. + self.vocab_start_index = 0 + self.vocab_end_index = self.num_embeddings + + self.num_embeddings_per_partition = self.vocab_end_index - \ + self.vocab_start_index + + # Allocate weights and initialize. + self.weight = Parameter(torch.empty( + self.num_embeddings_per_partition, self.embedding_dim, + dtype=params_dtype)) + _initialize_affine_weight_cpu( + self.weight, self.num_embeddings, self.embedding_dim, + self.num_embeddings_per_partition, 0, init_method, + params_dtype=params_dtype) + self.weight.data = self.weight.data.cuda() + + def forward(self, input_): + masked_input = input_ + # Get the embeddings. + output = F.embedding(masked_input, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + return output + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + + Keyword Arguments + bias: If true, add bias + gather_output: If true, call all-gather on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: This was added to enable performance optimations where bias + can be fused with other elementwise operations. we skip + adding bias but instead return it. 
+ params_dtype: + use_cpu_initialization: + gradient_accumulation_fusion: + sequence_parallel_enabled: + """ + + def __init__(self, input_size, output_size, *, + bias=True, gather_output=True, + init_method=init.xavier_normal_, stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + params_dtype=torch.float32, + use_cpu_initialization=False, + perform_initialization=True, + gradient_accumulation_fusion=False, + sequence_parallel_enabled: bool = False, + ): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + # Divide the weight matrix along the last dimension. + self.skip_bias_add = skip_bias_add + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + self.weight = Parameter(torch.empty(self.output_size, + self.input_size, + dtype=params_dtype)) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, self.output_size, self.input_size, + self.output_size, 0, init_method, + stride=stride, return_master_weight=keep_master_weight_for_test) + if bias: + self.bias = Parameter(torch.empty( + self.output_size, dtype=params_dtype)) + + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + + def forward(self, input_): + """Forward of ColumnParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + Returns: + - output + - bias + """ + bias = self.bias if not self.skip_bias_add else None + output = F.linear(input_, self.weight, bias) + output_bias = self.bias if self.skip_bias_add else None + return output, output_bias + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + + Keyword Arguments: + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: This was added to enable performance optimization where bias + can be fused with other elementwise operations. We skip + adding bias but instead return it. + params_dtype: + use_cpu_initialization: + perform_initialization: + gradient_accumulation_fusion: + sequence_parallel_enabled: + """ + + def __init__(self, input_size, output_size, *, + bias=True, input_is_parallel=False, + init_method=init.xavier_normal_, stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + params_dtype=torch.float32, + use_cpu_initialization=False, + perform_initialization=True, + gradient_accumulation_fusion=False, + sequence_parallel_enabled: bool = False, + ): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + # Divide the weight matrix along the last dimension. 
+ self.skip_bias_add = skip_bias_add + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + self.weight = Parameter(torch.empty(self.output_size, + self.input_size, + dtype=params_dtype)) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, self.output_size, self.input_size, + self.input_size, 1, init_method, + stride=stride, return_master_weight=keep_master_weight_for_test, + params_dtype=params_dtype) + + if bias: + self.bias = Parameter(torch.empty(self.output_size, + dtype=params_dtype)) + setattr(self.bias, 'sequence_parallel', sequence_parallel_enabled) + + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + + + def forward(self, input_): + """Forward of RowParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + Returns: + - output + - bias + """ + if not self.skip_bias_add: + output = F.linear(input_, self.weight, self.bias) + output_bias = None + else: + output = F.linear(input_, self.weight, None) + output_bias = self.bias + return output, output_bias diff --git a/training/benchmarks/gpt2/pytorch/model/layers/transformer.py b/training/benchmarks/gpt2/pytorch/model/layers/transformer.py new file mode 100644 index 000000000..ded57e471 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/transformer.py @@ -0,0 +1,985 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Transformer.""" +import math +from typing import Optional, List +from contextlib import nullcontext + +import numpy as np +import torch +import torch.nn.functional as F +from torch.nn import LayerNorm + +from model.models.module import MegatronModule +from model.layers import layers +from model.models.enums import AttnMaskType, LayerType, AttnType +from model.layers.fused_softmax import FusedScaleMaskSoftmax +from model.layers.fused_bias_gelu import bias_gelu_impl +from model.layers.utils import attention_mask_func +import config + +rearrange = None + +flash_attn_unpadded_func = None + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """ Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + +def _kernel_make_viewless_tensor(inp, requires_grad): + '''Make a viewless tensor. + + View tensors have the undesirable side-affect of retaining a reference + to the originally-viewed tensor, even after manually setting the '.data' + field. This method creates a new tensor that links to the old tensor's + data, without linking the viewed tensor, referenced via the '._base' + field. 
+ ''' + out = torch.empty( + (1,), + dtype = inp.dtype, + device = inp.device, + requires_grad = requires_grad, + ) + out.data = inp.data + return out + +class MakeViewlessTensor(torch.autograd.Function): + ''' + Autograd function to make a viewless tensor. + + This function should be used in cases where the computation graph needs + to be propagated, but we only want a viewless tensor (e.g., + ParallelTransformer's hidden_states). Call this function by passing + 'keep_graph = True' to 'make_viewless_tensor()'. + ''' + @staticmethod + def forward(ctx, inp, requires_grad): + return _kernel_make_viewless_tensor(inp, requires_grad) + @staticmethod + def backward(ctx, grad_output): + return grad_output, None + +def make_viewless_tensor(inp, requires_grad, keep_graph): + ''' + Entry-point for creating viewless tensors. + + This method should be used, rather than calling 'MakeViewlessTensor' + or '_kernel_make_viewless_tensor' directly. This method acts as a + switch for determining if an autograd function or a regular method + should be used to create the tensor. + ''' + + # return tensor as-is, if not a 'view' + if inp._base is None: + return inp + + # create viewless tensor + if keep_graph: + return MakeViewlessTensor.apply(inp, requires_grad) + else: + return _kernel_make_viewless_tensor(inp, requires_grad) + + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +class DropPath(MegatronModule): + """Drop paths (Stochastic Depth) per sample + (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=0.): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_state): + if self.drop_prob == 0. or not self.training: + return hidden_state + keep_prob = 1 - self.drop_prob + # work with diff dim tensors, not just 2D ConvNets + # hidden_state: [s, b, h] + shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2) + random_tensor = keep_prob + \ + torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device) + random_tensor.floor_() # binarize + output = hidden_state.div(keep_prob) * random_tensor + return output + +def _args_to_kwargs(): + + common_kwargs = { + "params_dtype": config.params_dtype, + "use_cpu_initialization": True, + "perform_initialization": True, + "gradient_accumulation_fusion": False, + "sequence_parallel_enabled": False, + } + return common_kwargs + +class ParallelMLP(MegatronModule): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, init_method, output_layer_init_method): + super(ParallelMLP, self).__init__() + + # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = layers.ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + gather_output=False, + init_method=init_method, + skip_bias_add=True, + **_args_to_kwargs()) + + self.bias_gelu_fusion = False + self.activation_func = None + + self.bias_gelu_fusion = config.bias_gelu_fusion + self.activation_func = F.gelu + + # Project back to h. 
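# Illustrative sketch, not part of the patch: with a single model-parallel
# partition (as in this port), the ParallelMLP above computes the same function
# as a plain h -> 4h -> GELU -> h feed-forward block. Sizes follow
# config/_base.py (hidden_size=1024, ffn_hidden_size=4096).
import torch
from torch import nn

mlp = nn.Sequential(
    nn.Linear(1024, 4096),           # dense_h_to_4h
    nn.GELU(approximate="tanh"),     # matches the fused bias-gelu approximation
    nn.Linear(4096, 1024),           # dense_4h_to_h
)
out = mlp(torch.randn(8, 4, 1024))   # [s, b, h] in, same shape out
print(out.shape)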
+ self.dense_4h_to_h = layers.RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + **_args_to_kwargs()) + + def forward(self, hidden_states): + + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) + + if self.bias_gelu_fusion: + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel + bias_parallel) + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + return output, output_bias + +class CoreAttention(MegatronModule): + + def __init__(self, layer_number, + attn_mask_type=AttnMaskType.padding): + super(CoreAttention, self).__init__() + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.sequence_parallel = False + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + self.fp16, self.bf16, + self.attn_mask_type, + config.masked_softmax_fusion, + attention_mask_func, + self.attention_softmax_in_fp32, + coeff) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, + value_layer, attention_mask): + + # =================================== + # Raw attention scores. [b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.zeros( + (output_size[0]*output_size[1], output_size[2], output_size[3]), + dtype=query_layer.dtype, device=key_layer.device) + + # Raw attention scores. 
[b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, alpha=(1.0/self.norm_factor)) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class FlashSelfAttention(torch.nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, + device=None, dtype=None): + super().__init__() + assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, ' + 'e.g., with pip install flash-attn') + assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + self.causal = causal + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, q, k, v): + """Implements the multihead softmax attention. + Arguments + --------- + q, k, v: The tensor containing the query, key, and value. (B, S, H, D) + """ + + assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v))) + assert all((i.is_cuda for i in (q,k,v))) + + batch_size, seqlen_q = q.shape[0], q.shape[1] + seqlen_k = k.shape[1] + + q, k, v = [rearrange(x, 'b s ... 
-> (b s) ...') for x in [q, k, v]] + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, + device=q.device) + + if self.training: + # during training q,k,v always have same seqlen + assert seqlen_k == seqlen_q + + is_causal = self.causal + cu_seqlens_k = cu_seqlens_q + else: + # turn off FA causal mask after first inference autoregressive iteration + # only on first autoregressive step q,k,v have same seqlen + is_causal = seqlen_q == seqlen_k + cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, + device=q.device) + self.dropout_p = 0 + + output = flash_attn_unpadded_func( + q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, + self.dropout_p, + softmax_scale=self.softmax_scale, causal=is_causal + ) + + output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + return output + + +class ParallelAttention(MegatronModule): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, init_method, + output_layer_init_method, layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.padding): + super(ParallelAttention, self).__init__() + self.layer_number = max(1, layer_number) + self.attention_type = attention_type + self.attn_mask_type = attn_mask_type + self.params_dtype = config.params_dtype + self.sequence_parallel = False + + self.use_flash_attn = config.use_flash_attn \ + and attention_type == AttnType.self_attn \ + and self.attn_mask_type == AttnMaskType.causal + if self.use_flash_attn: + if flash_attn_unpadded_func is None: + raise ImportError('FlashAttention is not installed, please install with ' + 'pip install flash-attn') + assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' + 'self-attention for now') + assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' + 'supports causal mask for now') + if rearrange is None: + raise ImportError('einops is not installed, please install with pip install einops') + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + # Strided linear layer. + if attention_type == AttnType.self_attn: + self.query_key_value = layers.ColumnParallelLinear( + config.hidden_size, + 3 * projection_size, + gather_output=False, + init_method=init_method, + **_args_to_kwargs()) + else: + assert attention_type == AttnType.cross_attn + self.query = layers.ColumnParallelLinear( + config.hidden_size, + projection_size, + gather_output=False, + init_method=init_method, + **_args_to_kwargs()) + + + self.key_value = layers.ColumnParallelLinear( + config.hidden_size, + 2 * projection_size, + gather_output=False, + init_method=init_method, + **_args_to_kwargs()) + + self.core_attention = CoreAttention(self.layer_number, + self.attn_mask_type) + + if self.use_flash_attn: + self.core_attention_flash = FlashSelfAttention( + causal=True, attention_dropout=config.attention_dropout + ) + + # Output. 
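# Illustrative sketch, not part of the patch: the core computation CoreAttention
# above performs with baddbmm/bmm, written with einsum for readability. Inputs
# follow the [sq, b, np, hn] layout used throughout the file; query-key layer
# scaling and attention dropout from the real code are omitted here.
import torch

def reference_attention(q, k, v, causal_mask, norm_factor):
    # q, k, v: [s, b, np, hn]; causal_mask: [s, s] bool, True = masked out
    scores = torch.einsum("sbnh,tbnh->bnst", q, k) / norm_factor   # [b, np, sq, sk]
    scores = scores.masked_fill(causal_mask, float("-inf"))
    probs = torch.softmax(scores, dim=-1)
    context = torch.einsum("bnst,tbnh->sbnh", probs, v)            # [sq, b, np, hn]
    s, b, n, h = context.shape
    return context.reshape(s, b, n * h)                            # [sq, b, hp]

s, b, n, h = 5, 2, 16, 64
q = k = v = torch.randn(s, b, n, h)
mask = torch.triu(torch.ones(s, s, dtype=torch.bool), diagonal=1)
print(reference_attention(q, k, v, mask, norm_factor=h ** 0.5).shape)   # [5, 2, 1024]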
+ self.dense = layers.RowParallelLinear( + projection_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + **_args_to_kwargs()) + + def _allocate_memory(self, inference_max_sequence_len, batch_size): + return torch.empty( + inference_max_sequence_len, + batch_size, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + dtype=self.params_dtype, + device=torch.cuda.current_device()) + + def forward(self, hidden_states, attention_mask, + encoder_output=None, inference_params=None, + rotary_pos_emb=None): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + if inference_params: + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_len = inference_params.max_sequence_len + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_value_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, inference_value_memory) + else: + inference_key_memory, inference_value_memory = \ + inference_params.key_value_memory_dict[self.layer_number] + + # ===================== + # Query, Key, and Value + # ===================== + + if self.attention_type == AttnType.self_attn: + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, + key_layer, + value_layer) = torch.split(mixed_x_layer, mixed_x_layer.size()[-1] // 3, mixed_x_layer.dim() - 1) + + else: + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv_layer, _ = self.key_value(encoder_output) + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head) + mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key_layer, + value_layer) = torch.split(mixed_kv_layer, mixed_kv_layer.size()[-1] // 2, mixed_kv_layer.dim() - 1) + + # Attention head [sq, b, h] --> [sq, b, hp] + query_layer, _ = self.query(hidden_states) + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + query_layer = query_layer.view(*new_tensor_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + if inference_params: + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] 
= key_layer + inference_value_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[ + :sequence_end, batch_start:batch_end, ...] + value_layer = inference_value_memory[ + :sequence_end, batch_start:batch_end, ...] + + # ================================== + # core attention computation + # ================================== + + if not self.use_flash_attn: + context_layer = self.core_attention( + query_layer, key_layer, value_layer, attention_mask) + else: + q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() + for x in (query_layer, key_layer, value_layer)] + if not self.sequence_parallel: + context_layer = self.core_attention_flash(q, k, v) + else: + context_layer = self.core_attention_flash(q, k, v) + context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.dense(context_layer) + return output, bias + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x: torch.Tensor, + bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x: torch.Tensor, + bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +class ParallelTransformerLayer(MegatronModule): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, init_method, output_layer_init_method, + layer_number, layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.): + + super(ParallelTransformerLayer, self).__init__() + self.layer_number = layer_number + self.layer_type = layer_type + + self.apply_residual_connection_post_layernorm \ + = config.apply_residual_connection_post_layernorm + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + + # Layernorm on the input data. + # self.input_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.input_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + # eps=config.layernorm_epsilon, + # no_persist_layer_norm=config.no_persist_layer_norm, + # sequence_parallel=config.sequence_parallel) + + # Self attention. 
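# Illustrative aside (not part of the patch): the bias+dropout+add helper defined
# above, checked standalone with p=0 so the result is deterministic and equals
# residual + x + bias.
import torch

def bias_dropout_add(x, bias, residual, prob, training):
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    return residual + out

x, bias, residual = torch.randn(4, 3), torch.randn(3), torch.randn(4, 3)
out = bias_dropout_add(x, bias.expand_as(residual), residual, prob=0.0, training=True)
assert torch.allclose(out, residual + x + bias)
# (end of aside)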
+ self.self_attention = ParallelAttention( + init_method, + output_layer_init_method, + layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=self_attn_mask_type) + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None + + # Layernorm on the attention output + # self.post_attention_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.post_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + # eps=config.layernorm_epsilon, + # no_persist_layer_norm=config.no_persist_layer_norm, + # sequence_parallel=config.sequence_parallel) + + if self.layer_type == LayerType.decoder: + self.inter_attention = ParallelAttention( + init_method, + output_layer_init_method, + layer_number, + attention_type=AttnType.cross_attn) + # Layernorm on the attention output. + # self.post_inter_attention_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.post_inter_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + # eps=config.layernorm_epsilon, + # no_persist_layer_norm=config.no_persist_layer_norm, + # sequence_parallel=config.sequence_parallel) + + # MLP + self.mlp = ParallelMLP(init_method, output_layer_init_method) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = \ + nullcontext if use_nvfuser else torch.enable_grad + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, attention_bias = \ + self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + if self.drop_path is None: + # jit scripting for a nn.module (with dropout) is not + # trigerring the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. + if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, + attention_bias.expand_as(residual), + residual, + self.hidden_dropout) + else: + out = torch.nn.functional.dropout(attention_output + attention_bias, + p=self.hidden_dropout, + training=self.training) + layernorm_input = residual + self.drop_path(out) + + # Layer norm post the self attention. 
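# Illustrative aside (not part of the patch): the version gate above picks the
# execution context for the fused bias+dropout+add path -- nullcontext on
# torch >= 1.10, otherwise torch.enable_grad. A quick way to see what it resolves
# to on the current install:
import torch
from contextlib import nullcontext

major, minor = (int(v) for v in torch.__version__.split('.')[:2])
handler = nullcontext if (major > 1 or (major == 1 and minor >= 10)) else torch.enable_grad
print(torch.__version__, handler)
# (end of aside)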
+ layernorm_output = self.post_attention_layernorm(layernorm_input) + + if self.layer_type == LayerType.decoder: + attention_output, attention_bias = \ + self.inter_attention(layernorm_output, + enc_dec_attn_mask, + encoder_output=encoder_output) + # residual connection + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, + attention_bias.expand_as(residual), + residual, + self.hidden_dropout) + + # Layer norm post the decoder attention + layernorm_output = self.post_inter_attention_layernorm(layernorm_input) + + # MLP. + mlp_output, mlp_bias = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + if self.drop_path is None: + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func( + mlp_output, + mlp_bias.expand_as(residual), + residual, + self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor(inp = output, + requires_grad = output.requires_grad, + keep_graph = True) + + else: + out = torch.nn.functional.dropout(mlp_output + mlp_bias, + p=self.hidden_dropout, + training=self.training) + output = residual + self.drop_path(out) + + return output + + +class NoopTransformerLayer(MegatronModule): + """A single 'no-op' transformer layer. + + The sole purpose of this layer is for when a standalone embedding layer + is used (i.e., args.standalone_embedding_stage == True). In this case, + zero transformer layers are assigned when pipeline rank == 0. Additionally, + when virtual pipeline rank >= 1, zero total model parameters are created + (virtual rank 0 contains the input embedding). This results in the model's + input and output tensors being the same, which causes an error when + performing certain memory optimiations on the output tensor (e.g., + deallocating it). Thus, this layer disconnects the input from the output + via a clone. Since ranks containing a no-op layer are generally under- + utilized (both compute and memory), there's no worry of any performance + degredation. 
+ """ + + def __init__(self, layer_number): + super().__init__() + self.layer_number = layer_number + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + return hidden_states.clone() + + +def _get_num_layers(is_decoder=False): + """Compute the number of transformer layers resident on the current rank.""" + if not is_decoder: + num_layers = config.encoder_num_layers + else: + num_layers = config.decoder_num_layers + return num_layers + + +class ParallelTransformer(MegatronModule): + """Transformer class.""" + + def __init__(self, init_method, output_layer_init_method, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, post_process=True, + drop_path_rate=0.0): + super(ParallelTransformer, self).__init__() + + self.layer_type = layer_type + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + self.input_tensor = None + self.drop_path_rate = drop_path_rate + self.transformer_impl = config.transformer_impl + + self.sequence_parallel = False + + self.microbatch_count = 0 + + # Number of layers. + self.num_layers = _get_num_layers( + layer_type == LayerType.decoder) + + self.drop_path_rates = [rate.item() for rate in torch.linspace(0, self.drop_path_rate, config.num_layers)] + + # Transformer layers. + def build_layer(layer_number): + return ParallelTransformerLayer( + init_method, + output_layer_init_method, + layer_number, + layer_type=layer_type, + self_attn_mask_type=self_attn_mask_type, + drop_path_rate=self.drop_path_rates[layer_number - 1]) + + offset = 0 + + if self.num_layers == 0: + # When a standalone embedding stage is used (e.g., + # args.standalone_embedding_stage == True), virtual pipeline ranks + # on pipeline rank 0 will have zero transformer layers assigned to + # them. This results in the model's input and output tensors to be + # the same, which will cause failure for certain output tensor + # optimizations (e.g., pipeline output deallocation). To remedy + # this, we assign a 'no-op' layer on these ranks, which will + # disconnect the input tensor from the output tensor. + self.num_layers = 1 + self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ]) + else: + self.layers = torch.nn.ModuleList( + [build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + # self.final_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.final_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + #no_persist_layer_norm=config.no_persist_layer_norm, + #sequence_parallel=config.sequence_parallel) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. 
This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + # hidden_states: [s, b, h] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + hidden_states, + requires_grad=True, + keep_graph=True, + ) + + rng_context = nullcontext() + + with rng_context: + # The fp8_autocast context manager is a no-op when enabled=True + # The if...else serves to short circuit name resolution for fp8_autocast + with nullcontext(): + + # Forward pass. + forward_kwargs = { + 'encoder_output': encoder_output, + 'enc_dec_attn_mask': enc_dec_attn_mask, + 'inference_params': inference_params, + } + + for index in range(self.num_layers): + layer = self._get_layer(index) + hidden_states = layer( + hidden_states, + attention_mask, + **forward_kwargs) + + # Skip counter update for eval and activation checkpointing + if torch.is_grad_enabled() and self.training: + self.microbatch_count += 1 + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/model/layers/utils.py b/training/benchmarks/gpt2/pytorch/model/layers/utils.py new file mode 100644 index 000000000..c1e5e39a8 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/utils.py @@ -0,0 +1,37 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
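# Illustrative aside (not part of the patch): the per-layer drop-path rates built in
# the ParallelTransformer constructor above form a linear ramp from 0 to
# drop_path_rate over num_layers entries. Values below are made up; the patch reads
# them from config.
import torch

drop_path_rate, num_layers = 0.1, 5
rates = [r.item() for r in torch.linspace(0, drop_path_rate, num_layers)]
print(rates)   # ~[0.0, 0.025, 0.05, 0.075, 0.1]
# (end of aside)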
+ +"""Utilities for layers.""" + +import math + +import torch + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * + (1.0 + 0.044715 * x * x))) +def openai_gelu(x): + return gelu_impl(x) + +#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@torch.jit.script +def erf_gelu(x): + return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, "{} is not divisible by {}".format( + numerator, denominator + ) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/model/losses/__init__.py b/training/benchmarks/gpt2/pytorch/model/losses/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py b/training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py new file mode 100755 index 000000000..0722a9cde --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py @@ -0,0 +1,32 @@ +import torch + + +def cross_entropy(outputs, target): + """ + Compute the cross entropy loss of output and target. + + para: outputs, [b, s, vocab_size] + target, [b, s] + return: loss, [b, s] + """ + + logits = outputs.clone() + # logits = outputs + logits_max = torch.max(logits, dim=-1)[0] + + # Subtract the maximum value. + logits.sub_(logits_max.unsqueeze(dim=-1)) + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = logits.exp() + sum_exp_logits = exp_logits.sum(dim=-1) + + logits_2d = logits.view(-1, logits.size()[-1]) + target_1d = target.view(-1) + arange_1d = torch.arange(start=0, + end=logits_2d.size()[0], + device=logits_2d.device) + predit_ligits_1d = logits_2d[arange_1d, target_1d] + predit_ligits = predit_ligits_1d.view_as(target) + + loss = torch.log(sum_exp_logits) - predit_ligits + return loss diff --git a/training/benchmarks/gpt2/pytorch/model/models/__init__.py b/training/benchmarks/gpt2/pytorch/model/models/__init__.py new file mode 100644 index 000000000..a00a56fac --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +try: + from ..layers.fused_layer_norm import MixedFusedLayerNorm as LayerNorm +except Exception: + # Make LayerNorm has the same parameters as FusedLayerNorm + from torch.nn import LayerNorm as TorchLayerNorm + class LayerNorm(TorchLayerNorm): + """Inherit from torch.nn.LayerNorm but eliminate extra kwargs""" + def __init__(self, normalized_shape, eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_layernorm_1p=False): + super().__init__( + normalized_shape, eps = eps) + +from .utils import RMSNorm +from .gpt_model import GPTModel +from .language_model import get_language_model +from .module import Float16Module diff --git a/training/benchmarks/gpt2/pytorch/model/models/enums.py b/training/benchmarks/gpt2/pytorch/model/models/enums.py new file mode 100644 index 000000000..2ea483051 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/enums.py @@ -0,0 +1,25 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import enum + +class LayerType(enum.Enum): + encoder = 1 + decoder = 2 + retro_encoder = 3 + retro_decoder = 4 + retro_decoder_with_retriever = 5 + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + +# For backward compatibility with old model checkpoints +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + retro_encoder = 3 + retro_decoder = 4 diff --git a/training/benchmarks/gpt2/pytorch/model/models/gpt_model.py b/training/benchmarks/gpt2/pytorch/model/models/gpt_model.py new file mode 100644 index 000000000..d43ee2f22 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/gpt_model.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""GPT-2 model.""" + +import torch + +import config +from model.models.module import MegatronModule +from model.losses.cross_entropy import cross_entropy +from model.models.enums import AttnMaskType +from model.models.language_model import get_language_model +from model.models.utils import init_method_normal,scaled_init_method_normal +import math + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def post_language_model_processing(lm_output, labels, logit_weights, fp16_lm_cross_entropy): + # Output. 
Format [s b h] + output = torch.nn.functional.linear(lm_output, logit_weights, None) + + if labels is None: + # [s b h] => [b s h] + return output.transpose(0,1).contiguous() + else: + # [b s] => [s b] + labels = labels.transpose(0,1).contiguous() + if fp16_lm_cross_entropy: + assert output.dtype == torch.half + loss = cross_entropy(output, labels) + else: + loss = cross_entropy(output.float(), labels) + # [s b] => [b, s] + loss = loss.transpose(0,1).contiguous() + return loss + + +class GPTModel(MegatronModule): + """GPT-2 Language model.""" + + def __init__(self, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True): + super(GPTModel, self).__init__() + + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = config.fp16_lm_cross_entropy + + self.language_model, self._language_model_key = get_language_model( + num_tokentypes=num_tokentypes, + add_pooler=False, + encoder_attn_mask_type=AttnMaskType.causal, + init_method=init_method_normal(config.init_method_std), + scaled_init_method=scaled_init_method_normal(config.init_method_std, + config.num_layers), + pre_process=self.pre_process, + post_process=self.post_process) + + self.initialize_word_embeddings(init_method_normal) + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, input_ids, position_ids, attention_mask, labels=None, + tokentype_ids=None, inference_params=None): + + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + inference_params=inference_params) + + return post_language_model_processing( + lm_output, labels, + self.word_embeddings_weight(), + self.fp16_lm_cross_entropy) + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Load word_embeddings. + if self.post_process and not self.pre_process: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) + if self._language_model_key in state_dict: + state_dict = state_dict[self._language_model_key] + self.language_model.load_state_dict(state_dict, strict=strict) + + diff --git a/training/benchmarks/gpt2/pytorch/model/models/language_model.py b/training/benchmarks/gpt2/pytorch/model/models/language_model.py new file mode 100644 index 000000000..d61b7cb19 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/language_model.py @@ -0,0 +1,502 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
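# Illustrative aside (not part of the patch): shape walk-through of
# post_language_model_processing above. lm_output is [s, b, h]; the tied
# word-embedding weight acts as a [vocab, h] output head; with labels=None the
# result is transposed to [b, s, vocab]. Sizes below are made up.
import torch

s, b, h, vocab = 4, 2, 8, 50
lm_output = torch.randn(s, b, h)
logit_weights = torch.randn(vocab, h)
output = torch.nn.functional.linear(lm_output, logit_weights)    # [s, b, vocab]
print(output.shape, output.transpose(0, 1).contiguous().shape)   # -> [b, s, vocab]
# (end of aside)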
+ +"""Transformer based language model.""" + +import torch +import torch.nn.functional as F + +from .enums import AttnMaskType, LayerType, ModelType +from .module import MegatronModule +from ..layers.transformer import ParallelTransformer +from model.layers.layers import VocabParallelEmbedding +from .utils import get_linear_layer +from .utils import init_method_normal, scaled_init_method_normal + +import config + +def get_language_model(num_tokentypes, add_pooler, + encoder_attn_mask_type, init_method=None, + scaled_init_method=None, add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + pre_process=True, post_process=True): + """Build language model and return along with the key to save.""" + + if init_method is None: + init_method = init_method_normal(config.init_method_std) + + if scaled_init_method is None: + scaled_init_method = scaled_init_method_normal(config.init_method_std, + config.num_layers) + + # Language model. + language_model = TransformerLanguageModel( + init_method, + scaled_init_method, + encoder_attn_mask_type, + num_tokentypes=num_tokentypes, + add_encoder=add_encoder, + add_decoder=add_decoder, + decoder_attn_mask_type=decoder_attn_mask_type, + add_pooler=add_pooler, + pre_process=pre_process, + post_process=post_process + ) + # key used for checkpoints. + language_model_key = 'language_model' + + return language_model, language_model_key + + +class Pooler(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. + + Arguments: + hidden_size: hidden size + init_method: weight initialization method for the linear layer. + bias is set to zero. + """ + + def __init__(self, hidden_size, init_method): + super(Pooler, self).__init__() + self.dense = get_linear_layer(hidden_size, hidden_size, init_method) + + def forward(self, hidden_states, sequence_index=0): + # hidden_states: [s, b, h] + # sequence_index: index of the token to pool. + + # gather data along sequence dimensions + # same pooler is run on all tensor parallel nodes + + pooled = hidden_states[sequence_index, :, :] + pooled = self.dense(pooled) + pooled = torch.tanh(pooled) + return pooled + + +class Embedding(MegatronModule): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + init_method, + num_tokentypes=0): + super(Embedding, self).__init__() + + self.hidden_size = hidden_size + self.init_method = init_method + self.num_tokentypes = num_tokentypes + + # Word embeddings (parallel). + self.word_embeddings = VocabParallelEmbedding( + vocab_size, self.hidden_size, + init_method=self.init_method, + params_dtype=config.params_dtype, + use_cpu_initialization=True, + perform_initialization=True + ) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding (serial). + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, self.hidden_size) + self._position_embeddings_key = 'position_embeddings' + # Initialize the position embeddings. 
+ self.init_method(self.position_embeddings.weight) + + # Token type embedding. + # Add this as an optional field that can be added through + # method call so we can load a pretrain model without + # token types and add them as needed. + self._tokentype_embeddings_key = 'tokentype_embeddings' + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + self.fp32_residual_connection = config.fp32_residual_connection + self.sequence_parallel = False + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + if self.num_tokentypes > 0: + self.tokentype_embeddings.weight.data.fill_(0) + self.tokentype_embeddings.weight.shared = True + + def add_tokentype_embeddings(self, num_tokentypes): + """Add token-type embedding. This function is provided so we can add + token-type embeddings in case the pretrained model does not have it. + This allows us to load the model normally and then add this embedding. + """ + if self.tokentype_embeddings is not None: + raise Exception('tokentype embeddings is already initialized') + if torch.distributed.get_rank() == 0: + print('adding embedding for {} tokentypes'.format(num_tokentypes), + flush=True) + self.num_tokentypes = num_tokentypes + self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + + def forward(self, input_ids, position_ids, tokentype_ids=None): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) + else: + assert self.tokentype_embeddings is None + + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + embeddings = self.embedding_dropout(embeddings) + + return embeddings + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._word_embeddings_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.num_tokentypes > 0: + state_dict_[self._tokentype_embeddings_key] \ + = self.tokentype_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Word embedding. + if self._word_embeddings_key in state_dict: + state_dict_ = state_dict[self._word_embeddings_key] + else: + # for backward compatibility. 
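# Illustrative aside (not part of the patch): the embedding sum in Embedding.forward
# above -- word plus position embeddings in [b, s, h], then transposed to the
# [s, b, h] layout the transformer expects. Sizes below are made up.
import torch

b, s, h, vocab, max_pos = 2, 5, 8, 100, 16
word_emb = torch.nn.Embedding(vocab, h)
pos_emb = torch.nn.Embedding(max_pos, h)
input_ids = torch.randint(0, vocab, (b, s))
position_ids = torch.arange(s).unsqueeze(0).expand(b, s)
embeddings = (word_emb(input_ids) + pos_emb(position_ids)).transpose(0, 1).contiguous()
print(embeddings.shape)   # torch.Size([5, 2, 8])
# (end of aside)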
+ state_dict_ = {} + for key in state_dict.keys(): + if 'word_embeddings' in key: + state_dict_[key.split('word_embeddings.')[1]] \ + = state_dict[key] + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + + # Position embedding. + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) + + # Tokentype embedding. + if self.num_tokentypes > 0: + state_dict_ = {} + if self._tokentype_embeddings_key in state_dict: + state_dict_ = state_dict[self._tokentype_embeddings_key] + else: + # for backward compatibility. + for key in state_dict.keys(): + if 'tokentype_embeddings' in key: + state_dict_[key.split('tokentype_embeddings.')[1]] \ + = state_dict[key] + if len(state_dict_.keys()) > 0: + self.tokentype_embeddings.load_state_dict(state_dict_, + strict=strict) + else: + print('***WARNING*** expected tokentype embeddings in the ' + 'checkpoint but could not find it', flush=True) + + +class TransformerLanguageModel(MegatronModule): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + init_method, + output_layer_init_method, + encoder_attn_mask_type, + num_tokentypes=0, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + add_pooler=False, + pre_process=True, + post_process=True): + super(TransformerLanguageModel, self).__init__() + + self.pre_process = pre_process + self.post_process = post_process + self.hidden_size = config.hidden_size + self.num_tokentypes = num_tokentypes + self.init_method = init_method + self.add_encoder = add_encoder + self.encoder_attn_mask_type = encoder_attn_mask_type + self.add_decoder = add_decoder + self.decoder_attn_mask_type = decoder_attn_mask_type + self.add_pooler = add_pooler + self.encoder_hidden_state = None + + # Embeddings. + if self.pre_process: + self.embedding = Embedding(self.hidden_size, + config.padded_vocab_size, + config.max_position_embeddings, + config.hidden_dropout, + self.init_method, + self.num_tokentypes) + self._embedding_key = 'embedding' + + # Transformer. + # Encoder (usually set to True, False if part of an encoder-decoder + # architecture and in encoder-only stage). + if self.add_encoder: + self.encoder = ParallelTransformer( + self.init_method, + output_layer_init_method, + self_attn_mask_type=self.encoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process + ) + self._encoder_key = 'encoder' + else: + self.encoder = None + + # Decoder (usually set to False, True if part of an encoder-decoder + # architecture and in decoder-only stage). + if self.add_decoder: + self.decoder = ParallelTransformer( + self.init_method, + output_layer_init_method, + layer_type=LayerType.decoder, + self_attn_mask_type=self.decoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process) + self._decoder_key = 'decoder' + else: + self.decoder = None + + if self.post_process: + # Pooler. 
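# Illustrative aside (not part of the patch): the backward-compatibility branch in
# Embedding.load_state_dict above strips everything up to 'word_embeddings.' from
# old checkpoint keys. A minimal reproduction with made-up keys:
old = {'module.word_embeddings.weight': 'W', 'module.other.weight': 'X'}
new = {k.split('word_embeddings.')[1]: v for k, v in old.items() if 'word_embeddings' in k}
print(new)   # {'weight': 'W'}
# (end of aside)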
+ if self.add_pooler: + self.pooler = Pooler(self.hidden_size, self.init_method) + self._pooler_key = 'pooler' + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + if self.add_encoder and self.add_decoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with both encoder and decoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with only encoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_decoder: + if len(input_tensor) == 2: + self.decoder.set_input_tensor(input_tensor[0]) + self.encoder_hidden_state = input_tensor[1] + elif len(input_tensor) == 1: + self.decoder.set_input_tensor(None) + self.encoder_hidden_state = input_tensor[0] + else: + raise Exception('input_tensor must have either length 1 or 2') + else: + raise Exception('Stage must have at least either encoder or decoder') + + def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, + dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, + enc_dec_attn_mask=None, tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, output_enc_hidden=False): + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding(enc_input_ids, enc_position_ids, + tokentype_ids=tokentype_ids) + else: + encoder_input = None + + # Run encoder. + if enc_hidden_states is None: + if self.encoder is not None: + encoder_output = self.encoder( + encoder_input, + enc_attn_mask, + inference_params=inference_params) + else: + encoder_output = self.encoder_hidden_state + else: + encoder_output = enc_hidden_states.to(encoder_input.dtype) + + if self.post_process: + if self.add_pooler: + pooled_output = self.pooler(encoder_output, + pooling_sequence_index) + + # output_enc_hidden refers to when we just need the encoder's + # output. For example, it is helpful to compute + # similarity between two sequences by average pooling + if not self.add_decoder or output_enc_hidden: + if self.add_pooler and self.post_process: + return encoder_output, pooled_output + else: + return encoder_output + + # Decoder embedding. + if self.pre_process: + decoder_input = self.embedding(dec_input_ids, + dec_position_ids) + else: + decoder_input = None + + # Run decoder. 
+ decoder_output = self.decoder( + decoder_input, + dec_attn_mask, + encoder_output=encoder_output, + enc_dec_attn_mask=enc_dec_attn_mask, + inference_params=inference_params) + + if self.add_pooler and self.post_process: + return decoder_output, encoder_output, pooled_output + else: + return decoder_output, encoder_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + if self.pre_process: + state_dict_[self._embedding_key] \ + = self.embedding.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.add_encoder: + state_dict_[self._encoder_key] \ + = self.encoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + if self.add_pooler: + state_dict_[self._pooler_key] \ + = self.pooler.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.add_decoder: + state_dict_[self._decoder_key] \ + = self.decoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Embedding. + if self.pre_process: + if self._embedding_key in state_dict: + state_dict_ = state_dict[self._embedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.embedding.load_state_dict(state_dict_, strict=strict) + + # Encoder. + if self.add_encoder: + if self._encoder_key in state_dict: + state_dict_ = state_dict[self._encoder_key] + # For backward compatibility. + elif 'transformer' in state_dict: + state_dict_ = state_dict['transformer'] + else: + # For backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + + # For backward compatibility. + state_dict_self_attention = {} + for key in state_dict_.keys(): + if '.attention.' in key: + state_dict_self_attention[key.replace(".attention.", + ".self_attention.")] = state_dict_[key] + else: + state_dict_self_attention[key] = state_dict_[key] + state_dict_ = state_dict_self_attention + + self.encoder.load_state_dict(state_dict_, strict=strict) + + # Pooler. + if self.post_process: + if self.add_pooler: + assert 'pooler' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.pooler.load_state_dict(state_dict[self._pooler_key], + strict=strict) + # Decoder. + if self.add_decoder: + assert 'decoder' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.decoder.load_state_dict(state_dict[self._decoder_key], + strict=strict) diff --git a/training/benchmarks/gpt2/pytorch/model/models/module.py b/training/benchmarks/gpt2/pytorch/model/models/module.py new file mode 100644 index 000000000..8b09c46a1 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/module.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
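# Illustrative aside (not part of the patch): the legacy-key rename in
# TransformerLanguageModel.load_state_dict above maps '.attention.' to
# '.self_attention.' so older checkpoints load into the current module names.
# With made-up keys:
old = {'layers.0.attention.query_key_value.weight': 'W', 'layers.0.mlp.dense.weight': 'M'}
new = {(k.replace('.attention.', '.self_attention.') if '.attention.' in k else k): v
       for k, v in old.items()}
print(sorted(new))   # ['layers.0.mlp.dense.weight', 'layers.0.self_attention.query_key_value.weight']
# (end of aside)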
+ +"""Megatron Module""" + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + +class MegatronModule(torch.nn.Module): + """Megatron specific extensions of torch Module with support + for pipelining.""" + + def __init__(self, share_word_embeddings=True): + super(MegatronModule, self).__init__() + self.share_word_embeddings = share_word_embeddings + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Use this function to override the state dict for + saving checkpoints.""" + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def word_embeddings_weight(self): + if self.pre_process: + return self.language_model.embedding.word_embeddings.weight + else: + if not self.share_word_embeddings: + raise Exception('word_embeddings_weight() called for last ' + 'stage, but share_word_embeddings is false') + return self.word_embeddings.weight + + + def initialize_word_embeddings(self, init_method_normal): + if not self.share_word_embeddings: + raise Exception('initialize_word_embeddings() was called but ' + 'share_word_embeddings is false') + return + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` + #is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val, float16_convertor): + """Convert fp32 `val` to fp16/bf16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = float16_convertor(val) + return val + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + """Convert fp16/bf16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + return conversion_helper(val, float_conversion) + + + +class Float16Module(MegatronModule): + + def __init__(self, module, args): + super(Float16Module, self).__init__() + + if args.fp16: + self.add_module('module', module.half()) + def float16_convertor(val): + return val.half() + elif args.bf16: + self.add_module('module', module.bfloat16()) + def float16_convertor(val): + return val.bfloat16() + else: + raise Exception('should not be here') + + self.float16_convertor = float16_convertor + + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + + def forward(self, *inputs, **kwargs): + inputs = fp32_to_float16(inputs, self.float16_convertor) + outputs = self.module(*inputs, **kwargs) + outputs = float16_to_fp32(outputs) + return outputs + + + def state_dict(self, prefix='', keep_vars=False): + return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + return self.module.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + + def load_state_dict(self, state_dict, strict=True): + 
self.module.load_state_dict(state_dict, strict=strict) diff --git a/training/benchmarks/gpt2/pytorch/model/models/utils.py b/training/benchmarks/gpt2/pytorch/model/models/utils.py new file mode 100644 index 000000000..a32740c7f --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/utils.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for models.""" + +import math + +import torch +import config + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + + +def get_linear_layer(rows, columns, init_method): + """Simple linear layer with weight initialization.""" + layer = torch.nn.Linear(rows, columns) + if get_args().perform_initialization: + init_method(layer.weight) + with torch.no_grad(): + layer.bias.zero_() + return layer + + +class RMSNorm(torch.nn.Module): + """An alternate to layer normalization, without mean centering and the learned bias + paper: [Root mean square layer normalization](https://arxiv.org/abs/1910.07467) + code: https://github.com/facebookresearch/llama/blob/main/llama/model.py#L33 + """ + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = torch.nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight diff --git a/training/benchmarks/gpt2/pytorch/mpu/__init__.py b/training/benchmarks/gpt2/pytorch/mpu/__init__.py new file mode 100644 index 000000000..e889e1e7a --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/mpu/__init__.py @@ -0,0 +1,11 @@ +import torch + +_DATA_PARALLEL_GLOBAL_RANKS = None +if torch.distributed.is_initialized(): + _DATA_PARALLEL_GLOBAL_RANKS = [i for i in range(torch.distributed.get_world_size())] + +def get_data_parallel_rank(): + return torch.distributed.get_rank() + +def get_data_parallel_world_size(): + return torch.distributed.get_world_size() \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/optimizer/__init__.py b/training/benchmarks/gpt2/pytorch/optimizer/__init__.py new file mode 100644 index 000000000..1e8edec9e --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/__init__.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from torch.optim import SGD +from torch.optim import AdamW as Adam + +from optimizer.distrib_optimizer import DistributedOptimizer +from optimizer.grad_scaler import ConstantGradScaler, DynamicGradScaler +from optimizer.optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer +import config + +def get_param_groups(module, + no_weight_decay_cond, + scale_lr_cond, + lr_mult): + """creates param groups based on weight decay condition (regularized vs non regularized) + and learning rate scale condition (args.lr vs lr_mult * args.lr) + scale_lr_cond is used during finetuning where head of the network requires a scaled + version of the base learning rate. 
+ """ + wd_no_scale_lr = [] + wd_scale_lr = [] + no_wd_no_scale_lr = [] + no_wd_scale_lr = [] + for name, param in module.named_parameters(): + if not param.requires_grad: + continue + + if no_weight_decay_cond is not None: + no_wd = no_weight_decay_cond(name, param) + else: + # do not regularize biases nor Norm parameters + no_wd = name.endswith(".bias") or len(param.shape) == 1 + + if scale_lr_cond is not None: + scale_lr = scale_lr_cond(name, param) + else: + scale_lr = False + + if not no_wd and not scale_lr: + wd_no_scale_lr.append(param) + elif not no_wd and scale_lr: + wd_scale_lr.append(param) + elif no_wd and not scale_lr: + no_wd_no_scale_lr.append(param) + else: + no_wd_scale_lr.append(param) + + param_groups = [] + if len(wd_no_scale_lr): + param_groups.append({'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0}) + if len(wd_scale_lr): + param_groups.append({'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult}) + if len(no_wd_no_scale_lr): + param_groups.append({'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0}) + if len(no_wd_scale_lr): + param_groups.append({'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult}) + + return param_groups + +def get_megatron_optimizer(model, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0): + # Base optimizer. + param_groups = get_param_groups(model, + no_weight_decay_cond, + scale_lr_cond, + lr_mult) + + if config.optimizer == 'adam': + optimizer = Adam(param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + betas=(config.adam_beta1, config.adam_beta2), + eps=config.adam_eps) + elif config.optimizer == 'sgd': + optimizer = SGD(param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + momentum=config.sgd_momentum) + else: + raise Exception('{} optimizer is not supported.'.format( + config.optimizer)) + + # Determine whether the params have main-grad field. + params_have_main_grad = False + if config.DDP_impl == 'local': + params_have_main_grad = True + + # Mixed precision optimizer. + # - Note: both the Float16Optimizer and the DistributedOptimizer inherit + # from the MixedPrecisionOptimizer, which manages any optimizer where + # the model params and main params are distinct. + if config.fp16 or config.bf16 or config.use_distributed_optimizer: + + # Grad scaler: + # if loss-scale is provided, instantiate the constant scaler. + # if we are using fp16 and loss-scale is not present, use a + # dynamic scaler. + # otherwise we are running in bf16 with no loss-scale so + # leave it as None. + grad_scaler = None + + # Constant loss scale. + if config.loss_scale: + grad_scaler = ConstantGradScaler(config.loss_scale) + + # Dynamic loss scale. + else: + if config.fp16: + grad_scaler = DynamicGradScaler( + initial_scale=config.initial_loss_scale, + min_scale=config.min_loss_scale, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=config.loss_scale_window, + hysteresis=config.hysteresis) + + # Megatron optimizer. + opt_ty = DistributedOptimizer \ + if config.use_distributed_optimizer else \ + Float16OptimizerWithFloat16Params + return opt_ty(optimizer, + config.clip_grad, + params_have_main_grad, + config.use_contiguous_buffers_in_local_ddp, + config.fp16, + config.bf16, + config.params_dtype, + grad_scaler, + model) + + # FP32. 
+ + return FP32Optimizer(optimizer, config.clip_grad, + params_have_main_grad, + config.use_contiguous_buffers_in_local_ddp, + model) diff --git a/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py new file mode 100644 index 000000000..e2f053054 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Gradient clipping.""" + +import torch +from torch import inf + +from model.models.module import param_is_not_shared + + +def clip_grad_norm_fp32(parameters, grads_for_norm, + max_norm, norm_type=2): + """Clips gradient norm of an iterable of parameters whose gradients + are in fp32. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single + Tensor that will be used for calculating the grad norm. + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + model_parallel_group (group): given the nature of the distributed + optimizer, this is passed as an argument. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + if isinstance(grads_for_norm, torch.Tensor): + grads_for_norm = [grads_for_norm] + + # Grads. + grads = [] + for param in parameters: + if param.grad is not None: + assert param.grad.type() in ['torch.xpu.FloatTensor', 'torch.cuda.FloatTensor'] + grads.append(param.grad.detach()) + + # Norm parameters. + max_norm = float(max_norm) + norm_type = float(norm_type) + total_norm = 0.0 + + # Calculate norm. + if norm_type == inf: + total_norm = max(grad.abs().max() for grad in grads_for_norm) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm = total_norm_cuda[0].item() + + else: + if norm_type == 2.0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + # Use apex's multi-tensor applier for efficiency reasons. + # Multi-tensor applier takes a function and a list of list + # and performs the operation on that list all in one kernel. + if grads_for_norm: + grad_norm = torch.cuda.FloatTensor([item.norm() for item in grads_for_norm]).norm() + else: + grad_norm = torch.cuda.FloatTensor([0]) + # Since we will be summing across data parallel groups, + # we need the pow(norm-type). + total_norm = grad_norm ** norm_type + + else: + for grad in grads_for_norm: + grad_norm = torch.norm(grad, norm_type) + total_norm += grad_norm ** norm_type + + # Sum across all model-parallel GPUs. + total_norm = total_norm.item() ** (1.0 / norm_type) + + # Scale. + clip_coeff = max_norm / (total_norm + 1.0e-6) + if clip_coeff < 1.0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + grads = [item * clip_coeff for item in grads] + + return total_norm diff --git a/training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py b/training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py new file mode 100644 index 000000000..50125b83c --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py @@ -0,0 +1,1011 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
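# Illustrative aside (not part of the patch): the clipping math in clip_grad_norm_fp32
# above, reduced to a CPU-only sketch -- a global 2-norm over all grads (norm of
# per-tensor norms), then scaling by max_norm / (total_norm + 1e-6) when that
# coefficient is below 1.0.
import torch

grads = [torch.randn(10), torch.randn(5)]
total_norm = torch.norm(torch.stack([g.norm(2.0) for g in grads]), 2.0).item()
max_norm = 1.0                                   # made-up clip threshold
clip_coeff = max_norm / (total_norm + 1.0e-6)
if clip_coeff < 1.0:
    grads = [g * clip_coeff for g in grads]
print(total_norm, torch.norm(torch.stack([g.norm(2.0) for g in grads]), 2.0).item())
# (end of aside)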
+ +"""Megatron distributed optimizer.""" +import math + +import torch +from torch.optim import AdamW as Adam + +from mpu import get_data_parallel_world_size, get_data_parallel_rank +from optimizer.optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper +import config + +class Range: + """ + A range represents a start and end points for indexing a shard + from a full tensor. + """ + def __init__(self, start, end): + self.start = start + self.end = end + self.size = end - start + def normalize(self, start = 0): + return Range(start, start + self.size) + def __str__(self): + return "%d,%d [%d]" % (self.start, self.end, self.size) + def __len__(self): + return self.end - self.start + + +class DistributedOptimizer(MixedPrecisionOptimizer): + """Distributed optimizer, for all data types (fp16, bf16, and fp32). + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. + """ + + @classmethod + def build_model_gbuf_param_range_map(cls, model, dtype, gbuf_world_range): + """ + Build mapping from param reference to grad buffer shard ranges. + + This method builds a mapping from parameter references to grad + buffer shard ranges, specific to each data-parallel (DP) rank's + set of 'owned' parameters. Each grad buffer (padded to be an even + multiple of DP-world-size) is conceptually divided into DP-world-size + contiguous regions, where each DP rank 'owns' a contiguous regions. + Ownership in this sense means DP rank is responsible for reducing + the relevant subset of grads, and updating the relevant subset of + params. + + This conceptual partitioning of the grad buffer does NOT respect + parameter boundaries, and as such it is assumed that each created + range references a shard (or subset) of the full parameter. It is + easiest to think of each DP rank as operating (i.e., reducing, + gathering) purely on views into the grad buffer, for all model-to- + main & main-to-model operations. + + This method creates three ranges: + - The param's range within the entire grad buffer (i.e., world index). + - The param's range within the DP rank's local view of the grad buffer. + - The param's range within itself (i.e., its shard). + """ + + # Param range map. 
+ param_world_index_map = model._grad_buffer_param_index_map[dtype] + param_range_map = {} + for param, param_world_indexes in param_world_index_map.items(): + + # Param range. + param_world_start, param_world_end = param_world_indexes + param_local_start = max( + 0, + param_world_start - gbuf_world_range.start) + param_local_end = min( + gbuf_world_range.size, + param_world_end - gbuf_world_range.start) + + # Add param, if within local gbuf range. + if param_local_end > param_local_start: + param_local_range = Range(param_local_start, param_local_end) + param_world_range = param_local_range.normalize( + param_local_start + gbuf_world_range.start) + sub_param_start = max(0, gbuf_world_range.start-param_world_start) + sub_param_range = param_local_range.normalize(sub_param_start) + param_range_map[param] = { + "gbuf_world" : param_world_range, + "gbuf_local" : param_local_range, + "param" : sub_param_range, + } + + return param_range_map + + + @classmethod + def build_model_gbuf_range(cls, model, dtype): + """ + Build mapping between params and their grad buffers. + + This method does the initial setup for the method above. This setup + includes determining the shard ranges into the DDP's grad buffer for + each data-parallel (DP) rank. Each DP rank keeps range info for + all other DP ranks, for the purpose of creating args for + reduce-scatter and all-gather. + """ + + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_world_size = mpu.get_data_parallel_world_size() + + # Grad buffer range. + grad_buffer = model._grad_buffers[dtype] + gbuf_size = grad_buffer.numel + max_gbuf_range_size = int(math.ceil(gbuf_size / data_parallel_world_size)) + + # All world ranges. (i.e., across all data parallel ranks) + gbuf_world_all_ranges = [] + for r in range(data_parallel_world_size): + gbuf_world_start = r * max_gbuf_range_size + gbuf_world_end = min(gbuf_size, gbuf_world_start+max_gbuf_range_size) + gbuf_world_range = Range(gbuf_world_start, gbuf_world_end) + gbuf_world_all_ranges.append(gbuf_world_range) + + # Local DP's ranges. + gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank] + gbuf_local_range = gbuf_world_range.normalize() + + # Get each param's ranges. + param_range_map = cls.build_model_gbuf_param_range_map(model, + dtype, + gbuf_world_range) + + # Group into dict. + data = { + "local" : gbuf_local_range, + "world" : gbuf_world_range, + "world_all" : gbuf_world_all_ranges, + "param_map" : param_range_map, + "max_range_size" : max_gbuf_range_size, + } + + return data + + + @classmethod + def build_model_gbuf_range_map(cls, model): + """ + Create param-to-grad-buffer mappings, for grad buffer data types + within a specific virtual model. + """ + return { + dtype : cls.build_model_gbuf_range(model, dtype) + for dtype in model._grad_buffers + } + + + @classmethod + def build_model_param_gbuf_map(cls, model_gbuf_ranges): + """ + Create a reverse of the model_gbuf_ranges, for referencing in + opposite direction. + """ + param_gbuf_map = {} + for model_index, model_gbuf_range_map in enumerate(model_gbuf_ranges): + for dtype, gbuf_range_map in model_gbuf_range_map.items(): + for param, param_range_map in gbuf_range_map["param_map"].items(): + param_gbuf_map[param] = (model_index, dtype) + return param_gbuf_map + + + @classmethod + def build_optimizer_group_ranges(cls, param_groups, model_gbuf_ranges): + """ + Create optimizer groups. 
+ + Given the set of parameter shard ranges that are owned by the current + data-parallel (DP) rank, gather the set of parameters that will be + used (in the method below) to create the current DP's optimizer + groups. + """ + + num_groups = len(param_groups) + + # Param group map. + # World param group map. + # - Store a mapping of for all parameters + # across all DP ranks. This is necessary because it is our first + # cross reference between the DDP mappings and the optimizer group + # parameters. This mapping only for use in the next step of building + # the local mapping over this DP rank's parameters. + world_param_group_map = {} + for group_index, group in enumerate(param_groups): + for param in group["params"]: + assert param.requires_grad + world_param_group_map[param] = group_index + + # Optimizer group ranges & param-group mapping. + # - Build a mapping from groups to their contained parameters, and also + # from parameters to their containing group index and order within + # the group. The group index and order are particularly important for + # saving and loading checkpoints. + local_param_group_map = {} + group_ranges = [ {"params": []} for _ in param_groups ] + for model_gbuf_range_map in model_gbuf_ranges: + for dtype, gbuf_range_map in model_gbuf_range_map.items(): + for param in gbuf_range_map["param_map"]: + group_index = world_param_group_map[param] + group_range = group_ranges[group_index] + group_range["params"].append(param) + local_param_group_map[param] = \ + (group_index, len(group_range["params"]) - 1) + + # Squeeze zero-size group ranges. + for group_index, group_range in enumerate(group_ranges): + group_range["orig_group"] = param_groups[group_index] + group_range["orig_group_idx"] = param_groups[group_index] + + return local_param_group_map, group_ranges + + + @classmethod + def build_model_and_main_param_groups(cls, + model_gbuf_ranges, + param_gbuf_map, + opt_group_ranges): + """ + Create main parameter groups needed for the optimizer step. + + These groups encompass both: 1) groups used by this class, for + reducing/gather, and 2) groups used by the inner optimizer for the + parameter update. Given that the conceptual grad buffer partitioning + (created in earlier method) doesn't respect parameter boundaries, + the optimizer operates on shards of the model parameters, rather than + the full parameters. + """ + + # Parameter groups: + # model_float16_groups: original float16 parameters + # model_fp32_groups: original fp32 parameters + # shard_float16_groups: shards of original float16 parameters + # shard_fp32_groups: shards of original fp32 parameters + # shard_fp32_from_float16_groups: fp32 copy of float16 parameters + model_float16_groups = [] + model_fp32_groups = [] + shard_float16_groups = [] + shard_fp32_groups = [] + shard_fp32_from_float16_groups = [] + + # Allocate (or slice) each group's param shard. + for group_index, group_range in enumerate(opt_group_ranges): + + # Params of this group. 
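# Illustration of the sharding pattern used in the loop below (hypothetical shapes, not
# from this patch): for a 10x4 fp16 parameter flattened to 40 elements with
# param_range = Range(8, 24), the rank keeps
#     shard_model_param = model_param.detach().view(-1)[8:24]   # fp16 view, 16 elements
#     shard_main_param  = shard_model_param.clone().float()     # fp32 master shard
# and only the fp32 shards are handed to the inner optimizer further down.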
+ model_float16_params_this_group = [] + model_fp32_params_this_group = [] + shard_float16_params_this_group = [] + shard_fp32_params_this_group = [] + shard_fp32_from_float16_params_this_group = [] + model_float16_groups.append(model_float16_params_this_group) + model_fp32_groups.append(model_fp32_params_this_group) + shard_float16_groups.append(shard_float16_params_this_group) + shard_fp32_groups.append(shard_fp32_params_this_group) + shard_fp32_from_float16_groups.append( + shard_fp32_from_float16_params_this_group) + + for model_param in group_range["params"]: + + assert model_param.requires_grad + + model_index, dtype = param_gbuf_map[model_param] + gbuf_range = model_gbuf_ranges[model_index][dtype] + param_range = gbuf_range["param_map"][model_param]["param"] + + # fp16, bf16 params. + if model_param.type() in ['torch.cuda.HalfTensor', + 'torch.cuda.BFloat16Tensor', + 'torch.xpu.HalfTensor', + 'torch.xpu.BFloat16Tensor']: + + # Clone model -> main. + shard_model_param = model_param.detach().view(-1) \ + [param_range.start:param_range.end] + shard_main_param = shard_model_param.clone().float() + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + shard_main_param.shared = model_param.shared + + # Add to group. + model_float16_params_this_group.append(model_param) + shard_float16_params_this_group.append(shard_model_param) + shard_fp32_from_float16_params_this_group.append(shard_main_param) + + # fp32 params. + elif model_param.type() in ['torch.cuda.FloatTensor', + 'torch.xpu.FloatTensor']: + shard_model_param = model_param.view(-1) \ + [param_range.start:param_range.end] + model_fp32_params_this_group.append(model_param) + shard_fp32_params_this_group.append(shard_model_param) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + + else: + raise TypeError('Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor, or ' + 'torch.xpu.FloatTensor, or ' + 'torch.xpu.HalfTensor, or ' + 'torch.xpu.BFloat16Tensor. ' + 'Received {}'.format(model_param.type())) + + # Update optimizer's params. + group_range["orig_group"]["params"] = [ + *shard_fp32_params_this_group, + *shard_fp32_from_float16_params_this_group, + ] + + return ( + model_float16_groups, + model_fp32_groups, + shard_float16_groups, + shard_fp32_groups, + shard_fp32_from_float16_groups, + ) + + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models): + """ + See top of class definition for argument descriptions. + + The steps in this method create the core mapping between DDP grad + buffers, parameters, and parameter shard ranges, that is needed for + converting between model param indexes and main parameter shard + indexes. This method also updates the optimizer parameter groups + with the newly created shards. + """ + + super().__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models) + + # Verify that contiguous buffers are being used. + # - Note: this should already be checked in arguments.py. + assert use_contiguous_buffers_in_local_ddp + assert isinstance(optimizer, Adam), \ + "Only Adam currently supported, due to checkpointing requirements." + + # Model grad buffer ranges. 
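# The remainder of __init__ wires the pieces together in order:
#   1. per-model grad-buffer shard ranges        (build_model_gbuf_range_map)
#   2. reverse param -> (model, dtype) lookup    (build_model_param_gbuf_map)
#   3. optimizer group ranges over owned shards  (build_optimizer_group_ranges)
#   4. model/main shard parameter groups         (build_model_and_main_param_groups)
#   5. per-dtype param buffers sized like the grad buffers (see the FIXME about
#      torch.frombuffer), after which the inner optimizer's param_groups are
#      re-pointed at the shard groups.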
+ self.model_gbuf_ranges = [] + for model_index, model in enumerate(self.models): + self.model_gbuf_ranges.append(self.build_model_gbuf_range_map(model)) + self.model_param_gbuf_map = \ + self.build_model_param_gbuf_map(self.model_gbuf_ranges) + + # Optimizer ranges. + self.model_param_group_index_map, self.opt_group_ranges = \ + self.build_optimizer_group_ranges(self.optimizer.param_groups, + self.model_gbuf_ranges) + + # Allocate main param shards. + ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, + self.shard_fp32_groups, + self.shard_fp32_from_float16_groups, + ) = self.build_model_and_main_param_groups(self.model_gbuf_ranges, + self.model_param_gbuf_map, + self.opt_group_ranges) + + # Initialize param buffers. + # - These are views on the DDP model's grad buffers, that share + # storage & have their own dtype. This is safe because the param + # dtype size is always <= grad dtype size. + self.param_buffers = [] + for model_index, model in enumerate(self.models): + current_param_buffers = {} + for dtype, grad_buffer in model._grad_buffers.items(): + + # Handle older/newer method for getting untyped storage. + try: + storage = grad_buffer.data.storage()._untyped() + except: + storage = grad_buffer.data.storage().untyped() + + # FIXME: xpu do not support init torch.Tensor from XPU-typed tensor, use torch.frombuffer instead + # Typed param buffer. + #param_buffer = torch.tensor( + # storage, + # dtype = params_dtype, + # device = grad_buffer.data.device) + param_buffer = torch.frombuffer(grad_buffer.data.cpu().numpy(), dtype = params_dtype).to(grad_buffer.data.device) + param_buffer = param_buffer[:grad_buffer.numel_padded] + current_param_buffers[dtype] = param_buffer + self.param_buffers.append(current_param_buffers) + + # Update optimizer groups. + # - Also, leverage state_dict() and load_state_dict() to + # recast preexisting per-param state tensors. + self.optimizer.param_groups = \ + [ g["orig_group"] for g in self.opt_group_ranges ] + self.optimizer.load_state_dict(self.optimizer.state_dict()) + + + def get_model_param_range_map(self, param): + """ + Given a model param, get the index sub-range of the param that this + data-parallel rank owns. + """ + model_index, dtype = self.model_param_gbuf_map[param] + gbuf_range_map = self.model_gbuf_ranges[model_index][dtype] + param_range_map = gbuf_range_map["param_map"][param] + return param_range_map + + + def get_model_parallel_group(self): + """ + With the distributed optimizer, the model parallel group is the + entire world. + """ + return None + + + def state_dict(self): + """ + The state dict contains all non-DP-rank-dependent (i.e., non-parameter- + related) optimizer variables. The returned state dict can be stored in + the standard model/RNG checkpoint file. The parameter and dependent + optimizer state (e.g., exp_avg, exp_avg_sq) are stored in a separate + checkpoint file by calling 'save_parameter_state()'. + """ + + state_dict = {} + + # Optimizer state (do not store parameter state here). + state_dict['optimizer'] = { + k : v + for k, v in self.optimizer.state_dict().items() + if k != "state" + } + for param_group in state_dict["optimizer"]["param_groups"]: + del param_group["params"] + + # Grad scaler state. + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + + return state_dict + + + def load_state_dict(self, state_dict): + """Load the state dict. + + As detailed in state_dict(), the state dict contains all non- + parameter-related variables. 
This method is notably longer than + state_dict(), because the Torch optimizers state has yet to be + allocated at this point, and so we must do a cross referencing between + the optimizers state (and the ordering it expects for parameter state) + and this DP rank's shards. The optimizer at this point does not contain + any tensor dimension information, so we must get these dimensions from + the DP shards mapped during DistributedOptimizer.__init__(). + + The tensor parameter state is loaded via load_parameter_state(), and + so this method also must populate the loaded state dict with dummy + tensor data (i.e., via torch.empty() below). This will be overwritten + during load_parameter_state(). + + ** Note: Torch optimizer's state structure. ** + The Torch optimizer stores its state in two levels. The top level is a + list of groups, where each group contains a list of integer indexes + (corresponding to parameters) that index into a master parameter list + that is shared by all groups. As such, three values are necessary for + maintaining this ordering: + + - group_index : The group to which a parameter belongs. + - group_order : The index of a parameter within its group. + - state_order : The index of a parameter within the shared parameter + list. + """ + + # Get the Torch optimizer's state dict. + # - This 'inner' optimizer at this point is unallocated, and only + # contains an integer odering of parameters within each group, and + # the ordering of parameters within its flattened parameter state + # list. + inner_state_dict = self.optimizer.state_dict() + state_dict_param_groups = [{ + **group, + "params" : list(inner_state_dict["param_groups"][idx]["params"]), + } for idx, group in enumerate(state_dict["optimizer"]["param_groups"])] + + # Allocate 'dummy' data for optimizer state (i.e., torch.empty() below) + # - Real data is overwritten during load_parameter_state(). + state_dict_state = [] + for gbuf_range_maps in self.model_gbuf_ranges: + for gbuf_range_map in gbuf_range_maps.values(): + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Get parameter ordering information (see method docstring + # for details). + group_index, group_order = \ + self.model_param_group_index_map[model_param] + state_order = inner_state_dict["param_groups"] \ + [group_index]["params"][group_order] + + # Allocate dummy tensors. + numel = len(param_range_map["gbuf_world"]) + init_shard = lambda : torch.empty( + (numel,), + dtype=torch.float32, + device=torch.cuda.current_device()) + + state_dict_state.append((state_order, { + "exp_avg" : init_shard(), + "exp_avg_sq" : init_shard(), + })) + + # Sort by state order (see method docstring for details). + state_dict_state.sort(key = lambda s : s[0]) + state_dict_state = {s[0]:s[1] for s in state_dict_state} + + # Optimizer. + self.optimizer.load_state_dict({ + "state" : state_dict_state, + "param_groups" : state_dict_param_groups, + }) + + # Grad scaler. + if 'grad_scaler' in state_dict: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + + + def save_parameter_state(self, filename): + """Save parameter state (i.e., parameter & optimizer tensors). + + This method performs three steps: + - For each DP rank, copy param & optimizer shards to contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + - Gather contiguous buffers on DP rank 0 and concatenate to world + buffers. + - Save world buffers to disk (i.e., distrib_opt.pt). 
+ """ + + # Data parallelism variables. + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() + data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) + + # Collect param states. + state = {} + for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): + + # Iterate grad buffers (by data type). + dtype_state = {} + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map in gbuf_range_maps.items(): + + # Compute local DP contiguous shard's size. + model = self.models[model_idx] + gbuf_world_numel = model._grad_buffers[dtype].numel_padded + gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) + local_shards = {key:torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq")} + + # Build contiguous DP rank shards (for param + optim states). + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = \ + self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups \ + [group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param" : main_param, + **optim_state, + } + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + local_shards[key][gbuf_local_start:gbuf_local_end] \ + .data.copy_(tensors[key].detach().cpu()) + + # Gather contiguous shards on DP rank 0. + world_tensors = {} + for key, send_tensor in local_shards.items(): + + # Gather tensor list. + if data_parallel_rank == 0: + recv_tensors = [torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for _ in range(data_parallel_world_size)] + else: + recv_tensors = None + + # Gather. + torch.distributed.gather( + send_tensor, + recv_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Concatenate. + if data_parallel_rank == 0: + world_tensors[key] = torch.cat(recv_tensors) + + # Collect world state. + dtype_state[dtype] = world_tensors + state[model_idx] = dtype_state + + # Save param state. + if data_parallel_rank == 0: + torch.save(state, filename) + + + def load_parameter_state(self, filename): + """Load parameter state (i.e., parameter & optimizer tensors). + + This method performs the reverse of save_parameter_state(): + - Load world buffers from disk (i.e., distrib_opt.pt). + - Scatter contiguous buffers from DP rank 0 to each DP rank (each DP + rank receives its relevant subset of the world buffers). + - For each DP rank, copy param & optimizer shards from contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + """ + + # Data parallelism variables. + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() + data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) + + # Load on DP rank 0. + if data_parallel_rank == 0: + loaded_state = torch.load(filename) + + # Scatter tensors to all DP ranks. + for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): + for dtype, gbuf_range_map in gbuf_range_maps.items(): + + # Compute local DP contiguous shard's size. 
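# Illustrative sizes (hypothetical, not from this patch): with a padded grad buffer of
# gbuf_world_numel = 1024 elements and data_parallel_world_size = 4, every rank receives
# a 256-element fp32 shard for each of "param", "exp_avg" and "exp_avg_sq"; the padding
# applied in distributed.py is what makes the division below exact.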
+ model = self.models[model_idx] + gbuf_world_numel = model._grad_buffers[dtype].numel_padded + gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) + + # Contiguous local shards (received from DP rank 0). + local_shards = {key:torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq")} + + # Scatter local shards from DP rank 0. + for key, recv_tensor in local_shards.items(): + + # Scatter tensor list. + if data_parallel_rank == 0: + world_tensor = loaded_state[model_idx][dtype][key] + gbuf_start_idxs = \ + list(range(0, gbuf_world_numel, gbuf_local_numel)) + send_tensors = [world_tensor[i:(i+gbuf_local_numel)] + for i in gbuf_start_idxs] + else: + send_tensors = None + + # Scatter. + torch.distributed.scatter( + recv_tensor, + send_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Copy local contiguous shards to param/optim shards. + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = \ + self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups \ + [group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param" : main_param, + **optim_state, + } + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + tensors[key].data.copy_( + local_shards[key][gbuf_local_start:gbuf_local_end]) + + + def zero_grad(self, set_to_none=True): + """ + Zero grads. + + We only need to zero the model related parameters, i.e., + model_float16_groups & model_fp32_groups. We additionally zero + the remaining groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point. + """ + for groups in ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, # grad empty/unused here? + self.shard_fp32_groups, # throws grad-access warning + self.shard_fp32_from_float16_groups): + for group in groups: + _zero_grad_group_helper(group, set_to_none) + + + @staticmethod + def get_model_buffer_dp_views(model_buffers): + """ + Get shard views of each of the DDP's param/grad buffers. + + In this nested list, the top level is grouped by the virtual model + index and the buffer's data type. The sub-level is a list of + shards of that buffer, where each shard in the list represents + a contiguous view of the buffer, that is owned by a data-parallel + rank. The shard boundary does not respect parameter boundaries, and + so the elements of some parameters are split across data parallel + ranks. + + Additionally, return references to the entire buffers, for use + in _reduce_scatter_base and _all_gather_base. + """ + + data_parallel_world_size = mpu.get_data_parallel_world_size() + + # Buffer views. 
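# Example of the view layout built below (hypothetical numbers): a 1024-element buffer
# with data_parallel_world_size = 4 gives shard_size = 256 and views
#     buf[0:256], buf[256:512], buf[512:768], buf[768:1024]
# where view r is the slice reduce-scattered into, or all-gathered from, by
# data-parallel rank r.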
+ view_items = [] + for model_index, buffers in enumerate(model_buffers): + for dtype, buf in buffers.items(): + + assert buf.numel() % data_parallel_world_size == 0 + shard_size = int(buf.numel() / data_parallel_world_size) + buf_views = [buf[(r*shard_size):((r+1)*shard_size)] + for r in range(data_parallel_world_size)] + view_items.append((model_index, dtype, buf, buf_views)) + + return view_items + + + def get_model_grad_buffer_dp_views(self): + return self.get_model_buffer_dp_views([ + {dtype : mem_buffer.data} + for model in self.models + for dtype, mem_buffer in model._grad_buffers.items()]) + + + def get_model_param_buffer_dp_views(self): + return self.get_model_buffer_dp_views(self.param_buffers) + + + def reduce_model_grads(self, args, timers): + """ + Reduce-scatter model grads. + + The DDP's grad buffer is used for the reduce-scatter, and thus no + tensors are dynamically allocated. + + Note: this is a different order of reduction, versus the non- + distributed optimizer, which reduces: 1) layernorm grads, 2) all + grads, 3) embedding grads. + """ + + # All-reduce layer-norm grads (for sequence parallelism). + timers('layernorm-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time) + self.allreduce_layernorm_grads(args) + timers('layernorm-grads-all-reduce').stop() + + # All-reduce embedding grads. + timers('embedding-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time) + self.allreduce_embedding_grads(args) + timers('embedding-grads-all-reduce').stop() + + # Reduce-scatter setup. + timers('grads-reduce-scatter', log_level=1).start( + barrier=config.barrier_with_L1_time) + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_group = mpu.get_data_parallel_group() + + # Scale grad buffers by '1 / data_parallel_world_size'. + for model in self.models: + for dtype, gbuf in model._grad_buffers.items(): + gbuf.data /= data_parallel_world_size + + # Reduce-scatter all grads. + gbuf_view_items = self.get_model_grad_buffer_dp_views() + for index, (model_index, dtype, gbuf, gbuf_views) \ + in enumerate(gbuf_view_items): + + torch.distributed._reduce_scatter_base( + gbuf_views[data_parallel_rank], + gbuf, + group = data_parallel_group, + ) + + timers('grads-reduce-scatter').stop() + + + def gather_model_params(self, args, timers): + """ + All-gather updated model params. + + The DDP's param buffer is used for the all-gather, and thus no + tensors are dynamically allocated. After the all-gather, the params + can be copied from the param buffer to the param. + """ + + timers('params-all-gather', log_level=1).start( + barrier=config.barrier_with_L1_time) + + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group = mpu.get_data_parallel_group() + + # All-gather updated main params. + # - All param buffer views are guaranteed to have the same num elements + # across all data parallel ranks, due to grad buffer padding that is + # done in distributed.py, and extended to the param buffers. Thus, + # all sub-views will have consistent start/end indexes across data + # parallel ranks. + pbuf_view_items = self.get_model_param_buffer_dp_views() + for index, (model_index, dtype, pbuf, pbuf_views) \ + in enumerate(pbuf_view_items): + + torch.distributed._all_gather_base( + pbuf, + pbuf_views[data_parallel_rank], + group = data_parallel_group, + ) + + # Copy from param buffer to each param. 
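# After the all-gather above, every rank holds the complete updated parameter values in
# self.param_buffers, so the copy below is purely local: each parameter reads back its
# own (buf_start, buf_end) slice recorded in model._grad_buffer_param_index_map when the
# grad buffer was laid out.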
+ for model_id, model in enumerate(self.models): + for dtype, param_map in model._grad_buffer_param_index_map.items(): + for param, (buf_start, buf_end) in param_map.items(): + param_buf = self.param_buffers[model_id][dtype] + param_buf_shard = param_buf[buf_start:buf_end] + param.view(-1).detach().copy_(param_buf_shard) + + timers('params-all-gather').stop() + + + def _collect_main_grad_data_for_unscaling(self): + """ + Note: this should be equivalent to the float-16 optimizer's method, + but writtent differently, so the two should be combined. + """ + return [ + param.grad.data + for group in self.optimizer.param_groups + for param in group["params"] + ] + + + def _get_model_and_main_params_data_float16(self): + """ + Get aligned list of model and main params. + """ + model_data = [] + main_data = [] + for model_group, main_group in zip(self.shard_float16_groups, + self.shard_fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + + def _copy_model_grads_to_main_grads(self): + """ + Copy model grads to main grads. + + Since this step follows a reduce-scatter through the DDP's grad + buffer, this method is responsible for copying the updated grads + from the grad buffer to the main shard's grad field. + """ + + # Utility method for copying group grads. + def copy_group_grads(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, + shard_main_groups): + for model_param, shard_main_param in zip(model_group, + shard_main_group): + + param_range_map = self.get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + model_grad = model_param.main_grad + shard_model_grad = model_grad.view(-1) \ + [param_range.start:param_range.end] + shard_main_param.grad = shard_model_grad.float() + + # Copy model groups to shard groups. + copy_group_grads(self.model_float16_groups, + self.shard_fp32_from_float16_groups) + copy_group_grads(self.model_fp32_groups, + self.shard_fp32_groups) + + + def _copy_main_params_to_model_params(self): + """ + Copy main params to model params. + + Since this step is followed by an all-gather through the DDP's grad + buffer, this method is responsible for copying the updated params + from the main shards into the correct position in the grad buffer. + """ + + # Utility method for copying group params. + def copy_group_params(shard_main_groups, model_groups): + for shard_main_group, model_group in zip(shard_main_groups, + model_groups): + for shard_main_param, model_param in zip(shard_main_group, + model_group): + + param_range_map = self.get_model_param_range_map(model_param) + world_range = param_range_map["gbuf_world"] + + assert world_range.size == shard_main_param.nelement() + + model_id, dtype = self.model_param_gbuf_map[model_param] + model_param_buffer = self.param_buffers[model_id][dtype] + + shard_model_param = model_param_buffer.view(-1) \ + [world_range.start:world_range.end] + + shard_model_param.data.copy_(shard_main_param) + + # Copy shard groups to model groups. + copy_group_params(self.shard_fp32_from_float16_groups, + self.model_float16_groups) + copy_group_params(self.shard_fp32_groups, + self.model_fp32_groups) + + + def _copy_model_params_to_main_params(self): + """ + Copy model params to main params. + + During finetuning, this method is used to reload the main params from + the model params. 
This copy does not make use of the grad buffer as + an intermediary. + """ + + # Utility method for copying group params. + def copy_group_params(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, + shard_main_groups): + for model_param, shard_main_param in zip(model_group, + shard_main_group): + + param_range_map = self.get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + shard_model_param = model_param.view(-1) \ + [param_range.start:param_range.end] + shard_main_param.data.copy_(shard_model_param) + + # Copy model groups to shard groups. + copy_group_params(self.model_float16_groups, + self.shard_fp32_from_float16_groups) + copy_group_params(self.model_fp32_groups, + self.shard_fp32_groups) diff --git a/training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py b/training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py new file mode 100644 index 000000000..a45225aed --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron grad scaler.""" + +from abc import ABC +from abc import abstractmethod + +import torch + +class MegatronGradScaler(ABC): + + def __init__(self, initial_scale): + """Initialize scale value with the input initial scale.""" + assert initial_scale > 0.0 + self._scale = torch.cuda.FloatTensor([initial_scale]) + + @property + def scale(self): + return self._scale + + @property + def inv_scale(self): + return self._scale.double().reciprocal().float() + + @abstractmethod + def update(self, found_inf): + pass + + @abstractmethod + def state_dict(self): + pass + + @abstractmethod + def load_state_dict(self, state_dict): + pass + + + +class ConstantGradScaler(MegatronGradScaler): + + def update(self, found_inf): + pass + + def state_dict(self): + return dict() + + def load_state_dict(self, state_dict): + pass + + + +class DynamicGradScaler(MegatronGradScaler): + + def __init__(self, initial_scale, min_scale, + growth_factor, backoff_factor, + growth_interval, hysteresis): + """"Grad scaler with dynamic scale that gets adjusted + during training.""" + super(DynamicGradScaler, self).__init__(initial_scale) + + # Lower bound on the scale. + assert min_scale > 0.0 + assert min_scale <= initial_scale + self.min_scale = torch.cuda.FloatTensor([min_scale]) + # Growth and backoff factors for the scale. + assert growth_factor > 1.0 + self.growth_factor = torch.cuda.FloatTensor([growth_factor]) + assert backoff_factor < 1.0 + assert backoff_factor > 0.0 + self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) + # Interval over which if we don't see any inf/nan, + # we will scale the grad scale by the growth factor. + assert growth_interval > 0 + self.growth_interval = growth_interval + # Number of inf/nans we should see before scaling down + # the grad scale by the backoff factor. + assert hysteresis > 0 + self.hysteresis = hysteresis + + # Trackers. + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + + + def update(self, found_inf): + + # If we have an inf/nan, growth tracker is set to 0 + # and hysterisis tracker is reduced by 1. + if found_inf: + self._growth_tracker = 0 + self._hysteresis_tracker -= 1 + # Now if we are out of hysteresis count, scale down the loss. 
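# Worked example (hypothetical values, not from this patch): with hysteresis = 2 and
# backoff_factor = 0.5, the first overflow only decrements the hysteresis tracker; a
# second overflow before the tracker is reset (it is only reset when the scale grows)
# halves the scale, clamped at min_scale, while growth_interval consecutive clean steps
# multiply it by growth_factor.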
+ if self._hysteresis_tracker <= 0: + self._scale = torch.max(self._scale * self.backoff_factor, + self.min_scale) + else: + # If there is no nan/inf, increment the growth tracker. + self._growth_tracker += 1 + # If we have had enough consequitive intervals with no nan/inf: + if self._growth_tracker == self.growth_interval: + # Reset the tracker and hysteresis trackers, + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + # and scale up the loss scale. + self._scale = self._scale * self.growth_factor + + + def state_dict(self): + state_dict = {} + state_dict['scale'] = self._scale + state_dict['growth_tracker'] = self._growth_tracker + state_dict['hysteresis_tracker'] = self._hysteresis_tracker + return state_dict + + + def load_state_dict(self, state_dict): + self._scale = state_dict['scale'].cuda(torch.cuda.current_device()) + self._growth_tracker = state_dict['growth_tracker'] + self._hysteresis_tracker = state_dict['hysteresis_tracker'] diff --git a/training/benchmarks/gpt2/pytorch/optimizer/optimizer.py b/training/benchmarks/gpt2/pytorch/optimizer/optimizer.py new file mode 100644 index 000000000..2d8f68886 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/optimizer.py @@ -0,0 +1,645 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Megatron optimizer.""" + +from abc import ABC +from abc import abstractmethod + +import torch + +from model.models.module import param_is_not_shared +from optimizer.clip_grads import clip_grad_norm_fp32 + +def _zero_grad_group_helper(group, set_to_none): + """Zero out the gradient for a group of parameters. + Note: copied from torch.optim.optimizer.""" + for param in group: + if param.grad is not None: + if set_to_none: + param.grad = None + else: + if param.grad.grad_fn is not None: + param.grad.detach_() + else: + param.grad.requires_grad_(False) + param.grad.zero_() + + +def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): + """Use multi-tensor-applier to copy values from one list to another. + We don't have a blfoat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16.""" + if overflow_buf: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(amp_C.multi_tensor_scale, + overflow_buf, + [this, that], + 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + + +class MegatronOptimizer(ABC): + + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, + use_contiguous_buffers_in_local_ddp, + models): + + """Input optimizer is the base optimizer for example Adam.""" + self.optimizer = optimizer + assert self.optimizer, 'no optimizer is provided.' + # Set gradient clipping and logging params. + self.clip_grad = clip_grad + self.params_have_main_grad = params_have_main_grad + self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp + + # 'models' are retained for access to the contiguous grad buffers. 
+ # (see distributed optimizer) + self.models = models + + if self.use_contiguous_buffers_in_local_ddp: + assert self.params_have_main_grad, \ + "use of contiguous buffer requires that params have main grad" + + + def get_parameters(self): + params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + params.append(param) + return params + + + def get_main_grads_for_grad_norm(self): + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + params = self.get_parameters() + grads_for_norm = [] + for param in params: + grad = param.grad + grad_not_none = grad is not None + is_not_shared = param_is_not_shared(param) + if grad_not_none and is_not_shared: + grads_for_norm.append(grad) + + return grads_for_norm + + + def clip_grad_norm(self, clip_grad): + params = self.get_parameters() + grads_for_norm = self.get_main_grads_for_grad_norm() + return clip_grad_norm_fp32( + params, grads_for_norm, clip_grad) + + + @abstractmethod + def zero_grad(self, set_to_none=True): + pass + + + @abstractmethod + def get_loss_scale(self): + """The output should be a cuda tensor of size 1.""" + pass + + + def scale_loss(self, loss): + """Simple scaling.""" + return self.get_loss_scale() * loss + + + @abstractmethod + def reload_model_params(self): + """Refreshes any internal state from the current model parameters. + Call whenever the parameters are changed outside of the optimizer. + For example, when we load a model from a checkpoint without loading + the optimizer, the model parameters are updated but for fp16 optimizer + with main parameters, the main parameters need to also be updated.""" + pass + + + @abstractmethod + def state_dict(self): + pass + + + @abstractmethod + def load_state_dict(self, state_dict): + pass + + + # Promote state so it can be retrieved or set via + # "optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + + # Promote param_groups so it can be retrieved or set via + # "optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + + + @abstractmethod + def step(self, args, timers): + pass + + + def gather_model_params(self, args, timers): + """ + For the case of a non-distributed-optimizer, there is nothing to + do here. + """ + pass + + + def allreduce_word_embedding_grads(self, args): + """ + All-reduce word embedding grads. + + Reduce grads across first and last stages to ensure that word_embeddings + parameters stay in sync. This should only run for models that support + pipelined model parallelism (BERT and GPT-2). + """ + pass + + + def allreduce_position_embedding_grads(self, args): + """ + All-reduce position_embeddings grad across first (encoder) and + split (decoder) stages to ensure that position embeddings parameters + stay in sync. This should only run for T5 models with pipeline + parallelism. 
+ """ + pass + + + def allreduce_embedding_grads(self, args): + """All-reduce both word and position embeddings.""" + self.allreduce_word_embedding_grads(args) + self.allreduce_position_embedding_grads(args) + + + def allreduce_layernorm_grads(self, args): + """All-reduce layernorm grads (for sequence parallelism).""" + + # All-reduce layernorm parameters across model parallel nodes + # when sequence parallelism is used + pass + + + def reduce_model_grads(self, args, timers): + """All-reduce all grads, and all-reduce embeddings.""" + + self.allreduce_layernorm_grads(args) + + # All-reduce embedding grads. + self.allreduce_embedding_grads(args) + + +class MixedPrecisionOptimizer(MegatronOptimizer): + """Base class for both the float-16 and the distributed optimizer. + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + params_dtype: used by distributed optimizer. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. + """ + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, + models): + + super().__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + models) + + self.fp16 = fp16 + self.bf16 = bf16 + self.params_dtype = params_dtype + self.grad_scaler = grad_scaler + + # None grad scaler is only supported for bf16. + if self.grad_scaler is None: + assert not self.fp16, 'fp16 expects a grad scaler.' + + # Tensor used to determine if a nan/if has happend. + # Any non-zero value indicates inf/nan. + # Note that we keep this for the cases that grad scaler is none. + # We still record nan/inf if we have a bfloat16 with a grad scaler. + if self.grad_scaler: + self.found_inf = torch.cuda.FloatTensor([0.0]) + + # Dummy tensor needed for apex multi-apply tensor. + # For bfloat, we don't have multi-tensor apply and for now + # we set it to none so the multi-tensor apply gets ignored. + if bf16: + self._dummy_overflow_buf = None + else: + self._dummy_overflow_buf = torch.cuda.IntTensor([0]) + + # In case grad scaler is not passed, define the unity scale. 
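# When no grad scaler is supplied (the bf16-without-loss-scaling case), get_loss_scale()
# returns this constant 1.0 tensor, so scale_loss(loss) is a no-op multiplication and
# step() skips the unscale / inf-check path entirely.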
+ if self.grad_scaler is None: + self._scale_one = torch.cuda.FloatTensor([1.0]) + + + def get_loss_scale(self): + if self.grad_scaler is None: + return self._scale_one + return self.grad_scaler.scale + + + def reload_model_params(self): + self._copy_model_params_to_main_params() + + + def _unscale_main_grads_and_check_for_nan(self): + + # Collect main grads. + main_grads = self._collect_main_grad_data_for_unscaling() + + # Reset found inf. + self.found_inf.fill_(0.0) + + # Unscale and set found inf/nan + torch._amp_foreach_non_finite_check_and_unscale_( + main_grads, self.found_inf, self.grad_scaler.inv_scale) + + # Check for nan. + found_inf_flag = (self.found_inf.item() > 0) + + return found_inf_flag + + + @torch.no_grad() + def step(self, args, timers): + + # Copy gradients from model params to main params. + self._copy_model_grads_to_main_grads() + + # Do unscale, check for inf, and update grad scaler only for + # the case that grad scaler is provided. + if self.grad_scaler: + + # Unscale and check for inf/nan. + found_inf_flag = self._unscale_main_grads_and_check_for_nan() + + # We are done with scaling gradients + # so we can update the loss scale. + self.grad_scaler.update(found_inf_flag) + + # If we found inf/nan, skip the update. + if found_inf_flag: + return False, None, None + + # Clip the main gradients. + grad_norm = None + if self.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.clip_grad) + + # Count the zeros in the grads. + num_zeros_in_grad = None + + # Step the optimizer. + self.optimizer.step() + + # Update params from main params. + self._copy_main_params_to_model_params() + + # Successful update. + return True, grad_norm, num_zeros_in_grad + + +class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): + """Float16 optimizer for fp16 and bf16 data types. + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. 
+ """ + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models): + + super().__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models) + + # ====================== + # main parameter stuff + # ====================== + + # Three groups of parameters: + # float16_groups: original float16 parameters + # fp32_from_float16_groups: fp32 copy of float16 parameters + # fp32_from_fp32_groups: original fp32 parameters + self.float16_groups = [] + self.fp32_from_float16_groups = [] + self.fp32_from_fp32_groups = [] + + # For all the groups in the original optimizer: + for param_group in self.optimizer.param_groups: + float16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_float16_params_this_group = [] + # For all the parameters in this group: + for i, param in enumerate(param_group['params']): + if param.requires_grad: + + # float16 params: + if param.type() in ['torch.cuda.HalfTensor', + 'torch.cuda.BFloat16Tensor', + 'torch.xpu.HalfTensor', + 'torch.xpu.BFloat16Tensor']: + float16_params_this_group.append(param) + # Create a copy + main_param = param.detach().clone().float() + + if hasattr(param, 'shared'): + main_param.shared = param.shared + # Replace the optimizer params with the new fp32 copy. + param_group['params'][i] = main_param + + fp32_from_float16_params_this_group.append(main_param) + # Reset existing state dict key to the new main param. + if param in self.optimizer.state: + self.optimizer.state[main_param] \ + = self.optimizer.state.pop(param) + # fp32 params. + elif param.type() in ['torch.cuda.FloatTensor', + 'torch.xpu.FloatTensor']: + fp32_params_this_group.append(param) + param_group['params'][i] = param + + else: + raise TypeError('Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, or ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor, or ' + 'torch.xpu.FloatTensor, or ' + 'torch.xpu.HalfTensor, or ' + 'torch.xpu.BFloat16Tensor. ' + 'Received {}'.format(param.type())) + + self.float16_groups.append(float16_params_this_group) + self.fp32_from_float16_groups.append( + fp32_from_float16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + + def zero_grad(self, set_to_none=True): + """We only need to zero the model related parameters, i.e., + float16_groups & fp32_from_fp32_groups. We additionally zero + fp32_from_float16_groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point.""" + for group in self.float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_fp32_groups: + _zero_grad_group_helper(group, set_to_none) + + + def _collect_main_grad_data_for_unscaling(self): + + main_grads = [] + + # fp32 params from float16 ones. + for main_group in self.fp32_from_float16_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + # Append fp32 parameters. 
+ for main_group in self.fp32_from_fp32_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + return main_grads + + + def _get_model_and_main_params_data_float16(self): + model_data = [] + main_data = [] + for model_group, main_group in zip(self.float16_groups, + self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + + def _copy_model_grads_to_main_grads(self): + # This only needs to be done for the float16 group. + for model_group, main_group in zip(self.float16_groups, + self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + if self.params_have_main_grad and hasattr(model_param, 'main_grad'): + main_param.grad = model_param.main_grad.float() + else: + if model_param.grad is not None: + main_param.grad = model_param.grad.float() + + # Safe to deallocate model's grad/main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + model_param.grad = None + if self.params_have_main_grad and \ + not self.use_contiguous_buffers_in_local_ddp: + model_param.main_grad = None + + # For fp32 grads, we need to reset the grads to main grad. + if self.params_have_main_grad: + for model_group in self.fp32_from_fp32_groups: + for model_param in model_group: + model_param.grad = model_param.main_grad + + # Safe to de-reference model's main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + if not self.use_contiguous_buffers_in_local_ddp: + model_param.main_grad = None + + + def _copy_main_params_to_model_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that(this=main_data, that=model_data, + overflow_buf=self._dummy_overflow_buf) + + + def _copy_model_params_to_main_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that(this=model_data, that=main_data, + overflow_buf=self._dummy_overflow_buf) + + + def state_dict(self): + state_dict = {} + state_dict['optimizer'] = self.optimizer.state_dict() + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups + return state_dict + + + def load_state_dict(self, state_dict): + # Optimizer. + optimizer_key = 'optimizer' + if optimizer_key not in state_dict: + optimizer_key = 'optimizer_state_dict' + + self.optimizer.load_state_dict(state_dict[optimizer_key]) + + # Grad scaler. + if 'grad_scaler' in state_dict: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + + # Copy data for the main params. 
+ fp32_from_float16_params_key = 'fp32_from_fp16_params' + if fp32_from_float16_params_key not in state_dict: + fp32_from_float16_params_key = 'fp32_from_fp16' + for current_group, saved_group in zip( + self.fp32_from_float16_groups, + state_dict[fp32_from_float16_params_key]): + for current_param, saved_param in zip(current_group, saved_group): + current_param.data.copy_(saved_param.data) + + +class FP32Optimizer(MegatronOptimizer): + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, + use_contiguous_buffers_in_local_ddp, + models): + + super(FP32Optimizer, self).__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + models) + + self._scale = torch.cuda.FloatTensor([1.0]) + + + def zero_grad(self, set_to_none=True): + """Copied from torch.optim.optimizer""" + for group in self.optimizer.param_groups: + _zero_grad_group_helper(group['params'], set_to_none) + + + def get_loss_scale(self): + """FP32 optimizer does not do any scaling.""" + return self._scale + + + @torch.no_grad() + def step(self): + """Clip gradients (if needed) and step the base optimizer. + Always return successful since there is no overflow.""" + + # Copy main_grads to grads. + if self.params_have_main_grad: + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + param.grad = param.main_grad + + # Safe to de-reference model's main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + if not self.use_contiguous_buffers_in_local_ddp: + param.main_grad = None + + # Clip gradients. + grad_norm = None + if self.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.clip_grad) + + # count the zeros in the grads + num_zeros_in_grad = None + # Update parameters. + self.optimizer.step() + + # No overflow for FP32 optimizer. + return True, grad_norm, num_zeros_in_grad + + + def reload_model_params(self): + pass + + + def state_dict(self): + return self.optimizer.state_dict() + + + def load_state_dict(self, state_dict): + self.optimizer.load_state_dict(state_dict) diff --git a/training/benchmarks/gpt2/pytorch/run_pretraining.py b/training/benchmarks/gpt2/pytorch/run_pretraining.py new file mode 100644 index 000000000..e4c3d0738 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/run_pretraining.py @@ -0,0 +1,144 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") + +"""GPT2 Pretraining""" + +import argparse +import os +import random +import sys +import time +from functools import partial + +import numpy as np +import torch + +from train.trainer import Trainer +from train import trainer_adapter +from train.evaluator import Evaluator +from train.training_state import TrainingState +from dataloaders.gpt_dataset import build_train_test_datasets, build_train_test_data_dataloaders + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, Event, dist_pytorch, check + +logger = None + + +def main(): + import config + global logger + + if config.use_env and 'LOCAL_RANK' in os.environ: + config.local_rank = int(os.environ['LOCAL_RANK']) + + gpt2_driver = Driver(config, config.mutable_params) + gpt2_driver.setup_config(argparse.ArgumentParser("GPT2")) + gpt2_driver.setup_modules(globals(), locals()) + + logger = gpt2_driver.logger + dist_pytorch.init_dist_training_env(config) + + check.check_config(config) + + dist_pytorch.barrier(config.vendor) + gpt2_driver.event(Event.INIT_START) + init_start_time = logger.previous_log_time + + random.seed(config.seed) + np.random.seed(config.seed) + torch.manual_seed(config.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(config.seed) + + config.global_batch_size = config.train_batch_size * config.n_device * config.gradient_accumulation_steps + + train_data_path = os.path.join(config.data_dir, config.train_data_prefix) + test_data_path = os.path.join(config.data_dir, config.test_data_prefix) + build_train_test_dataset_fn = partial( + build_train_test_datasets, + seq_length=config.seq_length, + seed=config.seed, + skip_warmup=(not config.mmap_warmup), + train_data_prefix=train_data_path, + test_data_prefix=test_data_path, + ) + train_dataloader, eval_dataloader= build_train_test_data_dataloaders(build_train_test_dataset_fn) + + evaluator = Evaluator(config, eval_dataloader) + training_state = TrainingState() + trainer = Trainer(driver=gpt2_driver, + adapter=trainer_adapter, + evaluator=evaluator, + training_state=training_state, + device=config.device, + config=config) + + training_state._trainer = trainer + + dist_pytorch.barrier(config.vendor) + trainer.init() + + dist_pytorch.barrier(config.vendor) + init_evaluation_start = time.time() + training_state.eval_lambada_acc = evaluator.evaluate( + trainer) + init_evaluation_end = time.time() + init_evaluation_info = dict( + eval_lambada_acc=training_state.eval_lambada_acc, + time=init_evaluation_end - init_evaluation_start) + gpt2_driver.event(Event.INIT_EVALUATION, init_evaluation_info) + + if not config.do_train: + return config, training_state + + gpt2_driver.event(Event.INIT_END) + init_end_time = logger.previous_log_time + training_state.init_time = (init_end_time - init_start_time) / 1e+3 + + dist_pytorch.barrier(config.vendor) + epoch = -1 + gpt2_driver.event(Event.TRAIN_START) + raw_train_start_time = logger.previous_log_time + while training_state.global_steps < config.max_steps and not training_state.end_training: + epoch += 1 + training_state.epoch = epoch + trainer.train_one_epoch(train_dataloader) + gpt2_driver.event(Event.TRAIN_END) + raw_train_end_time = logger.previous_log_time + training_state.raw_train_time = (raw_train_end_time - + raw_train_start_time) / 1e+3 + return config, training_state + + +if __name__ == "__main__": + now = time.time() + config_updated, state = main() + + 
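# Only the main process reports results below. The throughput fields are samples per
# second over three windows, e.g. (hypothetical numbers) 20000 trained samples in 400 s
# of raw train time gives "throughput(sps)_raw" = 50.0, with the no-eval and
# pure-compute variants dividing by successively smaller time windows.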
if not dist_pytorch.is_main_process(): + exit() + + e2e_time = time.time() - now + trained_samples = state.num_trained_samples + if config_updated.do_train: + finished_info = { + "e2e_time": e2e_time, + "train_samples": trained_samples, + "train_time": state.raw_train_time, + "train_no_eval_time": state.no_eval_time, + "pure_training_computing_time": state.pure_compute_time, + "throughput(sps)_raw": + trained_samples / state.raw_train_time, + "throughput(sps)_no_eval": + trained_samples / state.no_eval_time, + "throughput(sps)_pure_compute": + trained_samples / state.pure_compute_time, + "converged": state.converged, + "final_accuracy": state.eval_lambada_acc, + } + else: + finished_info = {"e2e_time": e2e_time} + logger.log(Event.FINISHED, message=finished_info, stacklevel=0) + diff --git a/training/benchmarks/gpt2/pytorch/schedulers/__init__.py b/training/benchmarks/gpt2/pytorch/schedulers/__init__.py new file mode 100755 index 000000000..3e685cf5a --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/schedulers/__init__.py @@ -0,0 +1 @@ +from .factory import create_scheduler diff --git a/training/benchmarks/gpt2/pytorch/schedulers/factory.py b/training/benchmarks/gpt2/pytorch/schedulers/factory.py new file mode 100755 index 000000000..9cf37c048 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/schedulers/factory.py @@ -0,0 +1,34 @@ +from schedulers.optimizer_param_scheduler import OptimizerParamScheduler +import config + +def create_scheduler(optimizer): + """Build the learning rate scheduler.""" + + # Iteration-based training. + if config.max_steps: + config.lr_decay_iters = config.max_steps + lr_decay_steps = config.lr_decay_iters * config.global_batch_size + wd_incr_steps = config.max_steps* config.global_batch_size + if config.lr_warmup_fraction is not None: + lr_warmup_steps = config.lr_warmup_fraction * lr_decay_steps + else: + lr_warmup_steps = config.lr_warmup_iters * config.global_batch_size + else: + raise Exception( + 'either train-iters or train-samples should be provided.') + + opt_param_scheduler = OptimizerParamScheduler( + optimizer, + max_lr=config.lr, + min_lr=config.min_lr, + lr_warmup_steps=lr_warmup_steps, + lr_decay_steps=lr_decay_steps, + lr_decay_style=config.lr_decay_style, + start_wd=config.start_weight_decay, + end_wd=config.end_weight_decay, + wd_incr_steps=wd_incr_steps, + wd_incr_style=config.weight_decay_incr_style, + ) + + return opt_param_scheduler + diff --git a/training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py b/training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py new file mode 100644 index 000000000..aaa0a37f3 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py @@ -0,0 +1,214 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Learning rate decay and weight decay incr functions.""" + +import math + +class OptimizerParamScheduler(object): + """Anneals learning rate and weight decay""" + + def __init__(self, optimizer, max_lr, min_lr, + lr_warmup_steps, lr_decay_steps, lr_decay_style, + start_wd, end_wd, wd_incr_steps, wd_incr_style, + use_checkpoint_opt_param_scheduler=False, + override_opt_param_scheduler=False): + + # Class values. 
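+        # Note: the warmup/decay horizons and wd_incr_steps are expressed in consumed
+        # samples rather than optimizer steps (see factory.py, which multiplies the
+        # iteration counts by global_batch_size); max_lr/min_lr bound the LR schedule
+        # and start_wd/end_wd bound the weight-decay ramp.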
+ self.optimizer = optimizer + + self.max_lr = float(max_lr) + self.min_lr = min_lr + assert self.min_lr >= 0.0 + assert self.max_lr >= self.min_lr + + self.lr_warmup_steps = lr_warmup_steps + self.num_steps = 0 + self.lr_decay_steps = lr_decay_steps + assert self.lr_decay_steps > 0 + assert self.lr_warmup_steps < self.lr_decay_steps + + self.lr_decay_style = lr_decay_style + + self.start_wd = start_wd + self.end_wd = end_wd + assert self.start_wd >= 0.0 + assert self.end_wd >= self.start_wd + self.wd_incr_steps = wd_incr_steps + self.wd_incr_style = wd_incr_style + + self.override_opt_param_scheduler = override_opt_param_scheduler + self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler + if self.override_opt_param_scheduler: + assert not self.use_checkpoint_opt_param_scheduler, 'both override and '\ + 'use-checkpoint are set.' + + # Set the learning rate + self.step(0) + + + def get_wd(self): + """ Weight decay incr functions""" + if self.num_steps > self.wd_incr_steps: + return self.end_wd + + if self.wd_incr_style == 'constant': + assert self.start_wd == self.end_wd + return self.end_wd + + incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) + assert incr_ratio >= 0.0 + assert incr_ratio <= 1.0 + delta_wd = self.end_wd - self.start_wd + + if self.wd_incr_style == 'linear': + coeff = incr_ratio + elif self.wd_incr_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * (1 - incr_ratio)) + 1.0) + else: + raise Exception('{} weight decay increment style is not supported.'.format( + self.wd_incr_style)) + + return self.start_wd + coeff * delta_wd + + + def get_lr(self): + """Learning rate decay functions from: + https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + # Use linear warmup for the initial part. + if self.lr_warmup_steps > 0 and self.num_steps <= self.lr_warmup_steps: + return self.max_lr * float(self.num_steps) / \ + float(self.lr_warmup_steps) + + # If the learning rate is constant, just return the initial value. + if self.lr_decay_style == 'constant': + return self.max_lr + + # For any steps larger than `self.lr_decay_steps`, use `self.min_lr`. + if self.num_steps > self.lr_decay_steps: + return self.min_lr + + # If we are done with the warmup period, use the decay style. 
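+        # 'inverse-square-root' decays as max_lr * sqrt(warmup_steps / num_steps),
+        # clamped below by min_lr; 'linear' and 'cosine' (handled further down)
+        # interpolate between max_lr and min_lr over the remaining post-warmup steps.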
+ if self.lr_decay_style == 'inverse-square-root': + warmup_steps = max(self.lr_warmup_steps, 1) + num_steps = max(self.num_steps, 1) + lr = self.max_lr * warmup_steps ** 0.5 / (num_steps ** 0.5) + return max(self.min_lr, lr) + + num_steps_ = self.num_steps - self.lr_warmup_steps + decay_steps_ = self.lr_decay_steps - self.lr_warmup_steps + decay_ratio = float(num_steps_) / float(decay_steps_) + assert decay_ratio >= 0.0 + assert decay_ratio <= 1.0 + delta_lr = self.max_lr - self.min_lr + + if self.lr_decay_style == 'linear': + coeff = (1.0 - decay_ratio) + elif self.lr_decay_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + else: + raise Exception('{} decay style is not supported.'.format( + self.lr_decay_style)) + + return self.min_lr + coeff * delta_lr + + + def step(self, increment): + """Set lr for all parameters groups.""" + self.num_steps += increment + new_lr = self.get_lr() + new_wd = self.get_wd() + for group in self.optimizer.param_groups: + group['lr'] = new_lr * group.get('lr_mult', 1.0) + group['weight_decay'] = new_wd * group.get('wd_mult', 1.0) + + + def state_dict(self): + state_dict = { + 'max_lr': self.max_lr, + 'lr_warmup_steps': self.lr_warmup_steps, + 'num_steps': self.num_steps, + 'lr_decay_style': self.lr_decay_style, + 'lr_decay_steps': self.lr_decay_steps, + 'min_lr': self.min_lr, + 'start_wd': self.start_wd, + 'end_wd': self.end_wd, + 'wd_incr_style': self.wd_incr_style, + 'wd_incr_steps': self.wd_incr_steps + } + return state_dict + + + def _check_and_set(self, cls_value, sd_value, name): + """Auxiliary function for checking the values in the checkpoint and + setting them.""" + if self.override_opt_param_scheduler: + return cls_value + + if not self.use_checkpoint_opt_param_scheduler: + assert cls_value == sd_value, \ + f'OptimizerParamScheduler: class input value {cls_value} and checkpoint' \ + f'value {sd_value} for {name} do not match' + return sd_value + + + def load_state_dict(self, sd): + + if 'start_lr' in sd: + max_lr_ = sd['start_lr'] + else: + max_lr_ = sd['max_lr'] + self.max_lr = self._check_and_set(self.max_lr, max_lr_, + 'learning rate') + + self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], + 'minimum learning rate') + + if 'warmup_iter' in sd: + lr_warmup_steps_ = sd['warmup_iter'] + elif 'warmup_steps' in sd: + lr_warmup_steps_ = sd['warmup_steps'] + else: + lr_warmup_steps_ = sd['lr_warmup_steps'] + self.lr_warmup_steps = self._check_and_set(self.lr_warmup_steps, + lr_warmup_steps_, + 'warmup iterations') + + if 'end_iter' in sd: + lr_decay_steps_ = sd['end_iter'] + elif 'decay_steps' in sd: + lr_decay_steps_ = sd['decay_steps'] + else: + lr_decay_steps_ = sd['lr_decay_steps'] + self.lr_decay_steps = self._check_and_set(self.lr_decay_steps, lr_decay_steps_, + 'total number of iterations') + + if 'decay_style' in sd: + lr_decay_style_ = sd['decay_style'] + else: + lr_decay_style_ = sd['lr_decay_style'] + self.lr_decay_style = self._check_and_set(self.lr_decay_style, + lr_decay_style_, + 'learning rate decay style') + + if 'num_iters' in sd: + num_steps = sd['num_iters'] + else: + num_steps = sd['num_steps'] + self.step(increment=num_steps) + + + if 'start_wd' in sd: + self.start_wd = self._check_and_set(self.start_wd, + sd['start_wd'], + "start weight decay") + self.end_wd = self._check_and_set(self.end_wd, + sd['end_wd'], + "end weight decay") + self.wd_incr_steps = self._check_and_set(self.wd_incr_steps, + sd['wd_incr_steps'], + "total number of weight decay iterations") + self.wd_incr_style = 
self._check_and_set(self.wd_incr_style, + sd['wd_incr_style'], + "weight decay incr style") + diff --git a/training/benchmarks/gpt2/pytorch/train/__init__.py b/training/benchmarks/gpt2/pytorch/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/gpt2/pytorch/train/evaluator.py b/training/benchmarks/gpt2/pytorch/train/evaluator.py new file mode 100644 index 000000000..23d4445ff --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/evaluator.py @@ -0,0 +1,46 @@ +import os +import sys +import torch + +from train.utils import process_batch_eval + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) +from driver import dist_pytorch + +class Evaluator: + + def __init__(self, config, dataloader): + self.config = config + self.eval_dataloader = dataloader + + def evaluate(self, trainer): + model = trainer.model + model.eval() + + total_output = 0.0 + num_examples = len(self.eval_dataloader.dataset) + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(self.eval_dataloader): + + # Get the batch. + tokens, labels, attention_mask, position_ids, loss_mask = process_batch_eval( + batch) + # Forward pass through the model. + output = model(tokens, position_ids, attention_mask) + # For accuracy, return the number of correctly predicted samples. + outputs = torch.argmax(output, -1) + correct = (outputs == labels).float() + correct[(1 - loss_mask).bool()] = 1 + correct = correct.prod(-1) + output = correct.sum() + + # Reduce across processes. + if dist_pytorch.is_dist_avail_and_initialized(): + torch.distributed.all_reduce(output) + + total_output += output + acc = total_output / num_examples + model.eval() + return acc.item() diff --git a/training/benchmarks/gpt2/pytorch/train/trainer.py b/training/benchmarks/gpt2/pytorch/train/trainer.py new file mode 100644 index 000000000..0eade57d5 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/trainer.py @@ -0,0 +1,190 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") + +import math +import time +import os +import sys + +import torch +from torch.types import Device + +from model import create_model +from schedulers import create_scheduler +from train.evaluator import Evaluator +from train.training_state import TrainingState +from model.losses.cross_entropy import cross_entropy +from train.utils import get_batch + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) +from driver import Driver, Event, dist_pytorch + +def _transpose_first_dim(t, num_splits, num_splits_first, model): + input_shape = t.size() + # We use a self_attention module but the values extracted aren't + # specific to self attention so should work for cross attention as well + while hasattr(model, 'module'): + model = model.module + attention_module = model.language_model.encoder.layers[0].self_attention + hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head + num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition + if num_splits_first: + """[num_splits * np * hn, h] + -->(view) [num_splits, np, hn, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_splits, num_attention_heads_per_partition, + hidden_size_per_attention_head) + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(0, 1).contiguous() + else: + """[np * hn * num_splits, h] + -->(view) [np, hn, num_splits, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_attention_heads_per_partition, + hidden_size_per_attention_head, num_splits) +\ + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(1, 2).contiguous() + t = t.view(*input_shape) + + return t + + +class Trainer(): + + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, config): + super(Trainer, self).__init__() + self.config = config + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.grad_scaler = None + + self.device = device + self.optimizer = None + self.bert_config = None + self.model = None + self.evaluator = evaluator + self.lr_scheduler = None + self.global_batch_size = None + self.overflow_buf = None + + def init(self): + self.model_config, self.model = create_model(self.config) + self.model = self._init_model(self.model, self.device) + self.model = self.adapter.convert_model(self.config, self.model) + self.model = self.model.to(self.config.device) + + self.optimizer = self.adapter.create_optimizer(self.config, self.model) + self.model, self.optimizer = self.adapter.model_to_fp16( + self.config, self.model, self.optimizer) + self.model = self.adapter.model_to_ddp(self.config, self.model) + self.lr_scheduler = create_scheduler(self.optimizer) + + def _init_model(self, model, device): + model = model.to(device) + return model + + def train_one_epoch(self, dataloader): + state = self.training_state + driver = self.driver + driver.event(Event.EPOCH_BEGIN, state.epoch) + + no_eval_start = time.time() + for _, data in enumerate(dataloader): + data['text'] = data['text'].to(self.device) + tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data) + + pure_compute_start = time.time() + state.global_steps += 1 + state.num_trained_samples = state.global_steps * 
dist_pytorch.global_batch_size( + self.config) + + driver.event(Event.STEP_BEGIN, step=state.global_steps) + self.train_one_step(tokens, position_ids, attention_mask, labels, loss_mask) + + train_end = time.time() + state.pure_compute_time += train_end - pure_compute_start + state.no_eval_time += train_end - no_eval_start + + other_state = dict() + if state.global_steps % self.config.gradient_accumulation_steps == 0: + sequences_per_second = state.num_trained_samples / state.no_eval_time + other_state["seq/s"] = sequences_per_second + + eval_result = None + if self.can_do_eval(state): + eval_start = time.time() + state.eval_lambada_acc = self.evaluator.evaluate( + self) + eval_end = time.time() + eval_result = dict( + global_steps=state.global_steps, + eval_lambada_acc=state.eval_lambada_acc, + time=eval_end - eval_start) + + end_training = self.detect_training_status(state) + + step_info = state.to_dict(**other_state) + driver.event(Event.STEP_END, + message=step_info, + step=state.global_steps, + loss=state.loss) + + if eval_result is not None: + driver.event(Event.EVALUATE, eval_result) + + if end_training: + break + no_eval_start = time.time() + + driver.event(Event.EPOCH_END, state.epoch) + + def train_one_step(self, tokens, position_ids, attention_mask, labels, loss_mask): + + state = self.training_state + self.model.train() + + losses = self.model(tokens, position_ids, attention_mask, labels=labels) + #loss 为标量 + loss = torch.sum(losses.view(-1) * loss_mask.view(-1)) / loss_mask.view(-1).sum() + state.loss = loss + self.adapter.backward(state.global_steps, loss, + self.optimizer, self.lr_scheduler) + self.driver.event(Event.BACKWARD, state.global_steps, state.loss, + self.optimizer) + + + def detect_training_status(self, state: TrainingState): + if state.eval_lambada_acc >= self.config.target_acc: + state.converged_success() + + if state.global_steps >= self.config.max_steps: + state.end_training = True + + return state.end_training + + def can_do_eval(self, state: TrainingState): + do_eval = all([ + self.config.test_data_prefix is not None, + state.num_trained_samples >= self.config.eval_iter_start_samples, + self.config.eval_interval_samples > 0, + state.global_steps > 1, + state.global_steps % + math.ceil(self.config.eval_interval_samples / + dist_pytorch.global_batch_size(self.config)) == 0, + ]) + + return do_eval or state.global_steps >= self.config.max_steps + diff --git a/training/benchmarks/gpt2/pytorch/train/trainer_adapter.py b/training/benchmarks/gpt2/pytorch/train/trainer_adapter.py new file mode 100644 index 000000000..d9462ca05 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/trainer_adapter.py @@ -0,0 +1,57 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") + +from typing import Tuple + +import torch +from torch.optim import Optimizer +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +import config +from optimizer import get_megatron_optimizer + +GPT_MODEL = torch.nn.Module + +def convert_model(config, model: GPT_MODEL) -> GPT_MODEL: + return model + +def create_optimizer(config, model: GPT_MODEL) -> Optimizer: + return get_megatron_optimizer(model) + + +def model_to_fp16(config, model: GPT_MODEL, + optimizer: Optimizer) -> Tuple[GPT_MODEL, Optimizer]: + if config.fp16: + model.half() + return model, optimizer + + +def model_to_ddp(config, model: GPT_MODEL) -> GPT_MODEL: + use_ddp = dist.is_initialized() + + if use_ddp: + if config.DDP_impl == 'native': + model = NativeDDP( + model, + device_ids=[config.local_rank]) + else: + assert False, "Invalid DDP type" + return model + + +def backward(step: int, + loss: torch.Tensor, + optimizer: Optimizer, + lr_scheduler): + if config.fp16: + optimizer.backward(loss) + else: + loss.backward() + + if step % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + increment = config.train_batch_size * config.n_device * config.gradient_accumulation_steps + lr_scheduler.step(increment) diff --git a/training/benchmarks/gpt2/pytorch/train/training_state.py b/training/benchmarks/gpt2/pytorch/train/training_state.py new file mode 100644 index 000000000..3ca60b2b9 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/training_state.py @@ -0,0 +1,79 @@ +# Copyright © 2022 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") + +from dataclasses import dataclass + +import torch +import inspect + +@dataclass +class TrainingState: + _trainer = None + _status = 'aborted' # later set to 'success' if termination criteria met + + global_steps = 0 + skipped_steps = 0 + iter_dataloader_idx = 0 + + loss: float = 0.0 + lambada_acc: float = 0.0 + + epoch: int = 1 + num_trained_samples = 0 + end_training: bool = False + converged: bool = False + + eval_lambada_acc = 0 + + init_time = 0 + raw_train_time = 0 + + no_eval_time = 0.0 + pure_compute_time = 0.0 + + def status(self): + if self.converged: + self._status = "success" + return self._status + + def converged_success(self): + self.end_training = True + self.converged = True + + def _is_property(self, value): + status = [ + not callable(value), not inspect.isclass(value), + not inspect.ismodule(value), not inspect.ismethod(value), + not inspect.isfunction(value), not inspect.isbuiltin(value), + "classmethod object" not in str(value) + ] + return all(status) + + def to_dict(self, **kwargs): + state_dict = dict() + + for var_name, value in self.__dict__.items(): + if not var_name.startswith("_") and self._is_property(value): + state_dict[var_name] = value + + lr = self._trainer.lr_scheduler.get_lr() + if isinstance(lr, (tuple, list)): + lr = lr[0] + state_dict["learning_rate"] = lr + + exclude = [ + "eval_lambada_acc", "skipped_steps", + "converged", "init_time", "raw_train_time" + ] + for exkey in exclude: + if exkey in state_dict: + state_dict.pop(exkey) + + state_dict.update(kwargs) + + for k in state_dict.keys(): + if torch.is_tensor(state_dict[k]): + state_dict[k] = state_dict[k].item() + + return state_dict diff --git a/training/benchmarks/gpt2/pytorch/train/utils.py b/training/benchmarks/gpt2/pytorch/train/utils.py new file mode 100644 index 000000000..25391a403 --- 
/dev/null +++ b/training/benchmarks/gpt2/pytorch/train/utils.py @@ -0,0 +1,101 @@ +import torch + +import config +from dataloaders import get_tokenizer + +def get_ltor_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + train_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = train_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril(torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, + device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(train_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + + return attention_mask, loss_mask, position_ids + +def get_batch(data): + """Generate a batch""" + tokenizer = get_tokenizer() + + # Unpack. + tokens_ = data['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + config.reset_position_ids, + config.reset_attention_mask, + config.eod_mask_loss) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def process_batch_eval(batch): + """Process batch and produce inputs for the model.""" + tokenizer = get_tokenizer() + + loss_mask = batch['pad_mask'].long().cuda().contiguous().byte() + tokens_ = batch['text'].long().cuda().contiguous() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
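+    # The loss mask returned by the helper is discarded here; evaluation uses the
+    # 'pad_mask'-based loss_mask computed above instead.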
+ attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + config.reset_position_ids, + config.reset_attention_mask, + config.eod_mask_loss) + + return tokens, labels, attention_mask, position_ids, loss_mask diff --git a/training/nvidia/gpt2-pytorch/README.md b/training/nvidia/gpt2-pytorch/README.md new file mode 100644 index 000000000..2b981c65b --- /dev/null +++ b/training/nvidia/gpt2-pytorch/README.md @@ -0,0 +1,42 @@ +### 测试数据集下载 +[测试数据集下载](../../benchmarks/gpt2/README.md#测试数据集下载) + +### Nvidia GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-1.12.0a0+bd13bc6 + - 依赖软件版本: + - cuda: 11.6 + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 自然语言编码 | | +| 模型 | megatron-gpt2-345m | | +| 数据集 | lambada | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练序列数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=1024)| +| 训练结果 | lambada_acc,见“性能指标” | lambada任务准确率 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | lambada_acc | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ------- | --------- | +| A100单机单卡(1x1) | fp32 | bs=32,lr=0.00015 | 6362.3 | 9.6 | 14.2 | 14.2 | | 30.2/40.0 | +| A100单机8卡(1x8) | fp32 | bs=32,lr=0.00015 | 10352.7 | 68.6 | 97.5 | 97.6 | 0.60 | 31.3/40.0 | diff --git a/training/nvidia/gpt2-pytorch/config/config_A100x1x1.py b/training/nvidia/gpt2-pytorch/config/config_A100x1x1.py new file mode 100755 index 000000000..1d910c1db --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_A100x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +gradient_accumulation_steps = 8 +max_steps = 15000 \ No newline at end of file diff --git a/training/nvidia/gpt2-pytorch/config/config_A100x1x8.py b/training/nvidia/gpt2-pytorch/config/config_A100x1x8.py new file mode 100755 index 000000000..27e5ca7f5 --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_A100x1x8.py @@ -0,0 +1 @@ +from config_common import * diff --git a/training/nvidia/gpt2-pytorch/config/config_A100x2x8.py b/training/nvidia/gpt2-pytorch/config/config_A100x2x8.py new file mode 100755 index 000000000..27e5ca7f5 --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_A100x2x8.py @@ -0,0 +1 @@ +from config_common import * diff --git a/training/nvidia/gpt2-pytorch/config/config_common.py b/training/nvidia/gpt2-pytorch/config/config_common.py new file mode 100755 index 000000000..47dcfb35d --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_common.py @@ -0,0 +1,6 @@ +vendor = 'nvidia' + +# disable fp16 +fp16 = False + +dist_backend = "nccl" diff --git a/training/nvidia/gpt2-pytorch/extern/.gitkeep b/training/nvidia/gpt2-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 81bf848d5..9d37dd8bc 100644 --- a/training/run_benchmarks/config/test_conf.py +++ 
b/training/run_benchmarks/config/test_conf.py @@ -75,4 +75,5 @@ # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "gpt2:pytorch_1.12:A100:1:8:1": "/raid/dataset/gpt2", } From 14bbdc0cc395bc3b9f21d7a2c3511752aa30e7d1 Mon Sep 17 00:00:00 2001 From: Jianbang Yang Date: Mon, 18 Sep 2023 14:35:53 +0800 Subject: [PATCH 08/18] Add T5-Small training model (#201) * add t5 small * t5_small use huggingface accelerate * fix coding style for t5_small model * update t5_small bs config * add MFU information in t5-small nvidia README * fix t5_small doc typo --- README.md | 2 +- training/benchmarks/t5_small/README.md | 68 +++++++++ .../t5_small/pytorch/config/__init__.py | 2 + .../t5_small/pytorch/config/_base.py | 46 ++++++ .../t5_small/pytorch/config/mutable_params.py | 5 + .../create_train_eval_data.py | 89 +++++++++++ .../t5_small/pytorch/dataloaders/__init__.py | 1 + .../pytorch/dataloaders/dataloader.py | 83 +++++++++++ .../t5_small/pytorch/model/__init__.py | 19 +++ .../t5_small/pytorch/optimizers/__init__.py | 27 ++++ .../t5_small/pytorch/run_pretraining.py | 137 +++++++++++++++++ .../t5_small/pytorch/schedulers/__init__.py | 11 ++ .../t5_small/pytorch/train/__init__.py | 0 .../t5_small/pytorch/train/evaluator.py | 84 +++++++++++ .../t5_small/pytorch/train/trainer.py | 141 ++++++++++++++++++ .../t5_small/pytorch/train/trainer_adapter.py | 9 ++ .../t5_small/pytorch/train/training_state.py | 41 +++++ training/nvidia/t5_small-pytorch/README.md | 57 +++++++ .../config/config_A100x1x1.py | 2 + .../config/config_A100x1x8.py | 2 + .../config/config_A100x2x8.py | 2 + .../t5_small-pytorch/config/requirements.txt | 8 + .../nvidia/t5_small-pytorch/extern/.gitkeep | 0 training/run_benchmarks/config/test_conf.py | 1 + 24 files changed, 836 insertions(+), 1 deletion(-) create mode 100644 training/benchmarks/t5_small/README.md create mode 100755 training/benchmarks/t5_small/pytorch/config/__init__.py create mode 100755 training/benchmarks/t5_small/pytorch/config/_base.py create mode 100755 training/benchmarks/t5_small/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py create mode 100644 training/benchmarks/t5_small/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/t5_small/pytorch/model/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/optimizers/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/run_pretraining.py create mode 100644 training/benchmarks/t5_small/pytorch/schedulers/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/train/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/train/evaluator.py create mode 100644 training/benchmarks/t5_small/pytorch/train/trainer.py create mode 100644 training/benchmarks/t5_small/pytorch/train/trainer_adapter.py create mode 100644 training/benchmarks/t5_small/pytorch/train/training_state.py create mode 100644 training/nvidia/t5_small-pytorch/README.md create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x1x8.py create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x2x8.py create mode 100644 training/nvidia/t5_small-pytorch/config/requirements.txt create mode 100644 
training/nvidia/t5_small-pytorch/extern/.gitkeep diff --git a/README.md b/README.md index 804d88c05..6aeab0301 100644 --- a/README.md +++ b/README.md @@ -290,7 +290,7 @@ under review表示对应case的支持已开发完毕,在review中;Incoming T5_small PyTorch - under review + Incoming N/A N/A diff --git a/training/benchmarks/t5_small/README.md b/training/benchmarks/t5_small/README.md new file mode 100644 index 000000000..2711dd125 --- /dev/null +++ b/training/benchmarks/t5_small/README.md @@ -0,0 +1,68 @@ + +## Model Introduction +### What is T5-Small(Text-To-Text Transfer Transformer)? +The developers of the Text-To-Text Transfer Transformer (T5) [write](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html): + +> With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task. + +T5-Small is the checkpoint with 60 million parameters. + +- **Developed by:** Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. See [associated paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) and [GitHub repo](https://github.com/google-research/text-to-text-transfer-transformer#released-model-checkpoints) +- **Model type:** Language model +- **Language(s) (NLP):** English, French, Romanian, German +- **License:** Apache 2.0 +- **Related Models:** [All T5 Checkpoints](https://huggingface.co/models?search=t5) +- Resources for more information: + - [Research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) + - [Google's T5 Blog Post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) + - [GitHub Repo](https://github.com/google-research/text-to-text-transfer-transformer) + - [Hugging Face T5 Docs](https://huggingface.co/docs/transformers/model_doc/t5) + +## Model and Training Scripts source code +Pytorch case: +This repository includes software from https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization_no_trainer.py +licensed under the Apache License 2.0. + +Some of the files in this directory were modified by BAAI in 2023 to support FlagPerf. + +## Dataset and Model Checkpoints + +> Dataset website:https://huggingface.co/datasets/cnn_dailymail and https://github.com/abisee/cnn-dailymail + +> Model checkpoint website: https://huggingface.co/t5-small/tree/main + +We have already preprocessed the dataset and the model checkpoint files(The preprocessing script is `training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py`). +The preprocessed can be downloaded directly from https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/t5_small_train.tar. +No additional preprocessing steps need to be conducted. 
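+
+A minimal sketch of fetching and unpacking the archive (the URL is the one above; the destination directory is only an example and should match the `data_dir` you configure for this case):
+
+```bash
+# Download the preprocessed dataset + checkpoint archive and unpack it.
+wget https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/t5_small_train.tar
+tar -xf t5_small_train.tar -C /path/to/datasets   # hypothetical destination
+```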
+ +After decompressing, the dataset and model checkpoint files are organized as the following: + +``` +t5_small_train +├── dataset # dataset files +│ ├── eval_dataset.npz +│ └── train_dataset.npz +├── metrics # metrics for evaluation +│ └── rouge +│ └── rouge.py +├── model # model checkpoint and config files +│ ├── config.json +│ ├── generation_config.json +│ ├── model.safetensors +│ ├── spiece.model +│ ├── tokenizer.json +│ └── tokenizer_config.json +└── nltk_data # nltk data for evaluation + └── tokenizers + └── punkt +``` + +## Benchmark Task and Target Accuracy +This experiment is to finetune a summarization task on CNN/Daily Mail dataset with t5-small pretrained checkpoints. +After finetuning 3 epoches, the t5-small model is able to achieve a ROUGE-1 score of 41+, which matches the evaluation result on the [paper](https://arxiv.org/abs/1910.10683). + +## AI Frameworks && Accelerators supports + +| | Pytorch | Paddle | TensorFlow2 | +| ---------- | ------- | ------ | ----------- | +| Nvidia GPU | [✅](../../nvidia/t5_small-pytorch/README.md) | N/A | N/A | diff --git a/training/benchmarks/t5_small/pytorch/config/__init__.py b/training/benchmarks/t5_small/pytorch/config/__init__.py new file mode 100755 index 000000000..96e0aae70 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params diff --git a/training/benchmarks/t5_small/pytorch/config/_base.py b/training/benchmarks/t5_small/pytorch/config/_base.py new file mode 100755 index 000000000..f105c6d31 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/config/_base.py @@ -0,0 +1,46 @@ +# DO NOT MODIFY THESE REQUIRED PARAMETERS + +# Required parameters +vendor: str = None +data_dir: str = None +name: str = "t5_small" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +# Optional parameters + +# ========================================================= +# loss scale +# ========================================================= +lr: float = 5e-5 +weight_decay = 0.0 + +# ========================================================= +# train && evaluate +# ========================================================= +train_batch_size: int = 32 +eval_batch_size: int = 32 + +max_epoch: int = 3 +target_rouge1: float = 40.5 + +do_train = True +distributed: bool = True + +# ========================================================= +# utils +# ========================================================= +seed: int = 0 +dist_backend: str = 'nccl' +device: str = None + +# ========================================================= +# for driver +# ========================================================= +local_rank: int = -1 +use_env: bool = True +log_freq: int = 500 +print_freq: int = 500 +n_device: int = 1 +sync_bn: bool = False +gradient_accumulation_steps: int = 1 diff --git a/training/benchmarks/t5_small/pytorch/config/mutable_params.py b/training/benchmarks/t5_small/pytorch/config/mutable_params.py new file mode 100755 index 000000000..87649996a --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/config/mutable_params.py @@ -0,0 +1,5 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'weight_decay', 'train_batch_size', + 'eval_batch_size', 'do_train', 'distributed', 'dist_backend', 'device', + 'cudnn_benchmark', 'cudnn_deterministic' +] diff --git a/training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py b/training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py new file mode 
100644 index 000000000..6bc0dff06 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py @@ -0,0 +1,89 @@ +import os + +import numpy as np +import datasets +from transformers import AutoTokenizer + + +def save_dataset(ds, save_path): + np.savez(save_path, + input_ids=ds['input_ids'], + attention_mask=ds['attention_mask'], + labels=ds['labels']) + + +def main(): + data_prefix = 't5_small_train/dataset' + os.makedirs(data_prefix, exist_ok=True) + train_datapath = os.path.join(data_prefix, 'train_dataset.npz') + eval_datapath = os.path.join(data_prefix, 'eval_dataset.npz') + + tokenizer = AutoTokenizer.from_pretrained('t5-small', + use_fast=True, + revision='main') + + raw_datasets = datasets.load_dataset('cnn_dailymail', '3.0.0') + + def preprocess_function(examples): + # remove pairs where at least one record is None + text_column = 'article' + summary_column = 'highlights' + prefix = 'summarize: ' + max_source_length = 1024 + max_target_length = 128 + ignore_pad_token_for_loss = True + padding = "max_length" + + inputs, targets = [], [] + for i in range(len(examples[text_column])): + if examples[text_column][i] and examples[summary_column][i]: + inputs.append(examples[text_column][i]) + targets.append(examples[summary_column][i]) + + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer(inputs, + max_length=max_source_length, + padding=padding, + truncation=True) + + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, + max_length=max_target_length, + padding=padding, + truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if padding == "max_length" and ignore_pad_token_for_loss: + labels["input_ids"] = [[ + (l if l != tokenizer.pad_token_id else -100) for l in label + ] for label in labels["input_ids"]] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + train_dataset = raw_datasets["train"] + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + num_proc=32, + remove_columns=raw_datasets["train"].column_names, + load_from_cache_file=True, + desc="Running tokenizer on train dataset", + ).with_format('numpy') + save_dataset(train_dataset, train_datapath) + + eval_dataset = raw_datasets["validation"] + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + num_proc=32, + remove_columns=raw_datasets["validation"].column_names, + load_from_cache_file=True, + desc="Running tokenizer on train dataset", + ).with_format('numpy') + save_dataset(eval_dataset, eval_datapath) + + +if __name__ == "__main__": + main() diff --git a/training/benchmarks/t5_small/pytorch/dataloaders/__init__.py b/training/benchmarks/t5_small/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..83fa73435 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/dataloaders/__init__.py @@ -0,0 +1 @@ +from .dataloader import build_train_dataloader, build_eval_dataloader diff --git a/training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py b/training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py new file mode 100644 index 000000000..26f2e99bf --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py @@ -0,0 +1,83 @@ +import os +import numpy as np +import torch +from torch.utils.data import Dataset +from torch.utils.data.dataloader import default_collate + + +class T5Dataset(Dataset): + def 
__init__(self, filepath): + origin_data = np.load(filepath) + self.input_ids = origin_data['input_ids'] + self.attention_mask = origin_data['attention_mask'] + self.labels = origin_data['labels'] + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, idx): + sample = { + 'input_ids': self.input_ids[idx], + 'attention_mask': self.attention_mask[idx], + 'labels': self.labels[idx] + } + return sample + + +def _prepare_decoder_input_ids_from_labels(input_ids): + """ + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/t5/modeling_t5.py#L1800 + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/t5/modeling_t5.py#L851 + """ + decoder_start_token_id = 0 + pad_token_id = 0 + + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def my_collate(batch): + """ + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/data/data_collator.py#L600 + """ + new_batch = default_collate(batch) + new_batch["decoder_input_ids"] = _prepare_decoder_input_ids_from_labels( + new_batch["labels"]) + return new_batch + + +def build_train_dataloader(config): + train_dataset = T5Dataset( + os.path.join(config.data_dir, 'dataset', 'train_dataset.npz')) + + data_loader = torch.utils.data.DataLoader( + train_dataset, + shuffle=True, + batch_size=config.train_batch_size, + collate_fn=my_collate) + return data_loader + + +def build_eval_dataloader(config): + eval_dataset = T5Dataset( + os.path.join(config.data_dir, 'dataset', 'eval_dataset.npz')) + + data_loader = torch.utils.data.DataLoader( + eval_dataset, batch_size=config.eval_batch_size, collate_fn=my_collate) + return data_loader + + +if __name__ == '__main__': + from collections import namedtuple + Config = namedtuple( + 'Config', + ['data_dir', 'distributed', 'train_batch_size', 'eval_batch_size']) + config = Config('t5_small_train/dataset', False, 4, 4) + eval_dataloader = build_eval_dataloader(config) + for i, batch in enumerate(eval_dataloader): + break diff --git a/training/benchmarks/t5_small/pytorch/model/__init__.py b/training/benchmarks/t5_small/pytorch/model/__init__.py new file mode 100644 index 000000000..8f3c84c32 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/model/__init__.py @@ -0,0 +1,19 @@ +import os +from transformers import T5Config, T5ForConditionalGeneration, T5TokenizerFast + + +def create_model(config): + model_path = os.path.join(config.data_dir, 'model') + hfconfig = T5Config.from_pretrained(model_path) + model = T5ForConditionalGeneration.from_pretrained(model_path, + config=hfconfig) + tokenizer = T5TokenizerFast.from_pretrained(model_path) + return model, hfconfig, tokenizer + + +if __name__ == '__main__': + + from collections import namedtuple + Config = namedtuple('Config', ['data_dir']) + config = Config('t5_small_train') + model, tokenizer = create_model(config) diff --git a/training/benchmarks/t5_small/pytorch/optimizers/__init__.py b/training/benchmarks/t5_small/pytorch/optimizers/__init__.py new file mode 100644 index 000000000..b5cdf1e3e --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/optimizers/__init__.py @@ -0,0 +1,27 @@ +import torch + + +def create_optimizer(model, args): + # Optimizer + # Split weights in two groups, one with weight decay and the other not. 
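+    # Parameters whose names contain any entry in no_decay (biases and the common
+    # LayerNorm weight spellings) get weight_decay=0.0; all other parameters use
+    # args.weight_decay.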
+ no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": + args.weight_decay, + }, + { + "params": [ + p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": + 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr) + return optimizer diff --git a/training/benchmarks/t5_small/pytorch/run_pretraining.py b/training/benchmarks/t5_small/pytorch/run_pretraining.py new file mode 100644 index 000000000..a5d3feca0 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/run_pretraining.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +# 标准库 +import os +import sys +import time +from typing import Any, Tuple + +# 三方库 + +# benchmarks目录 append到sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, + "../../"))) # benchmarks目录 +# 本地库 +import config +from driver import Event, dist_pytorch +from driver.helper import InitHelper + +# 导入相关的模块、方法、变量。这里保持名称一致,实现可以不同。 +from train import trainer_adapter +from train.evaluator import Evaluator +from train.trainer import Trainer +from train.training_state import TrainingState +from dataloaders.dataloader import build_train_dataloader, build_eval_dataloader + +logger = None + + +def main() -> Tuple[Any, Any]: + global logger + global config + + # init + init_helper = InitHelper(config) + model_driver = init_helper.init_driver(globals(), locals()) + config = model_driver.config + dist_pytorch.init_dist_training_env(config) + dist_pytorch.barrier(config.vendor) + model_driver.event(Event.INIT_START) + + config.distributed = dist_pytorch.get_world_size() > 1 + # logger + logger = model_driver.logger + + train_dataloader = build_train_dataloader(config) + eval_dataloader = build_eval_dataloader(config) + + seed = config.seed + + init_helper.set_seed(seed, model_driver.config.vendor) + + # 创建TrainingState对象 + training_state = TrainingState() + + # 构建 trainer:依赖 evaluator、TrainingState对象 + evaluator = Evaluator(config) + trainer = Trainer(driver=model_driver, + adapter=trainer_adapter, + evaluator=evaluator, + training_state=training_state, + device=config.device, + config=config) + training_state._trainer = trainer + + # 设置分布式环境, trainer init() + dist_pytorch.barrier(config.vendor) + train_dataloader, eval_dataloader = trainer.init(train_dataloader, + eval_dataloader) + dist_pytorch.barrier(config.vendor) + + # evaluation统计 + init_evaluation_start = time.time() # evaluation起始时间,单位为秒 + + trainer.evaluate(trainer.model, eval_dataloader, device=trainer.device) + + init_evaluation_end = time.time() # evaluation结束时间,单位为秒 + + init_evaluation_info = dict(time=init_evaluation_end - + init_evaluation_start) + + model_driver.event(Event.INIT_EVALUATION, init_evaluation_info) + + if not config.do_train: + return config, training_state + + model_driver.event(Event.INIT_END) + + # TRAIN_START + dist_pytorch.barrier(config.vendor) + model_driver.event(Event.TRAIN_START) + train_start_time = time.time() + + # 训练过程 + epoch = 0 + while not training_state.end_training: + training_state.epoch = epoch + trainer.train_one_epoch(train_dataloader, eval_dataloader) + epoch += 1 + + # TRAIN_END事件 + training_state.traintime = time.time() - train_start_time + model_driver.event(Event.TRAIN_END) + + return 
config, training_state + + +if __name__ == "__main__": + start = time.time() + config_update, state = main() + if not dist_pytorch.is_main_process(): + sys.exit(0) + + # 训练信息写日志 + e2e_time = time.time() - start + if config_update.do_train: + + finished_info = { + "e2e_time": e2e_time, + "train_time": state.traintime, + "train_no_eval_time": state.noevaltime, + "pure_training_computing_time": state.purecomputetime, + "throughput(ips)_raw": state.num_trained_samples / state.traintime, + "throughput(ips)_no_eval": + state.num_trained_samples / state.noevaltime, + "throughput(ips)_pure_compute": + state.num_trained_samples / state.purecomputetime, + "converged": state.converged, + "rouge1": state.rouge1, + "rouge2": state.rouge2, + "rougeL": state.rougeL, + "rougeLsum": state.rougeLsum, + } + else: + finished_info = {"e2e_time": e2e_time} + logger.log(Event.FINISHED, message=finished_info, stacklevel=0) diff --git a/training/benchmarks/t5_small/pytorch/schedulers/__init__.py b/training/benchmarks/t5_small/pytorch/schedulers/__init__.py new file mode 100644 index 000000000..06d6fb0d1 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/schedulers/__init__.py @@ -0,0 +1,11 @@ +from transformers import get_scheduler + + +def create_scheduler(optimizer, train_dataloader, args): + lr_scheduler = get_scheduler( + name='linear', + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=len(train_dataloader) * args.max_epoch, + ) + return lr_scheduler diff --git a/training/benchmarks/t5_small/pytorch/train/__init__.py b/training/benchmarks/t5_small/pytorch/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/t5_small/pytorch/train/evaluator.py b/training/benchmarks/t5_small/pytorch/train/evaluator.py new file mode 100644 index 000000000..174f54e69 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/evaluator.py @@ -0,0 +1,84 @@ +import os + +import nltk +import numpy as np +import evaluate +import torch +import torch.distributed as dist + + +def postprocess_text(preds, labels): + """ + https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization.py#L621 + """ + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] + + return preds, labels + + +def pad_across_processes(config, preds, labels, tokenizer): + if not config.distributed: + return preds, labels + + max_pred_len = torch.tensor(preds.shape[1], + dtype=torch.int64, + device=config.device) + dist.all_reduce(max_pred_len, dist.ReduceOp.MAX) + max_pred_len = int(max_pred_len) + + if max_pred_len > preds.shape[1]: + pad_index = tokenizer.pad_token_id + new_preds = preds.new_zeros(preds.shape[0], max_pred_len) + pad_index + new_preds[:, :preds.shape[1]] = preds + preds = new_preds + + all_preds = [preds.clone() for _ in range(dist.get_world_size())] + dist.all_gather(all_preds, preds) + + all_labels = [labels.clone() for _ in range(dist.get_world_size())] + dist.all_gather(all_labels, labels) + + return torch.cat(all_preds, dim=0), torch.cat(all_labels, dim=0) + + +class Evaluator: + """Evaluator""" + def __init__(self, config): + self.config = config + nltk.data.path.append(os.path.join(config.data_dir, 'nltk_data')) + self.metric_path = os.path.join(config.data_dir, 'metrics', 'rouge', + 'rouge.py') + self.reset() + + 
def reset(self): + self.metric = evaluate.load(self.metric_path) + + def add_batch(self, tokenizer, preds, labels): + preds, labels = pad_across_processes(self.config, preds, labels, + tokenizer) + + preds = preds.cpu().numpy() + labels = labels.cpu().numpy() + + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=True) + + decoded_preds, decoded_labels = postprocess_text( + decoded_preds, decoded_labels) + self.metric.add_batch( + predictions=decoded_preds, + references=decoded_labels, + ) + + def compute_acc(self): + result = self.metric.compute(use_stemmer=True) + result = {k: round(v * 100, 4) for k, v in result.items()} + return result diff --git a/training/benchmarks/t5_small/pytorch/train/trainer.py b/training/benchmarks/t5_small/pytorch/train/trainer.py new file mode 100644 index 000000000..ba805aa0c --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/trainer.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import time +import torch +import torch.utils.data +from torch.types import Device +import os +import sys +import torch.distributed as dist +from accelerate import Accelerator + +from model import create_model +from optimizers import create_optimizer +from schedulers import create_scheduler +from train.evaluator import Evaluator +from train.training_state import TrainingState + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, Event, dist_pytorch + + +class Trainer: + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, config): + super(Trainer, self).__init__() + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.device = device + self.config = config + self.evaluator = evaluator + + def init(self, train_dataloader, eval_dataloader): + device = torch.device(self.config.device) + dist_pytorch.main_proc_print("Init progress:") + self.model, self.model_config, self.tokenizer = create_model( + self.config) + self.model.to(self.device) + + self.model = self.adapter.convert_model(self.model) + + self.optimizer = create_optimizer(self.model, self.config) + self.lr_scheduler = create_scheduler(self.optimizer, train_dataloader, + self.config) + + self.accelerator = Accelerator() + self.model, self.optimizer, train_dataloader, eval_dataloader, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, train_dataloader, eval_dataloader, + self.lr_scheduler) + + return train_dataloader, eval_dataloader + + def process_batch(self, batch, device: Device): + """Process batch and produce inputs for the model.""" + for k, v in batch.items(): + batch[k] = v.to(device, non_blocking=True) + return batch + + def train_one_epoch(self, train_dataloader, eval_dataloader): + + model = self.model + optimizer = self.optimizer + data_loader = train_dataloader + device = self.device + epoch = self.training_state.epoch + print("Epoch " + str(epoch + 1)) + + model.train() + noeval_start_time = time.time() + + for step, batch in enumerate(data_loader): + batch = self.process_batch(batch, device) + + pure_start_time = time.time() + + outputs = model(**batch) + loss = outputs.loss + + self.accelerator.backward(loss) + 
optimizer.step() + self.lr_scheduler.step() + optimizer.zero_grad() + + if step % self.config.log_freq == 0: + print("Train Step " + str(step) + "/" + str(len(data_loader)) + + ", Loss : " + str(float(loss))) + + self.training_state.purecomputetime += time.time( + ) - pure_start_time + + self.training_state.noevaltime += time.time() - noeval_start_time + + eval_result = self.evaluate(self.model, + eval_dataloader, + device=self.device) + + state = self.training_state + config = self.config + + state.rouge1, state.rouge2, state.rougeL, state.rougeLsum = eval_result.values( + ) + if state.rouge1 >= config.target_rouge1: + dist_pytorch.main_proc_print( + f"converged_success. eval_rouge1: {state.rouge1}, target_rouge1: {config.target_rouge1}" + ) + state.converged_success() + + if epoch + 1 >= config.max_epoch: + state.end_training = True + state.num_trained_samples += len(data_loader.dataset) + + def evaluate(self, model, data_loader, device): + self.model.eval() + self.evaluator.reset() + for step, batch in enumerate(data_loader): + if step % self.config.log_freq == 0: + print("Eval Step " + str(step) + "/" + str(len(data_loader))) + batch = self.process_batch(batch, device) + input_ids, labels = batch['input_ids'], batch['labels'] + + # https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization.py#L178 + # https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization.py#L651C2-L655 + # according to huggingface run_summarization.py, max_length is 128, num_beams is 1 + def _unwrap_model(model): + if hasattr(model, "module"): + return _unwrap_model(model.module) + else: + return model + + model = _unwrap_model(model) + output = model.generate(input_ids, + max_length=128, + num_beams=self.model_config.num_beams) + self.evaluator.add_batch(self.tokenizer, output, labels) + + result = self.evaluator.compute_acc() + dist_pytorch.main_proc_print(result) + + return result diff --git a/training/benchmarks/t5_small/pytorch/train/trainer_adapter.py b/training/benchmarks/t5_small/pytorch/train/trainer_adapter.py new file mode 100644 index 000000000..ff46be1b8 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/trainer_adapter.py @@ -0,0 +1,9 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +from torch import nn + + +def convert_model(model: nn.Module) -> nn.Module: + """convert_model""" + return model diff --git a/training/benchmarks/t5_small/pytorch/train/training_state.py b/training/benchmarks/t5_small/pytorch/train/training_state.py new file mode 100644 index 000000000..dbecce66f --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/training_state.py @@ -0,0 +1,41 @@ +# Copyright (c) 2023 BAAI. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+from dataclasses import dataclass
+
+
+@dataclass
+class TrainingState:
+    """TrainingState dataclass"""
+    _trainer = None
+    _status = 'aborted'  # later set to 'success' if termination criteria met
+
+    global_steps = 0
+
+    loss: float = 0.0
+    rouge1: float = 0.0
+    rouge2: float = 0.0
+    rougeL: float = 0.0
+    rougeLsum: float = 0.0
+
+    epoch: int = 1
+
+    end_training: bool = False
+    converged: bool = False
+
+    traintime = 0.0
+    noevaltime = 0.0
+    purecomputetime = 0.0
+
+    num_trained_samples = 0
+
+    def status(self):
+        """get status"""
+        if self.converged:
+            self._status = "success"
+        return self._status
+
+    def converged_success(self):
+        """converged success"""
+        self.end_training = True
+        self.converged = True
diff --git a/training/nvidia/t5_small-pytorch/README.md b/training/nvidia/t5_small-pytorch/README.md
new file mode 100644
index 000000000..b27ae4b3d
--- /dev/null
+++ b/training/nvidia/t5_small-pytorch/README.md
@@ -0,0 +1,57 @@
+### 1. 下载数据集和模型
+[下载链接](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/t5_small_train.tar)
+
+### 2. 设置test_conf.py
+
+为了使得`training/nvidia/t5_small-pytorch/config/requirements.txt`里的依赖库均能被下载,需要将`training/run_benchmarks/config/test_conf.py`里的`PIP_SOURCE`的值修改为`https://pypi.tuna.tsinghua.edu.cn/simple`
+
+### 3. Nvidia GPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+  - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB
+  - 多机网络类型、带宽: InfiniBand,200Gb/s
+- ##### 软件环境
+  - OS版本:Ubuntu 20.04
+  - OS kernel版本: 5.4.0-113-generic
+  - 加速卡驱动版本:470.129.06
+  - Docker 版本:20.10.16
+  - 训练框架版本:pytorch-1.8.0a0+52ea372
+  - 依赖软件版本:
+    - cuda: 11.4
+
+### 运行情况
+
+* 通用指标
+
+| 指标名称 | 指标值 | 特殊说明 |
+| -------------- | ----------------------- | ------------------------------------- |
+| 任务类别 | Summarization | |
+| 模型 | t5_small | |
+| 数据集 | CNN/Daily Mail | |
+| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 |
+| 硬件设备简称 | nvidia A100 | |
+| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB |
+| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 |
+| 总吞吐量 | p_whole,见“性能指标” | 实际样本数除以总时间(performance_whole) |
+| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 |
+| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) |
+| 训练结果 | rouge1,见“性能指标” | rouge1分数 |
+| 训练结果 | rouge2,见“性能指标” | rouge2分数 |
+| 训练结果 | rougeL,见“性能指标” | rougeL分数 |
+| 训练结果 | rougeLsum,见“性能指标” | rougeLsum分数 |
+| 额外修改项 | 无 | |
+
+* 性能指标
+
+| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | rouge1 | rouge2 | rougeL | rougeLsum | mem |
+| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+| A100单机单卡(1x1) | fp32 | / | | | | | | | | | |
+| A100单机8卡(1x8) | fp32 | / | 996.11 | 338 | 398 | 400 | 41.12 | 18.84 | 29.15 | 38.32 | 35.3/40.0 |
+| A100两机8卡(2x8) | fp32 | / | | | | | | | | | |
+
+注意: T5模型MFU数值较低, 为11.8%
+1x8训练的MFU计算过程如下:
+`MFU = 400.26068691305795 * 1024 * (60 * 10^6) * 6 / (156 * 1000^4) / 8 = 11.8%`
+
+其中, 1024为seq_len, 60 million为参数量, (156 * 1000^4)为A100 tf32算力
+
diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py b/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py
new file mode 100644
index 000000000..c1a1569cc
--- /dev/null
+++ b/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,2 @@
+train_batch_size = 32
+eval_batch_size = 32
diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py b/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py
new file mode 100644
index 000000000..c1a1569cc
--- /dev/null
+++ 
b/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +eval_batch_size = 32 diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py b/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py new file mode 100644 index 000000000..c1a1569cc --- /dev/null +++ b/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +eval_batch_size = 32 diff --git a/training/nvidia/t5_small-pytorch/config/requirements.txt b/training/nvidia/t5_small-pytorch/config/requirements.txt new file mode 100644 index 000000000..9ee08e5d1 --- /dev/null +++ b/training/nvidia/t5_small-pytorch/config/requirements.txt @@ -0,0 +1,8 @@ +transformers==4.31.0 +evaluate==0.4.0 +datasets==2.14.4 +accelerate==0.21.0 +tokenizers==0.13.3 +nltk==3.8.1 +absl-py==1.4.0 +rouge-score==0.1.2 diff --git a/training/nvidia/t5_small-pytorch/extern/.gitkeep b/training/nvidia/t5_small-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 9d37dd8bc..700f0c65a 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -75,5 +75,6 @@ # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "t5_small:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/t5_small_train", # "gpt2:pytorch_1.12:A100:1:8:1": "/raid/dataset/gpt2", } From 5f9e761b3c112283fdd1b335e6745ca43d72d210 Mon Sep 17 00:00:00 2001 From: clveryang <50865584+clveryang@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:42:16 +0800 Subject: [PATCH 09/18] iluvatar_infer_resnet50 (#259) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 杨智超 --- inference/benchmarks/resnet50/README.md | 2 +- inference/inference_engine/iluvatar/ixrt.py | 193 +++++++++++--------- 2 files changed, 103 insertions(+), 92 deletions(-) diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index aaf3c14fa..0eee2b55f 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -120,6 +120,6 @@ find ./val -name "*JPEG" | wc -l | tensorrt | fp16 | 256 |613.4 | 1358.9 | 4469.4 | 1391.4 | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 | | tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 | | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 | -| ixrt | fp16 | 256 | 136.4 | / | / | 1146.6 | 2679.9 | 11.5% | 76.2 | 4.3/32.0 | +| ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 | | kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / | diff --git a/inference/inference_engine/iluvatar/ixrt.py b/inference/inference_engine/iluvatar/ixrt.py index 620cc32f3..44fc85c4b 100644 --- a/inference/inference_engine/iluvatar/ixrt.py +++ b/inference/inference_engine/iluvatar/ixrt.py @@ -1,10 +1,13 @@ -from ixrt import IxRT, RuntimeConfig, RuntimeContext -import torch import os -import subprocess -from loguru import logger +import torch +from torch import autocast +import tensorrt as trt + import numpy as np +import pycuda.driver as cuda +import pycuda.autoinit import time +import subprocess class InferModel: @@ -16,114 +19,122 @@ def __init__(self, 
host_mem, device_mem): self.device = device_mem def __str__(self): - return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + return "Host:\n" + str(self.host) + "\nDevice:\n" + str( + self.device) def __repr__(self): return self.__str__() def __init__(self, config, onnx_path, model): - self.str_to_numpy_dict = { - "int32": np.int32, - "float16": np.float16, - "float32": np.float32, - } - self.engine = self.build_engine(config, onnx_path) - self.outputs = self.allocate_buffers(self.engine) - - def config_init_engine(self, config, onnx_path): - quant_file = None - - runtime_config = RuntimeConfig() + self.config = config - input_shapes = [config.batch_size, 3, config.image_size, config.image_size] - runtime_config.input_shapes = [("input", input_shapes)] - runtime_config.device_idx = 0 + self.logger = trt.Logger(trt.Logger.WARNING) + self.runtime = trt.Runtime(self.logger) - precision = "float16" - if precision == "int8": - assert quant_file, "Quant file must provided for int8 inferencing." - - runtime_config.runtime_context = RuntimeContext( - precision, - "nhwc", - use_gpu=True, - pipeline_sync=True, - input_types=config.input_types, - output_types=config.output_types, - input_device="gpu", - output_device="gpu", - ) + self.engine = self.build_engine(config, onnx_path) - runtime = IxRT.from_onnx(onnx_path, quant_file, runtime_config) - return runtime + self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers( + self.engine) + + self.context = self.engine.create_execution_context() + self.numpy_to_torch_dtype_dict = { + bool: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, + } + self.str_to_torch_dtype_dict = { + "bool": torch.bool, + "uint8": torch.uint8, + "int8": torch.int8, + "int16": torch.int16, + "int32": torch.int32, + "int64": torch.int64, + "float16": torch.float16, + "float32": torch.float32, + "float64": torch.float64, + "complex64": torch.complex64, + "complex128": torch.complex128, + } def build_engine(self, config, onnx_path): if config.exist_compiler_path is None: - output_path = config.log_dir + "/" + config.ixrt_tmp_path + trt_path = config.log_dir + "/" + config.ixrt_tmp_path - dir_output_path = os.path.dirname(output_path) - os.makedirs(dir_output_path, exist_ok=True) + dir_trt_path = os.path.dirname(trt_path) + os.makedirs(dir_trt_path, exist_ok=True) time.sleep(10) - runtime = self.config_init_engine(config, onnx_path) - print(f"Build Engine File: {output_path}") - runtime.BuildEngine() - runtime.SerializeEngine(output_path) - print("Build Engine done!") + trtexec_cmd = "ixrtexec --onnx=" + onnx_path + " --save_engine=" + trt_path + if config.fp16: + trtexec_cmd += " --precision fp16" + if config.has_dynamic_axis: + trtexec_cmd += " --minShapes=" + config.minShapes + trtexec_cmd += " --optShapes=" + config.optShapes + trtexec_cmd += " --maxShapes=" + config.maxShapes + + p = subprocess.Popen(trtexec_cmd, shell=True) + p.wait() else: - output_path = config.exist_compiler_path - print(f"Use existing engine: {output_path}") + trt_path = config.exist_compiler_path - runtime = IxRT() - runtime.LoadEngine(output_path, config.batch_size) - return runtime + with open(trt_path, "rb") as f: + return self.runtime.deserialize_cuda_engine(f.read()) def allocate_buffers(self, engine): - output_map = 
engine.GetOutputShape() - output_io_buffers = [] - output_types = {} - config = engine.GetConfig() - for key, val in config.runtime_context.output_types.items(): - output_types[key] = str(val) - for name, shape in output_map.items(): - # 1. apply memory buffer for output of the shape - buffer = np.zeros( - shape.dims, dtype=self.str_to_numpy_dict[output_types[name]] - ) - buffer = torch.tensor(buffer).cuda() - # 2. put the buffer to a list - output_io_buffers.append([name, buffer, shape]) + inputs = [] + outputs = [] + bindings = [] + stream = cuda.Stream() + + for binding in range(engine.num_bindings): + size = trt.volume(engine.get_binding_shape(binding)) + dtype = trt.nptype(engine.get_binding_dtype(binding)) + + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + bindings.append(int(device_mem)) + + if engine.binding_is_input(binding): + inputs.append(self.HostDeviceMem(host_mem, device_mem)) + else: + outputs.append(self.HostDeviceMem(host_mem, device_mem)) - engine.BindIOBuffers(output_io_buffers) - return output_io_buffers + return inputs, outputs, bindings, stream def __call__(self, model_inputs: list): - batch_size = np.unique(np.array([i.size(dim=0) for i in model_inputs])) - batch_size = batch_size[0] - input_map = self.engine.GetInputShape() - input_io_buffers = [] for i, model_input in enumerate(model_inputs): - model_input = torch.tensor(model_input.numpy(), dtype=torch.float32).cuda() - if not model_input.is_contiguous(): - model_input = model_input.contiguous() - name, shape = list(input_map.items())[0] - _shape, _padding = shape.dims, shape.padding - _shape = [i + j for i, j in zip(_shape, _padding)] - _shape = [_shape[0], *_shape[2:], _shape[1]] - input_io_buffers.append([name, model_input, shape]) - - self.engine.BindIOBuffers(self.outputs) - self.engine.LoadInput(input_io_buffers) - - # torch.cuda.synchronize() - self.engine.Execute() - # torch.cuda.synchronize() - - gpu_io_buffers = [] - for buffer in self.outputs: - # gpu_io_buffers.append([buffer[0], buffer[1], buffer[2]]) - gpu_io_buffers.append(buffer[1]) - - return gpu_io_buffers, 0 + model_input = model_input.cuda() + + cuda.memcpy_dtod_async( + self.inputs[i].device, + model_input.data_ptr(), + model_input.element_size() * model_input.nelement(), + self.stream, + ) + + self.context.execute_async_v2(bindings=self.bindings, + stream_handle=self.stream.handle) + result = [] + for out in self.outputs: + out_tensor = torch.empty(out.host.shape, device="cuda").to( + self.str_to_torch_dtype_dict[str(out.host.dtype)]) + cuda.memcpy_dtod_async( + out_tensor.data_ptr(), + out.device, + out_tensor.element_size() * out_tensor.nelement(), + self.stream, + ) + result.append(out_tensor) + + self.stream.synchronize() + return result, 0 From aa4a3d7f9e20d5faac861f693aa9cdc09729c8a3 Mon Sep 17 00:00:00 2001 From: feldmanshan <145551134+feldmanshan@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:53:21 +0800 Subject: [PATCH 10/18] zixiao:add resnet50 inference configs && results (#256) * zixiao:add resnet50 inference configs && results * zixiao: modify resnet50 config & add log file * zixiao: remote log file * zixiao: fix resnet50 inference result --- inference/benchmarks/resnet50/README.md | 19 ++ .../vendor_config/zixiao_configurations.yaml | 4 + .../docker_images/zixiao/pytorch/Dockerfile | 85 ++++++ .../zixiao/pytorch/packages/README.md | 5 + .../zixiao/pytorch/pytorch_install.sh | 6 + .../zixiao/pytorch/sdk_installers/README.md | 8 + .../docker_images/zixiao/zixiao_analysis.py | 
16 ++ .../docker_images/zixiao/zixiao_monitor.py | 256 ++++++++++++++++++ inference/inference_engine/zixiao/zxrt.py | 114 ++++++++ inference/tools/torch_sync.py | 4 + 10 files changed, 517 insertions(+) create mode 100644 inference/configs/resnet50/vendor_config/zixiao_configurations.yaml create mode 100644 inference/docker_images/zixiao/pytorch/Dockerfile create mode 100644 inference/docker_images/zixiao/pytorch/packages/README.md create mode 100644 inference/docker_images/zixiao/pytorch/pytorch_install.sh create mode 100644 inference/docker_images/zixiao/pytorch/sdk_installers/README.md create mode 100644 inference/docker_images/zixiao/zixiao_analysis.py create mode 100644 inference/docker_images/zixiao/zixiao_monitor.py create mode 100755 inference/inference_engine/zixiao/zxrt.py diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index 0eee2b55f..5bd96adb3 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -96,6 +96,24 @@ find ./val -name "*JPEG" | wc -l - IXRT: ixrt-0.4.0+corex.3.2.0 +#### 2.5 腾讯紫霄 C100 + +- ##### 硬件环境 + - 机器、加速卡型号: C100 + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.15.0-78-generic + - 加速卡驱动版本:2.4.12 + - Docker 版本:24.0.4 + - 依赖软件版本: + - pytorch: 1.13.0+cpu + - onnx: 1.14.0 + +- 推理工具包 + + - zxrt 2.4.12 + ### 3. 运行情况 * 指标列表 @@ -122,4 +140,5 @@ find ./val -name "*JPEG" | wc -l | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 | | ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 | | kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / | +| zixiao | fp16 | 192 | 380 | / | / | 1528.373 | 5853.771 | / | 76.2/76.2 | / | diff --git a/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml new file mode 100644 index 000000000..14071e7ec --- /dev/null +++ b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml @@ -0,0 +1,4 @@ +compiler: zxrt +no_validation: true +batch_size: 192 +exist_onnx_path: onnxs/resnet50_pytorch.onnx \ No newline at end of file diff --git a/inference/docker_images/zixiao/pytorch/Dockerfile b/inference/docker_images/zixiao/pytorch/Dockerfile new file mode 100644 index 000000000..ec332e343 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/Dockerfile @@ -0,0 +1,85 @@ +FROM amd64/ubuntu:20.04 + +LABEL VERSION="1.1.9" + +ARG DEBIAN_FRONTEND=noninteractive +USER root +WORKDIR /tmp +ARG MOFED_VER=5.4-3.1.0.0 +RUN if [ $MOFED_VER ]; then echo "MOFED_VER=$MOFED_VER" & exit 0;else echo "no MOFED_VER, please check" & sleep 1 & exit 1; fi + + +ARG WHICH_MIRROR=mirrors.cloud.tencent.com/repo + +# change source to tencent cloud +RUN sed -i 's#http://archive.ubuntu.com/ubuntu/#http://mirrors.cloud.tencent.com/ubuntu/#g' /etc/apt/sources.list && \ + sed -i 's#http://security.ubuntu.com/ubuntu/#http://mirrors.cloud.tencent.com/ubuntu/#g' /etc/apt/sources.list && \ + apt update && apt -yq install apt-transport-https wget +RUN wget -O /etc/apt/sources.list http://${WHICH_MIRROR}/ubuntu20_sources.list && apt update -yq +RUN mkdir /root/.pip && echo "[global]\nindex- url = https://mirrors.cloud.tencent.com/pypi/simple\ntrusted-host = mirrors.cloud.tencent.com" > /root/.pip/pip.conf + +################################ BASIC LIBRARY ################################# +# install packages +RUN apt-get update && apt-get install -yq apt-utils 
sudo vim curl \ + autoconf automake dialog libtool pkg-config libffi-dev \ + libexpat1-dev libpciaccess-dev libxml2-dev \ + bison flex xutils-dev zlib1g-dev ninja-build git locate \ + zip unzip g++ \ + # install ssh + openssh-server openssh-client \ + # fix hexdump missing issue + bsdmainutils \ + # fix header missing for tensorflow verbs support + libibverbs-dev \ + #install default python3.8 + python3 python3-pip python3-dev python3-tk libjpeg-dev \ + # RMA dependency library + graphviz dpatch swig gfortran chrpath tk tcl libnl-3-200 libnl-route-3-dev lsof \ + libnl-3-dev libgfortran5 libnl-route-3-200 ethtool libnuma1 libnuma-dev udev \ + # ECCL dependency library + libncurses5-dev hwloc libhwloc-dev libhwloc-common libboost-all-dev libevent-dev python2-dev && \ + apt-get clean + + +# Set timezone +RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime +RUN echo 'Asia/Shanghai' >/etc/timezone + +# Install miniconda +# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431 +# RUN curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \ +RUN wget -O ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \ + chmod +x ~/miniconda.sh && \ + bash ~/miniconda.sh -b -p /root/miniconda && \ + rm ~/miniconda.sh && \ + /root/miniconda/bin/conda config --set show_channel_urls yes && \ + /root/miniconda/bin/conda create --name python38 python=3.8 -y && \ + /root/miniconda/bin/conda clean -ya + +# hyperparamer, typing_extensions, numpy requests +RUN /root/miniconda/envs/python38/bin/pip install \ + --no-cache-dir \ + -i https://pypi.tuna.tsinghua.edu.cn/simple \ + hyperparameter \ + typing_extensions \ + numpy \ + requests \ + onnx \ + onnxruntime \ + attrs \ + regex \ + decorator \ + loguru \ + schedule \ + munch \ + pyyaml \ + tqdm \ + scipy + +RUN /root/miniconda/envs/python38/bin/pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu + +ENV PATH /root/miniconda/envs/python38/bin:$PATH +ENV PATH=/usr/local/zx-smi/zx-smi-1.20.0:$PATH + + + diff --git a/inference/docker_images/zixiao/pytorch/packages/README.md b/inference/docker_images/zixiao/pytorch/packages/README.md new file mode 100644 index 000000000..6be446920 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/packages/README.md @@ -0,0 +1,5 @@ +# 以下软件包需联系腾讯蓬莱实验室获取 + +>联系邮箱: feldmanshan@tencent.com + +TopsInference-2.4.12-py3.8-none-any.wh diff --git a/inference/docker_images/zixiao/pytorch/pytorch_install.sh b/inference/docker_images/zixiao/pytorch/pytorch_install.sh new file mode 100644 index 000000000..9f1433527 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/pytorch_install.sh @@ -0,0 +1,6 @@ +#!/bin/bash +pip3 install ./packages/TopsInference-2.4.12-py3.8-none-any.whl + +dpkg -i ./sdk_installers/topsruntime_2.4.12-1_amd64.deb +dpkg -i ./sdk_installers/tops-sdk_2.4.12-1_amd64.deb + diff --git a/inference/docker_images/zixiao/pytorch/sdk_installers/README.md b/inference/docker_images/zixiao/pytorch/sdk_installers/README.md new file mode 100644 index 000000000..644527273 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/sdk_installers/README.md @@ -0,0 +1,8 @@ +# 以下软件包需联系腾讯蓬莱实验室获取 + +>联系邮箱: feldmanshan@tencent.com + +zixiao-x86_64-gcc-2.4.12.run + +topsruntime_2.4.12-1_amd64.deb +tops-sdk_2.4.12-1_amd64.deb \ No newline at end of file diff --git a/inference/docker_images/zixiao/zixiao_analysis.py 
b/inference/docker_images/zixiao/zixiao_analysis.py new file mode 100644 index 000000000..d23cf3f8b --- /dev/null +++ b/inference/docker_images/zixiao/zixiao_analysis.py @@ -0,0 +1,16 @@ +def analysis_log(logpath): + logfile = open(logpath) + + max_usage = 0.0 ## usage_mem + max_mem = 16.0 + for line in logfile.readlines(): + ''' + zx-smi pwr DTemp MUsed Mem + ''' + if "zx-smi" in line: + line = line[:-1] + usage = line.split(" ")[3] + usage = float(usage)*16/100 + max_usage = max(max_usage, usage) + return round(max_usage, 2), max_mem, eval("30e12"), eval("120e12") + diff --git a/inference/docker_images/zixiao/zixiao_monitor.py b/inference/docker_images/zixiao/zixiao_monitor.py new file mode 100644 index 000000000..af1f857c9 --- /dev/null +++ b/inference/docker_images/zixiao/zixiao_monitor.py @@ -0,0 +1,256 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. + ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "zx-smi -dmon -s mp -i 0 -c 1 | grep '16'|awk '{print $3,$4,$9,$10}'" ## pwr DTemp MUsed Mem + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n zx-smi " + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, 
e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/zixiao_monitor.pid') + log_fn = str(log_path + '/zixiao_monitor.log') + err_fn = str(log_path + '/zixiao_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/zixiao_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' 
% pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/inference/inference_engine/zixiao/zxrt.py b/inference/inference_engine/zixiao/zxrt.py new file mode 100755 index 000000000..a2ceacf1c --- /dev/null +++ b/inference/inference_engine/zixiao/zxrt.py @@ -0,0 +1,114 @@ +import onnx +import onnxruntime +import torch +import os +import subprocess +from loguru import logger +import numpy as np +import time +import TopsInference + +def type2dtype(types): + dtypes = [] + for elem_type in types: + if elem_type == 1: + dtypes.append(TopsInference.DT_FLOAT32) + elif elem_type == 7: + dtypes.append(TopsInference.DT_INT64) + elif elem_type == 6: + dtypes.append(TopsInference.DT_INT32) + elif elem_type == 3: + dtypes.append(TopsInference.DT_INT8) + elif elem_type == 4: + dtypes.append(TopsInference.DT_UINT8) + elif elem_type == 9: + dtypes.append(TopsInference.DT_BOOL) + elif elem_type == 10: + dtypes.append(TopsInference.DT_FLOAT16) + else: + raise Exception("unknown default dtypes:{}, {}".format(elem_type)) + return dtypes + +class InferModel: + + def __init__(self, config, onnx_path, model): + self.input_names = [] + self.engine = self.build_engine(config, onnx_path) + self.test_index = 0 + self.batch_size = config.batch_size + self.zixiao_VG_num = 6 + + def build_engine(self, config, onnx_path): + self.handler = TopsInference.set_device(0, -1) + onnx_model = onnx.load(onnx_path) + self.input_shapes = [] + self.input_dtype = [] + for input in onnx_model.graph.input: + input_shape = input.type.tensor_type.shape.dim + input_shape = [a.dim_value for a in input_shape] + input_shape[0] = config.batch_size // 6 + input_name = input.name + self.input_names.append(input_name) + self.input_shapes.append(input_shape) + self.input_dtype.append(input.type.tensor_type.elem_type) + self.input_dtype = type2dtype(self.input_dtype) + if config.fp16 == True: + set_input_dtype = [] + for tops_dtype in self.input_dtype: + if tops_dtype == TopsInference.DT_FLOAT32: + set_input_dtype.append(TopsInference.DT_FLOAT16) + else: + set_input_dtype.append(tops_dtype) + self.input_dtype = set_input_dtype + + onnx_parser = TopsInference.create_parser(TopsInference.ONNX_MODEL) + onnx_parser.set_input_names(self.input_names) + onnx_parser.set_input_dtypes(self.input_dtype) + onnx_parser.set_input_shapes(input_shape) + + network = onnx_parser.read(onnx_path) + optimizer = TopsInference.create_optimizer() + if config.fp16 == True: + optimizer.set_build_flag(TopsInference.KFP16_MIX) + engine = optimizer.build(network) + engine.save_executable(onnx_path+".bin") + engine = TopsInference.load(onnx_path+".bin") + self.streams = [] + for i in range(12): + self.streams.append(TopsInference.create_stream()) + return engine + + def __call__(self, model_inputs: list): + inputs = [] + outputs = [] + foo_time_start = time.time() + for input in model_inputs: + inputs.append(input.numpy()) + input_batch = inputs[0].shape[0] + # zixiao acceleration card has 6 compute cells + assert input_batch % self.zixiao_VG_num == 0 + vg_batch = input_batch // self.zixiao_VG_num + foo_time = time.time() - foo_time_start + for i in range(self.zixiao_VG_num): + vg_input = [] + foo_time_start_data_slice = time.time() + for input in inputs: + vg_input.append(input[vg_batch * i: vg_batch * (i + 1)]) + foo_time += time.time() - foo_time_start_data_slice + outputs.append(self.engine.runV2(vg_input, + py_stream=self.streams[self.test_index % 12])) + 
self.test_index += 1 + zx_outputs = [] + for i in range(self.zixiao_VG_num): + zx_outputs.append([output for output in outputs[i].get()]) + # concat vg_batch result + foo_time_start2 = time.time() + host_output = [] + for i in range(len(zx_outputs[0])): + tmp_output = [] + for j in range(self.zixiao_VG_num): + tmp_output.append(zx_outputs[j][i]) + host_output.append(np.concatenate(tmp_output)) + infer_output = [torch.from_numpy(output) for output in host_output] + foo_time += time.time() - foo_time_start2 + return infer_output, foo_time diff --git a/inference/tools/torch_sync.py b/inference/tools/torch_sync.py index 4b2c631e5..85475d789 100644 --- a/inference/tools/torch_sync.py +++ b/inference/tools/torch_sync.py @@ -10,3 +10,7 @@ def torch_sync(config): # kunlunxin case # xpu sync already finsh after InferModel.__call__ pass + if config.vendor == "zixiao": + # zixiao case + # zixiao sync already finsh after InferModel.__call__ + pass \ No newline at end of file From 858c7228c3320d37c0a4d17fc8f9842aeb1d9164 Mon Sep 17 00:00:00 2001 From: feldmanshan <145551134+feldmanshan@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:00:31 +0800 Subject: [PATCH 11/18] zixiao: update zxrt.py & resnet50 result (#262) * zixiao: update zxrt.py & resnet50 result * zixiao: update resnet50 test batch_size --- inference/benchmarks/resnet50/README.md | 2 +- .../vendor_config/zixiao_configurations.yaml | 6 ++- inference/inference_engine/zixiao/zxrt.py | 37 ++++++++++--------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index 5bd96adb3..4e802f022 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -140,5 +140,5 @@ find ./val -name "*JPEG" | wc -l | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 | | ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 | | kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / | -| zixiao | fp16 | 192 | 380 | / | / | 1528.373 | 5853.771 | / | 76.2/76.2 | / | +| zixiao | fp16 | 32*6 | 261.103 | / | / | 193.151 | 6342.191 | / | 76.2/76.2 | / | diff --git a/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml index 14071e7ec..00586e225 100644 --- a/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml +++ b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml @@ -1,4 +1,6 @@ compiler: zxrt no_validation: true -batch_size: 192 -exist_onnx_path: onnxs/resnet50_pytorch.onnx \ No newline at end of file +batch_size: 50000 +exist_onnx_path: onnxs/resnet50_pytorch.onnx +repeat: 1 +zixiao_test_batch_size: 32 \ No newline at end of file diff --git a/inference/inference_engine/zixiao/zxrt.py b/inference/inference_engine/zixiao/zxrt.py index a2ceacf1c..f9f6b2b22 100755 --- a/inference/inference_engine/zixiao/zxrt.py +++ b/inference/inference_engine/zixiao/zxrt.py @@ -34,19 +34,18 @@ class InferModel: def __init__(self, config, onnx_path, model): self.input_names = [] self.engine = self.build_engine(config, onnx_path) - self.test_index = 0 - self.batch_size = config.batch_size + self.batch_size = config.zixiao_test_batch_size self.zixiao_VG_num = 6 def build_engine(self, config, onnx_path): - self.handler = TopsInference.set_device(0, -1) + self.handler = TopsInference.set_device(4, -1) onnx_model = 
onnx.load(onnx_path) self.input_shapes = [] self.input_dtype = [] for input in onnx_model.graph.input: input_shape = input.type.tensor_type.shape.dim input_shape = [a.dim_value for a in input_shape] - input_shape[0] = config.batch_size // 6 + input_shape[0] = config.zixiao_test_batch_size input_name = input.name self.input_names.append(input_name) self.input_shapes.append(input_shape) @@ -84,31 +83,33 @@ def __call__(self, model_inputs: list): foo_time_start = time.time() for input in model_inputs: inputs.append(input.numpy()) - input_batch = inputs[0].shape[0] + total_input_num = inputs[0].shape[0] + total_test_batch = (total_input_num + self.batch_size - 1) // self.batch_size # zixiao acceleration card has 6 compute cells - assert input_batch % self.zixiao_VG_num == 0 - vg_batch = input_batch // self.zixiao_VG_num foo_time = time.time() - foo_time_start - for i in range(self.zixiao_VG_num): - vg_input = [] + for i in range(total_test_batch): foo_time_start_data_slice = time.time() + vg_input = [] for input in inputs: - vg_input.append(input[vg_batch * i: vg_batch * (i + 1)]) + vg_input.append(input[self.batch_size * i: self.batch_size * (i + 1)]) foo_time += time.time() - foo_time_start_data_slice - outputs.append(self.engine.runV2(vg_input, - py_stream=self.streams[self.test_index % 12])) - self.test_index += 1 + outputs.append(self.engine.runV2(vg_input, py_stream=self.streams[i % 12])) + # zixiao sync + for i in range(12): + outputs[i-12].get() + # zixiao sync done + + # concat batch result + foo_time_start_d2h = time.time() zx_outputs = [] - for i in range(self.zixiao_VG_num): + for i in range(total_test_batch): zx_outputs.append([output for output in outputs[i].get()]) - # concat vg_batch result - foo_time_start2 = time.time() host_output = [] for i in range(len(zx_outputs[0])): tmp_output = [] - for j in range(self.zixiao_VG_num): + for j in range(total_test_batch): tmp_output.append(zx_outputs[j][i]) host_output.append(np.concatenate(tmp_output)) infer_output = [torch.from_numpy(output) for output in host_output] - foo_time += time.time() - foo_time_start2 + foo_time += time.time() - foo_time_start_d2h return infer_output, foo_time From a8599e7f5f164e66833070370e4910d984dece90 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:20:46 +0800 Subject: [PATCH 12/18] kunlunxin: add BERT readme (#260) * Add BERT readme * Update 1x8 result in README.md * Update header in README.md --- training/kunlunxin/bert-pytorch/README.md | 48 +++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 training/kunlunxin/bert-pytorch/README.md diff --git a/training/kunlunxin/bert-pytorch/README.md b/training/kunlunxin/bert-pytorch/README.md new file mode 100644 index 000000000..43d955af3 --- /dev/null +++ b/training/kunlunxin/bert-pytorch/README.md @@ -0,0 +1,48 @@ +### 模型Checkpoint下载 +[模型Checkpoint下载](../../benchmarks/bert/README.md#模型Checkpoint下载) +### 测试数据集下载 +[测试数据集下载](../../benchmarks/bert/README.md#测试数据集下载) + +### 昆仑芯XPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: 昆仑芯AI加速器组R480-X8 + - 加速卡型号: 昆仑芯AI加速卡R300 + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:4.0.25 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本:xmlir + - 训练编译器版本:xacc + - 依赖软件版本:pytorch-1.12.1+cpu + +#### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ------------------------------ | ------------------------------------------- | +| 任务类别 | 通用语言模型 | | +| 模型 | bert | 
| +| 数据集 | Wikipedia for bert | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | acc,见“性能指标” | 分类准确率(mlm_accuracy) | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mlm_accuracy | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| R300单机单卡(1x1) | fp32 | bs=8,lr=3.5e-04 | | | | | | | +| R300单机8卡(1x8) | fp32 | bs=8,lr=3.5e-04 | | | | | 0.36 | 26.5/32.0 | +| R300两机8卡(2x8) | fp32 | bs=8,lr=3.5e-04 | | | | | | | From 3d4c4376c3c71e612a97235ffc3a3d24547662a1 Mon Sep 17 00:00:00 2001 From: clveryang <50865584+clveryang@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:55:38 +0800 Subject: [PATCH 13/18] Iluvatar Ixrt environment (#265) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Ixrt environment * add touch config --------- Co-authored-by: 杨智超 --- .../resnet50/vendor_config/iluvatar_configurations.yaml | 4 ---- inference/docker_images/iluvatar/pytorch/Dockerfile | 6 +++--- .../docker_images/iluvatar/pytorch/packages/README.md | 8 +++++--- .../docker_images/iluvatar/pytorch/pytorch_install.sh | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml b/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml index c721ede09..86ee38560 100644 --- a/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml +++ b/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml @@ -2,7 +2,3 @@ ixrt_tmp_path: iluvatar_tmp/resnet50-fp16.engine has_dynamic_axis: false repeat: 1 image_size: 224 -exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx -# exist_compiler_path: resnet50-fp16.engine -output_types: {"output":"float32"} -input_types: {"input": "float32"} \ No newline at end of file diff --git a/inference/docker_images/iluvatar/pytorch/Dockerfile b/inference/docker_images/iluvatar/pytorch/Dockerfile index 3e72721cf..2b502c689 100644 --- a/inference/docker_images/iluvatar/pytorch/Dockerfile +++ b/inference/docker_images/iluvatar/pytorch/Dockerfile @@ -38,9 +38,9 @@ RUN apt-get install -y --fix-missing \ # Configure anaconda -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \ - bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ - /root/miniconda/bin/conda clean -tipsy && \ +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh && \ + bash ./Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -b -p /root/miniconda && \ + /root/miniconda/bin/conda clean -tip && \ ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". 
/root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc && \ diff --git a/inference/docker_images/iluvatar/pytorch/packages/README.md b/inference/docker_images/iluvatar/pytorch/packages/README.md index 88a18b3dc..d528a13fb 100644 --- a/inference/docker_images/iluvatar/pytorch/packages/README.md +++ b/inference/docker_images/iluvatar/pytorch/packages/README.md @@ -2,8 +2,10 @@ >联系邮箱: contact-us@iluvatar.com -ixrt-0.4.0+corex.3.2.0-cp38-cp38-linux_x86_64.whl +ixrt-0.7.0+corex.latest.version-cp310-cp310-linux_x86_64.whl -torch-1.13.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl +torchvision-0.14.1+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl -torchvision-0.14.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl \ No newline at end of file +pycuda-2022.2.2+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl + +torch-1.13.1+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl \ No newline at end of file diff --git a/inference/docker_images/iluvatar/pytorch/pytorch_install.sh b/inference/docker_images/iluvatar/pytorch/pytorch_install.sh index 859591930..63cd26993 100644 --- a/inference/docker_images/iluvatar/pytorch/pytorch_install.sh +++ b/inference/docker_images/iluvatar/pytorch/pytorch_install.sh @@ -14,7 +14,7 @@ done search_sdk_results=`find ${SDK_DIR} -name "corex*.run"` for installer in $search_sdk_results; do echo "Install ${installer}" - sh "${installer}" -- --silent --driver --toolkit + sh "${installer}" -- --silent --toolkit done search_packages_results=`find ${PKG_DIR} -name "*.whl"` From bf17dfbeb1e98e8851fbecb9d56ac56201e0e2ca Mon Sep 17 00:00:00 2001 From: gganduu_zz Date: Mon, 25 Sep 2023 16:13:01 +0800 Subject: [PATCH 14/18] Add ViT model for FlagPerf (#200) * Add ViT model * update the script based on zhiyuan's model * Update script based on PR review * Update ViT performance in README.md --- inference/benchmarks/vit_l_16/README.md | 1 + inference/configs/host.yaml | 2 +- .../kunlunxin_configurations.yaml | 5 + .../kunlunxin/kunlunxin_analysis.py | 44 +- .../kunlunxin/kunlunxin_monitor.py | 513 +++++++++--------- .../kunlunxin/pytorch_1.13/Dockerfile | 1 + inference/inference_engine/kunlunxin/xtcl.py | 1 + inference/run.py | 4 +- 8 files changed, 289 insertions(+), 282 deletions(-) create mode 100644 inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md index 5998c0cf9..cd77ab738 100644 --- a/inference/benchmarks/vit_l_16/README.md +++ b/inference/benchmarks/vit_l_16/README.md @@ -83,4 +83,5 @@ find ./val -name "*JPEG" | wc -l | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 64 |1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 | | tensorrt | fp32 | 32 | 1275.9 | 482.4 | 491.1 | 555.5 | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 | +| kunlunxin_xtcl | W32A16 | 32 | 2118.307 | / | / | 130.006 | 144.914 | 27.9% | 79.3/79.3 | / | diff --git a/inference/configs/host.yaml b/inference/configs/host.yaml index f5ec9d0ac..25c7f796b 100644 --- a/inference/configs/host.yaml +++ b/inference/configs/host.yaml @@ -13,4 +13,4 @@ PIP_SOURCE: "https://mirror.baidu.com/pypi/simple" CLEAR_CACHES: True ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES" CASES: - "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val" \ No newline at end of file + "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val" diff --git 
a/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml b/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml new file mode 100644 index 000000000..bf71dd82c --- /dev/null +++ b/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml @@ -0,0 +1,5 @@ +compiler: xtcl +# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null +no_validation: true +# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt) +exist_onnx_path: /home/FlagPerf/inference/onnxs/vit_l_16_bs32_pytorch_fp16False.onnx diff --git a/inference/docker_images/kunlunxin/kunlunxin_analysis.py b/inference/docker_images/kunlunxin/kunlunxin_analysis.py index 388f89cee..be1a60b1b 100644 --- a/inference/docker_images/kunlunxin/kunlunxin_analysis.py +++ b/inference/docker_images/kunlunxin/kunlunxin_analysis.py @@ -1,23 +1,21 @@ -def analysis_log(logpath): - logfile = open(logpath) - - max_usage = 0.0 ## usage_mem - max_mem = 0.0 - for line in logfile.readlines(): - ''' - xpu_smi temp power mem w_mem use_rate - ''' - if "xpu_smi" in line: - line = line[:-1] - usage = line.split(" ")[4] - usage = float(usage) - max_usage = max(max_usage, usage) - max_mem = line.split(" ")[5] - max_mem = float(max_mem) - - return round(max_usage / 1024.0, - 2), round(max_mem / 1024.0, 2), eval("32e12"), eval("128e12") - - -if __name__ == "__main__": - max1, max2, max2,max4 = analysis_log("/home/zhoujiamin01/workspace/zjm_flag/FlagPerf/inference/result/run20230809192313/resnet50:pytorch_1.13/127.0.0.1_noderank0/kunlunxin_monitor.log") +def analysis_log(logpath): + logfile = open(logpath) + + max_usage = 0.0 ## usage_mem + max_mem = 0.0 + for line in logfile.readlines(): + ''' + xpu_smi temp power mem w_mem use_rate + ''' + if "xpu_smi" in line: + line = line[:-1] + usage = line.split(" ")[4] + usage = float(usage) + max_usage = max(max_usage, usage) + max_mem = line.split(" ")[5] + max_mem = float(max_mem) + + return round(max_usage / 1024.0, + 2), round(max_mem / 1024.0, 2), eval("32e12"), eval("128e12") + + diff --git a/inference/docker_images/kunlunxin/kunlunxin_monitor.py b/inference/docker_images/kunlunxin/kunlunxin_monitor.py index ba5a877a1..7d31179ae 100644 --- a/inference/docker_images/kunlunxin/kunlunxin_monitor.py +++ b/inference/docker_images/kunlunxin/kunlunxin_monitor.py @@ -1,256 +1,257 @@ -# !/usr/bin/env python3 -# encoding: utf-8 -''' -Usage: python3 sys-monitor.py -o operation -l [log_path] - -o, --operation start|stop|restart|status - -l, --log log path , ./logs/ default -''' - -import os -import sys -import time -import signal -import atexit -import argparse -import datetime -from multiprocessing import Process -import subprocess -import schedule - - -class Daemon: - ''' - daemon subprocess class. - usage: subclass this daemon and override the run() method. - sys-monitor.pid: in the /tmp/, auto del when unexpected exit. - verbose: debug mode, disabled default. 
- ''' - - def __init__(self, - pid_file, - log_file, - err_file, - gpu_log, - log_path, - rate=5, - stdin=os.devnull, - stdout=os.devnull, - stderr=os.devnull, - home_dir='.', - umask=0o22, - verbose=0): - self.stdin = stdin - self.stdout = stdout - self.stderr = stderr - self.home_dir = home_dir - self.verbose = verbose - self.pidfile = pid_file - self.logfile = log_file - self.errfile = err_file - self.gpufile = gpu_log - self.logpath = log_path - self.rate = rate - self.umask = umask - self.verbose = verbose - self.daemon_alive = True - - def get_pid(self): - try: - with open(self.pidfile, 'r') as pf: - pid = int(pf.read().strip()) - except IOError: - pid = None - except SystemExit: - pid = None - return pid - - def del_pid(self): - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - - def run(self): - ''' - NOTE: override the method in subclass - ''' - - def gpu_mon(file): - TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') - cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'" ## temp power mem w_mem use_rate - process = subprocess.Popen(cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - encoding='utf-8') - try: - out = process.communicate(timeout=10) - except subprocess.TimeoutExpired: - process.kill() - out = process.communicate() - - if process.returncode != 0: - result = "error" - result = TIMESTAMP + "\n xpu_smi " + out[0] + "\n" - with open(file, 'a') as f: - f.write(result) - - def timer_gpu_mon(): - gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) - gpu_process.start() - - schedule.every(self.rate).seconds.do(timer_gpu_mon) - while True: - schedule.run_pending() - time.sleep(5) - - def daemonize(self): - if self.verbose >= 1: - print('daemon process starting ...') - try: - pid = os.fork() - if pid > 0: - sys.exit(0) - except OSError as e: - sys.stderr.write('fork #1 failed: %d (%s)\n' % - (e.errno, e.strerror)) - sys.exit(1) - os.chdir(self.home_dir) - os.setsid() - os.umask(self.umask) - try: - pid = os.fork() - if pid > 0: - sys.exit(0) - except OSError as e: - sys.stderr.write('fork #2 failed: %d (%s)\n' % - (e.errno, e.strerror)) - sys.exit(1) - sys.stdout.flush() - sys.stderr.flush() - si = open(self.stdin, 'r') - so = open(self.stdout, 'a+') - if self.stderr: - se = open(self.stderr, 'a+') - else: - se = so - os.dup2(si.fileno(), sys.stdin.fileno()) - os.dup2(so.fileno(), sys.stdout.fileno()) - os.dup2(se.fileno(), sys.stderr.fileno()) - atexit.register(self.del_pid) - pid = str(os.getpid()) - with open(self.pidfile, 'w+') as f: - f.write('%s\n' % pid) - - def start(self): - if not os.path.exists(self.logpath): - os.makedirs(self.logpath) - elif os.path.exists(self.gpufile): - os.remove(self.gpufile) - if self.verbose >= 1: - print('ready to start ......') - # check for a pid file to see if the daemon already runs - pid = self.get_pid() - if pid: - msg = 'pid file %s already exists, is it already running?\n' - sys.stderr.write(msg % self.pidfile) - sys.exit(1) - # start the daemon - self.daemonize() - self.run() - - def stop(self): - if self.verbose >= 1: - print('stopping ...') - pid = self.get_pid() - if not pid: - msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile - sys.stderr.write(msg) - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - return - # try to kill the daemon process - try: - i = 0 - while 1: - os.kill(pid, signal.SIGTERM) - time.sleep(1) - i = i + 1 - if i % 10 == 0: - os.kill(pid, signal.SIGHUP) - except OSError as err: - err = str(err) - if err.find('No such process') > 0: - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - else: - print(str(err)) - sys.exit(1) - if self.verbose >= 1: - print('Stopped!') - - def restart(self): - self.stop() - self.start() - - def status(self): - pid = self.get_pid() - if pid: - if os.path.exists('/proc/%d' % pid): - return pid - return False - - -def parse_args(): - ''' Check script input parameter. ''' - parse = argparse.ArgumentParser(description='Sys monitor script') - parse.add_argument('-o', - type=str, - metavar='[operation]', - required=True, - help='start|stop|restart|status') - parse.add_argument('-l', - type=str, - metavar='[log_path]', - required=False, - default='./logs/', - help='log path') - args = parse.parse_args() - return args - - -def main(): - sample_rate1 = 5 - args = parse_args() - operation = args.o - log_path = args.l - pid_fn = str('/tmp/xpu_monitor.pid') - log_fn = str(log_path + '/kunlunxin_monitor.log') - err_fn = str(log_path + '/kunlunxin_monitor.err') - # result for gpu - gpu_fn = str(log_path + '/kunlunxin_monitor.log') - - subdaemon = Daemon(pid_fn, - log_fn, - err_fn, - gpu_fn, - log_path, - verbose=1, - rate=sample_rate1) - if operation == 'start': - subdaemon.start() - elif operation == 'stop': - subdaemon.stop() - elif operation == 'restart': - subdaemon.restart() - elif operation == 'status': - pid = subdaemon.status() - if pid: - print('process [%s] is running ......' % pid) - else: - print('daemon process [%s] stopped' % pid) - else: - print("invalid argument!") - sys.exit(1) - - -if __name__ == '__main__': - main() +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. 
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 gpu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.gpufile = gpu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.verbose = verbose
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override the method in subclass
+        '''
+
+        def gpu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'"    ## temp power mem w_mem use_rate
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            if process.returncode != 0:
+                result = "error"
+            result = TIMESTAMP + "\n xpu_smi " + out[0] + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_gpu_mon():
+            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
+            gpu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_gpu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
+
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.gpufile):
+            os.remove(self.gpufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check for a pid file to see if the daemon already runs
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            err = str(err)
+            if err.find('No such process') > 0:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+        if self.verbose >= 1:
+            print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Check script input parameter. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/xpu_monitor.pid')
+    log_fn = str(log_path + '/kunlunxin_monitor.log')
+    err_fn = str(log_path + '/kunlunxin_monitor.err')
+    # result for gpu
+    gpu_fn = str(log_path + '/kunlunxin_monitor.log')
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       gpu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
index 7227b9743..fa778e7e8 100644
--- a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
+++ b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
@@ -72,6 +72,7 @@ ENV TVM_DIR=/root/XTCL-ubuntu_x86_64
+
 ENV PATH /root/xre-ubuntu_2004_x86_64/bin:$PATH
 ENV PATH /root/miniconda/envs/python38/bin:$PATH
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
index 2643f51d5..3afbe9634 100755
--- a/inference/inference_engine/kunlunxin/xtcl.py
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -82,3 +82,4 @@ def __call__(self, model_inputs: list):
         return output_list, foo_time
+
diff --git a/inference/run.py b/inference/run.py
index a11fa4824..36cf49222 100644
--- a/inference/run.py
+++ b/inference/run.py
@@ -272,7 +272,7 @@ def start_monitors_in_cluster(dp_path, case_log_dir, nnodes):
     ven_mon_path = os.path.join(dp_path, "docker_images", config.VENDOR,
                                 config.VENDOR + "_monitor.py")
-    start_mon_cmd = "cd " + dp_path + " && " + sys.executable \
+    start_mon_cmd = "cd " + dp_path + " && sudo " + sys.executable \
                     + " " + ven_mon_path + " -o restart -l "
     logger.debug("Run cmd in the cluster to start vendor's monitors: " +
                  start_mon_cmd)
@@ -299,7 +299,7 @@ def stop_monitors_in_cluster(dp_path, nnodes):
     ven_mon_path = os.path.join(dp_path, "docker_images", config.VENDOR,
                                 config.VENDOR + "_monitor.py")
-    stop_mon_cmd = "cd " + dp_path + " && " + sys.executable \
+    stop_mon_cmd = "cd " + dp_path + " && sudo " + sys.executable \
                    + " " + ven_mon_path + " -o stop"
     logger.debug("Run cmd in the cluster to stop vendor's monitors: " +
                  stop_mon_cmd)

From dfc9a80041e43a3e4d2603563cdfb086d6220bef Mon Sep 17 00:00:00 2001
From: TWANG07 <91315832+TWANG07@users.noreply.github.com>
Date: Mon, 25 Sep 2023 16:34:45 +0800
Subject: [PATCH 15/18] support swin_transformer on XPU (#255)

* support swin_transformer on XPU

* support swin_transformer on XPU

---------

Co-authored-by: wangdongyu04
---
 .../swin_transformer-pytorch/README.md        | 25 +++++++++++++++++++
 .../config/config_R300x1x1.py                 |  3 +++
 .../config/config_R300x1x8.py                 |  3 +++
 .../config/config_R300x2x8.py                 |  3 +++
 .../config/config_common.py                   |  3 +++
 .../config/environment_variables.sh           | 25 +++++++++++++++++++
 .../config/requirements.txt                   |  8 ++++++
 .../swin_transformer-pytorch/extern/.gitkeep  |  0
 8 files changed, 70 insertions(+)
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/README.md
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_common.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/requirements.txt
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/extern/.gitkeep

diff --git a/training/kunlunxin/swin_transformer-pytorch/README.md b/training/kunlunxin/swin_transformer-pytorch/README.md
new file mode 100644
index 000000000..87adae7fd
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/README.md
@@ -0,0 +1,25 @@
+### 测试数据集下载
+参见[测试数据集下载](../../benchmarks/swin_transformer/README.md#数据集)
+
+### 昆仑芯 XPU 配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+    - 机器型号: 昆仑芯AI加速器组R480-X8
+    - 加速卡型号: 昆仑芯AI加速卡R300
+    - 多机网络类型、带宽: InfiniBand,200Gb/s
+
+- ##### 软件环境
+   - OS版本:Ubuntu 20.04
+   - OS kernel版本: 5.4.0-26-generic
+   - 加速卡驱动版本:4.0.25
+   - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01
+   - 训练框架版本:xmlir+9bb59e9e [xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/9bb59e9e/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)
+   - 训练编译器版本:xacc+9bb59e9e [xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/9bb59e9e/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)
+   - 依赖软件版本:pytorch-1.12.1+cpu
+
+### 运行情况
+| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s) |
+| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- |
+| R300单机单卡(1x1) | config_R300x1x1 |  |  |  |  |  |
+| R300单机8卡(1x8)  | config_R300x1x8 | 788401.61 | 81.00 | 80.598 | 1501200 | 555.28 |
+| R300两机8卡(2x8)  | config_R300x2x8 |  |  |  |  |  |
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py
new file mode 100644
index 000000000..52be8aa2f
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 32
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py
new file mode 100644
index 000000000..52be8aa2f
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 32
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py
new file mode 100644
index 000000000..52be8aa2f
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 32
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_common.py b/training/kunlunxin/swin_transformer-pytorch/config/config_common.py
new file mode 100644
index 000000000..60ac8485a
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_common.py
@@ -0,0 +1,3 @@
+vendor = "kunlunxin"
+dist_backend = "xccl"
+amp_enable = False
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh b/training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh
new file mode 100644
index 000000000..8c1fdbe0c
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh
@@ -0,0 +1,25 @@
+# =================================================
+# Export variables
+# =================================================
+
+export XMLIR_F_XPU_ENABLED_BOOL=true
+export XMLIR_TORCH_XCCL_ENABLED=true
+
+
+# =================================================
+# R480 config
+# =================================================
+
+export OMP_NUM_THREADS=1
+export XACC_ARGS="-L amp"
+export XACC=1
+export BKCL_PCIE_RING=1
+
+KLX_WEB_SERVER_URL=http://127.0.0.1:8000
+
+pip uninstall -y xacc || true
+pip install ${KLX_WEB_SERVER_URL}/flagperf/archives/9bb59e9e/xacc-0.1.0-cp38-cp38-linux_x86_64.whl
+pip uninstall -y xmlir || true
+pip install ${KLX_WEB_SERVER_URL}/flagperf/archives/9bb59e9e/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl
+
+python -m xacc.install
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/requirements.txt b/training/kunlunxin/swin_transformer-pytorch/config/requirements.txt
new file mode 100644
index 000000000..3151c03d6
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/requirements.txt
@@ -0,0 +1,8 @@
+psutil==5.9.5
+numpy>=1.15.4
+timm==0.4.12
+accelerate==0.20.3
+
+--find-links https://download.pytorch.org/whl/torch_stable.html
+torch==1.12.1+cpu
+torchvision==0.13.1+cpu
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/extern/.gitkeep b/training/kunlunxin/swin_transformer-pytorch/extern/.gitkeep
new file mode 100644
index 000000000..e69de29bb

From eb867959ce56b65ec6034d15567d4b1fa5920313 Mon Sep 17 00:00:00 2001
From: zjm <815496138@qq.com>
Date: Mon, 25 Sep 2023 16:36:37 +0800
Subject: [PATCH 16/18] Kunlunxin add stable diffusion v 1_4 case (#227)

* kunlunxin inference

* xtcl support fp16 onnx

* Add stable diffusion fp32 case

* kunlunxin add yolov5 case

* update resnet50 fp16 performance

* add stable_diffusion_v1_4 kunlunxin mem_usage

---------

Co-authored-by: zhaoyixuan02
Co-authored-by: zhoujiamin01
---
 inference/benchmarks/resnet50/README.md            |  5 +++--
 .../stable_diffusion_v1_4/README.md                |  1 +
 inference/benchmarks/yolov5/README.md              | 20 +++++++++++++++++++
 .../yolov5/pytorch/kunlunxin_requirements.txt      |  2 ++
 .../kunlunxin_configurations.yaml                  |  1 +
 .../kunlunxin_configurations.yaml                  |  3 +++
 .../kunlunxin/pytorch_1.13/Dockerfile              |  5 +++--
 inference/inference_engine/kunlunxin/xtcl.py       | 15 +++++++++-----
 8 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt
 create mode 100644 inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml

diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md
index 4e802f022..a1650ef8e 100644
--- a/inference/benchmarks/resnet50/README.md
+++ b/inference/benchmarks/resnet50/README.md
@@ -76,7 +76,7 @@ find ./val -name "*JPEG" | wc -l
 
 - 推理工具包
 
-   - XTCL 2.1
+   - XTCL daily 2023.09.23
 
 #### 2.3 天数智芯 MR-V100
@@ -139,6 +139,7 @@ find ./val -name "*JPEG" | wc -l
 | tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 |
 | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |
 | ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 |
-| kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / |
+| kunlunxin_xtcl | fp32 | 128 | / | / | / | / | / | 12.1% | 76.2/76.2 | 4.52/32.0 |
+| kunlunxin_xtcl | fp16 | 256 | 164.675 | / | / | 1566.407 | 3317.012 | 12.1% | 76.2/76.2 | 4.52/32.0 |
 | zixiao | fp16 | 32*6 | 261.103 | / | / | 193.151 | 6342.191 | / | 76.2/76.2 | / |
 
diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md
index 07aade914..0aa3ebb31 100644
--- a/inference/benchmarks/stable_diffusion_v1_4/README.md
+++ b/inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -56,5 +56,6 @@
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16 | 2 |1674.9 | 11.4 | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 |
 | tensorrt | fp32 | 2 | 1807.4 | 8.2 | 20.6 | 7.2 | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 |
+| kunlunxin_xtcl | fp32 | 2 | 213.822 | / | / | 4.755 | 9.471 | 20.1% | 26.524/25.3 | 0.07/32.0 |
 | null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 |
 | null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 |
diff --git a/inference/benchmarks/yolov5/README.md b/inference/benchmarks/yolov5/README.md
index 91354d40b..7e0ffa4df 100644
--- a/inference/benchmarks/yolov5/README.md
+++ b/inference/benchmarks/yolov5/README.md
@@ -53,6 +53,25 @@ find ./val -name "*JPEG" | wc -l
   - TensorRT 8.5.1.7
   - torch_tensorrt 1.3.0
 
+#### 2.2 昆仑芯R200
+
+- ##### 硬件环境
+    - 机器、加速卡型号: R200
+
+- ##### 软件环境
+   - OS版本:Ubuntu 20.04
+   - OS kernel版本: 5.15.0-56-generic
+   - 加速卡驱动版本:4.0
+   - Docker 版本:20.10.21
+   - 依赖软件版本:
+     - pytorch: 1.13.0+cpu
+     - onnx: 1.14.0
+     - pycocotools: 2.0.7
+
+- 推理工具包
+
+   - XTCL 2.1
+
 ### 3. 运行情况
 
 * 指标列表
@@ -75,3 +94,4 @@ find ./val -name "*JPEG" | wc -l
 | ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ------------ |----------- | ---------- |
 | tensorrt | fp32 | 96 | 733.8 | / | / | 53.8 | 361.4 |12.6%| 0.45 | 35.44/40.0 |
 | tensorrt | fp16 | 96 | 1665.8 | / | / | 58.6 | 859 |15.0%| 0.45 | 26.15/40.0 |
+| kunlunxin_xtcl | fp32 | 96 | / | / | / | / | / |18.9%| 0.451 | 26.42/32.0 |
diff --git a/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt b/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt
new file mode 100644
index 000000000..973355e88
--- /dev/null
+++ b/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt
@@ -0,0 +1,2 @@
+pycocotools
+opencv-python-headless
diff --git a/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml b/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
index 4b2b5ffcb..0cad8cab4 100644
--- a/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
+++ b/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
@@ -2,3 +2,4 @@ fp16: false
 compiler: xtcl
 no_validation: true
 exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
+resnet50_fuse: true
diff --git a/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml b/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..ed982f5f1
--- /dev/null
+++ b/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,3 @@
+fp16: false
+compiler: xtcl
+no_validation: true
diff --git a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
index fa778e7e8..a61287b38 100644
--- a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
+++ b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
@@ -46,11 +46,12 @@ RUN /root/miniconda/envs/python38/bin/pip install \
     munch \
     pyyaml \
     tqdm \
-    scipy
+    scipy \
+    opencv-python-headless
 
 RUN /root/miniconda/envs/python38/bin/pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu
 
-RUN cd /root && wget https://baidu-kunlun-public.su.bcebos.com/XTCL/XTCL-2.1/XTCL-ubuntu_x86_64.tar.gz && tar -xzf XTCL-ubuntu_x86_64.tar.gz
+RUN cd /root && wget https://baidu-kunlun-public.su.bcebos.com/XTCL/kunlunxin_xtcl_output_ubuntu1604_daily_0923.tar.gz && tar -xzf kunlunxin_xtcl_output_ubuntu1604_daily_0923.tar.gz && mv output/XTCL XTCL-ubuntu_x86_64
 
 RUN cd /root && wget https://klx-sdk-release-public.su.bcebos.com/xre/release/4.0.18.1/xre-ubuntu_2004_x86_64.tar.gz && tar -xzf xre-ubuntu_2004_x86_64.tar.gz
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
index 3afbe9634..5e38a7e41 100755
--- a/inference/inference_engine/kunlunxin/xtcl.py
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -26,7 +26,7 @@ def build_engine(self, config, onnx_path):
         for input in onnx_model.graph.input:
             input_shape = input.type.tensor_type.shape.dim
             input_shape = [a.dim_value for a in input_shape]
-            input_shape[0] = config.batch_size
+            #input_shape[0] = config.batch_size
             input_name = input.name  #'inputs:0'
             self.input_names.append(input_name)
             shape_dict[input_name] = input_shape
@@ -35,7 +35,11 @@ def build_engine(self, config, onnx_path):
 
         target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
         ctx = tvm.device("xpu", 0)
-        build_config = {}
+        build_config = {
+        }
+        #os.environ["XTCL_BUILD_DEBUG"] = '1'
+        if config.resnet50_fuse:
+            os.environ["XTCL_FUSE_RES50V15"] = '1'
         if config.fp16 == True:
             os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
             input_fp16 = { name:"float16" for name in self.input_names}
@@ -47,6 +51,7 @@ def build_engine(self, config, onnx_path):
                 config_var_dtype_map=input_fp16,
             ).value()
         else: ## fp32
+            os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
             os.environ['XTCL_USE_FP16'] = '1'
             os.environ['XTCL_QUANTIZE_WEIGHT'] = '1'
 
@@ -70,12 +75,12 @@ def build_engine(self, config, onnx_path):
     def __call__(self, model_inputs: list):
         for index, input_name in enumerate(self.input_names):
             if USE_VM_COMPILE:
-                self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index]))
+                self.engine.set_one_input("main",input_name, model_inputs[index].numpy())
             else:
-                self.engine.set_input(input_name, tvm.nd.array(model_inputs[index]))
+                self.engine.set_input(input_name, model_inputs[index].numpy())
         self.engine.run()
-        output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         foo_time_start = time.time()
+        output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         # d2h
         output_list = [torch.from_numpy(output.asnumpy()) for output in output_list]
         foo_time = time.time() - foo_time_start

From 6b0ae6ce69e6df7d9a4eb93a08024fef416df856 Mon Sep 17 00:00:00 2001
From: liuyumoye <452803476@qq.com>
Date: Mon, 25 Sep 2023 16:37:58 +0800
Subject: [PATCH 17/18] kunlunxin swinTransformer inference configs && results (#243)

* kunlunxin swinTransformer inference configs && results

* kunlunxin swinTransformer inference configs && results
{'vendor': 'kunlunxin', 'compiler': 'xtcl', 'precision': 'fp32', 'batchsize': 256, 'flops': 723982880000.0, 'e2e_time(second)': 543.745, 'p_validation_whole(qps)': None, 'p_validation_core(qps)': None, 'p_inference_whole(qps)': 166.937, '*p_inference_core(qps)': 175.724, 'val_average_acc': None, 'infer_average_acc': 0.832}

---------

Co-authored-by: SHIHONGHAO <13820618441@163.com>
---
 inference/benchmarks/swinTransformer/README.md     |  2 +-
 .../configs/swinTransformer/configurations.yaml    |  2 +-
 .../vendor_config/kunlunxin_configurations.yaml    | 16 ++++++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml

diff --git a/inference/benchmarks/swinTransformer/README.md b/inference/benchmarks/swinTransformer/README.md
index 04a97a3a6..14304fed9 100644
--- a/inference/benchmarks/swinTransformer/README.md
+++ b/inference/benchmarks/swinTransformer/README.md
@@ -84,4 +84,4 @@ find ./val -name "*JPEG" | wc -l
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16 | 512 |1011.7 | 1347.5 | 1511.3 | 1231.7 | 1359.1 | 6.8% | 81.7/83.2 | 19.9/40.0 |
 | tensorrt | fp32 | 256 | 856.9 | 761.5 | 794.3 | 789.2 | 826.4 | 8.2% | 83.2/83.2 | 20.0/40.0 |
-
+| kunlunxin_xtcl| W32A16 | 256 | 543.745 | / | / | / | / | / | 0.832 | / |
diff --git a/inference/configs/swinTransformer/configurations.yaml b/inference/configs/swinTransformer/configurations.yaml
index 1b7dd0607..66a36a9ff 100644
--- a/inference/configs/swinTransformer/configurations.yaml
+++ b/inference/configs/swinTransformer/configurations.yaml
@@ -13,4 +13,4 @@ no_validation: false
 # set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt)
 exist_onnx_path: null
 # set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine
-exist_compiler_path: null
\ No newline at end of file
+exist_compiler_path: null
diff --git a/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml b/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..209ee7821
--- /dev/null
+++ b/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,16 @@
+batch_size: 256
+# 1 item(like 1 sequence, 1 image) flops
+# Attention! For transformer decoder like bert, 1 token cause 2*param flops, so we need 2*length*params like 2*512*0.33B here
+# format: a_1*a*2*...*a_nea_0,like 2*512*0.33e9(bert) or 4.12e9(resnet50)
+flops: 1.55e10
+fp16: false
+compiler: xtcl
+num_workers: 8
+log_freq: 30
+repeat: 5
+# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null
+no_validation: true
+# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt)
+exist_onnx_path: /home/liuyu/flagperf/FlagPerf/inference/onnxs/kunlunxin_flagperf_swinTransformer/swinTransformer_bs256_pytorch_fp16False.onnx
+# set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null

From 98d85df9eb477fa444c4a3f405096bec66c9289e Mon Sep 17 00:00:00 2001
From: Quanfeng Li
Date: Mon, 25 Sep 2023 16:51:39 +0800
Subject: [PATCH 18/18] kunlunxin sam_h (#244)

---
 inference/benchmarks/sam_h/README.md          | 18 +++++
 inference/benchmarks/sam_h/pytorch/forward.py |  3 +-
 .../kunlunxin_configurations.yaml             |  1 +
 .../kunlunxin_configurations.yaml             | 10 +++
 inference/inference_engine/kunlunxin/xtcl.py  | 70 ++++++++-----------
 5 files changed, 60 insertions(+), 42 deletions(-)
 create mode 100644 inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml

diff --git a/inference/benchmarks/sam_h/README.md b/inference/benchmarks/sam_h/README.md
index 02d6cf352..3ab043f48 100644
--- a/inference/benchmarks/sam_h/README.md
+++ b/inference/benchmarks/sam_h/README.md
@@ -36,6 +36,24 @@
 
   - TensorRT 8.6.1
 
+#### 2.2 昆仑芯R200
+
+- ##### 硬件环境
+    - 机器、加速卡型号: R200
+
+- ##### 软件环境
+   - OS版本:Ubuntu 20.04
+   - OS kernel版本: 5.15.0-56-generic
+   - 加速卡驱动版本:4.0
+   - Docker 版本:20.10.21
+   - 依赖软件版本:
+     - pytorch: 1.13.0+cpu
+     - onnx: 1.14.0
+
+- 推理工具包
+
+   - XTCL 2.0.0.67
+
 ### 3. 运行情况
 
 * 指标列表
diff --git a/inference/benchmarks/sam_h/pytorch/forward.py b/inference/benchmarks/sam_h/pytorch/forward.py
index df61177fa..9ff355c68 100644
--- a/inference/benchmarks/sam_h/pytorch/forward.py
+++ b/inference/benchmarks/sam_h/pytorch/forward.py
@@ -84,7 +84,6 @@ def engine_forward(model, dataloader, evaluator, config):
     for step, (x, y, osize, dsize) in enumerate(dataloader):
         if config.fp16:
             x = x.to(torch.float16)
-            y = y.to(torch.float16)
 
         torch_sync(config)
         core_time_start = time.time()
@@ -101,7 +100,7 @@ def engine_forward(model, dataloader, evaluator, config):
         torch_sync(config)
         core_time += time.time() - core_time_start
 
-        pred = pred[0]
+        pred = pred[1]
         pred = pred.reshape(config.batch_size, 1, 3, 256, 256).float()
         pred = pred.cpu()
 
diff --git a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
index c29b9c46b..7cb3e921a 100644
--- a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
+++ b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
@@ -1,3 +1,4 @@
 compiler: xtcl
 no_validation: true
+vm_enable: false
 exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx
diff --git a/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml b/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..81b04fceb
--- /dev/null
+++ b/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,10 @@
+compiler: xtcl
+no_validation: true
+build_config:
+  FuseWithoutPattern:
+    - FuseConv2dTransposeBiasAdd
+  pattern_match:
+    - fuse_attention_sam
+disabled_pass:
+  - xgraph_layout_opt
+exist_onnx_path: onnxs/sam_h_bs4_pytorch_fp16True.onnx
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
index 5e38a7e41..eb31dfe06 100755
--- a/inference/inference_engine/kunlunxin/xtcl.py
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -1,33 +1,28 @@
+import os
+import time
+
 import onnx
+import torch
 import tvm
 import tvm.relay as relay
-from tvm.contrib.download import download_testdata
-from tvm.relay import param_dict
 from tvm.contrib import graph_executor, xpu_config
+from tvm.relay.xpu.patterns import custom_fuse_patterns
 from tvm.runtime.vm import VirtualMachine
-import torch
-import os
-import subprocess
-from loguru import logger
-import numpy as np
-import time
-
-USE_VM_COMPILE = False
 
 
 class InferModel:
-    def __init__(self, config , onnx_path, model):
+    def __init__(self, config, onnx_path, model):
         self.input_names = []
         self.engine = self.build_engine(config, onnx_path)
+        self.vm_enable = True
 
     def build_engine(self, config, onnx_path):
         onnx_model = onnx.load(onnx_path)
         shape_dict = {}
-        for input in onnx_model.graph.input:
-            input_shape = input.type.tensor_type.shape.dim
-            input_shape = [a.dim_value for a in input_shape]
-            #input_shape[0] = config.batch_size
-            input_name = input.name  #'inputs:0'
+        for inp in onnx_model.graph.input:
+            input_name, input_shape, _, _ = relay.frontend.onnx.get_info(inp)
+            input_shape[0] = config.batch_size
             self.input_names.append(input_name)
             shape_dict[input_name] = input_shape
@@ -35,56 +30,51 @@ def build_engine(self, config, onnx_path):
 
         target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
         ctx = tvm.device("xpu", 0)
-        build_config = {
-        }
+        build_config = config.build_config if 'build_config' in config._fields else {}
+        disabled_pass = config.disabled_pass if 'disabled_pass' in config._fields else []
+        self.vm_enable = config.vm_enable if 'vm_enable' in config._fields else True
+        if "pattern_match" in build_config:
+            build_config["XPUFuzzyMatch"] = xpu_config.XPUGraphMatchConfig(
+                pattern_match=build_config["pattern_match"]).value()
+            del build_config["pattern_match"]
         #os.environ["XTCL_BUILD_DEBUG"] = '1'
         if config.resnet50_fuse:
            os.environ["XTCL_FUSE_RES50V15"] = '1'
         if config.fp16 == True:
             os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
-            input_fp16 = { name:"float16" for name in self.input_names}
             build_config["XPUOutDtypeConfig"] = xpu_config.XPUOutDtypeConfig(
-                default_precision="float16",
-                config_last_node=True,
-                config_map={
-                },
-                config_var_dtype_map=input_fp16,
-            ).value()
+                default_precision="float16",
+                config_last_node=True,
+                config_map={},
+            ).value()
         else: ## fp32
             os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
 
             os.environ['XTCL_USE_FP16'] = '1'
             os.environ['XTCL_QUANTIZE_WEIGHT'] = '1'
 
-        with tvm.transform.PassContext(opt_level=3, config=build_config):
-            if USE_VM_COMPILE:
-                vm_exec = relay.backend.vm.compile(mod,
-                                                   target=target_host,
-                                                   target_host=target_host,
-                                                   params=params)
-
+        with tvm.transform.PassContext(opt_level=3, config=build_config, disabled_pass=disabled_pass):
+            if self.vm_enable:
+                vm_exec = relay.backend.vm.compile(mod, target=target_host, target_host=target_host, params=params)
                 vm = VirtualMachine(vm_exec, ctx)
                 return vm
             else:
                 graph, lib, params = relay.build(mod,
-                    target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2",
-                    params=params)
+                                                 target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2",
+                                                 params=params)
                 m = graph_executor.create(graph, lib, ctx)
                 m.set_input(**params)
                 return m
 
     def __call__(self, model_inputs: list):
         for index, input_name in enumerate(self.input_names):
-            if USE_VM_COMPILE:
-                self.engine.set_one_input("main",input_name, model_inputs[index].numpy())
+            if self.vm_enable:
+                self.engine.set_one_input("main", input_name, model_inputs[index].numpy())
             else:
-                self.engine.set_input(input_name, model_inputs[index].numpy())
+                self.engine.set_input(input_name, tvm.nd.array(model_inputs[index]))
         self.engine.run()
         foo_time_start = time.time()
         output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         # d2h
-        output_list = [torch.from_numpy(output.asnumpy()) for output in output_list]
+        output_list = [torch.from_numpy(output.numpy()) for output in output_list]
        foo_time = time.time() - foo_time_start
 
         return output_list, foo_time
-
-
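Note on the final xtcl.py rewrite: the adapter above reads all Kunlunxin-specific XTCL tuning from the per-model vendor YAML — `vm_enable` selects the VM or graph_executor path, `build_config`/`pattern_match` feed `XPUGraphMatchConfig`, and `disabled_pass` is forwarded to the TVM `PassContext`. The snippet below is a minimal illustrative sketch of the calling convention the class expects (it checks optional fields via `config._fields`); the `KunlunxinConfig` namedtuple and the concrete field values are stand-ins for FlagPerf's parsed configuration and are not part of the patch.

# sketch.py — illustrative only; requires the XTCL-enabled TVM stack to actually run InferModel
from collections import namedtuple

# Stand-in for FlagPerf's parsed config object; InferModel probes optional
# fields with `'name' in config._fields`, so a namedtuple satisfies the interface.
KunlunxinConfig = namedtuple(
    "KunlunxinConfig",
    ["batch_size", "fp16", "resnet50_fuse", "vm_enable", "build_config", "disabled_pass"])

cfg = KunlunxinConfig(
    batch_size=4,
    fp16=True,
    resnet50_fuse=False,                      # only set true for the resnet50 case
    vm_enable=True,                           # False would pick the graph_executor path
    build_config={"FuseWithoutPattern": ["FuseConv2dTransposeBiasAdd"],
                  "pattern_match": ["fuse_attention_sam"]},   # mirrors the sam_h YAML
    disabled_pass=["xgraph_layout_opt"])

# Hypothetical usage (commented out because it needs an XPU device and the onnx file):
# engine = InferModel(cfg, "onnxs/sam_h_bs4_pytorch_fp16True.onnx", model=None)
# outputs, d2h_time = engine([some_input_tensor])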