From 98b2d468d0028f1646d164a262df280ce5ca455f Mon Sep 17 00:00:00 2001
From: root <root@szzj-isa-ai-chip0.szzj.baidu.com>
Date: Wed, 27 Sep 2023 10:53:30 +0800
Subject: [PATCH 1/5] Fit gpt2 on kunlunxin

---
 training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py  | 4 ++--
 training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py | 3 +++
 training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py | 4 ++++
 training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py | 3 +++
 training/kunlunxin/gpt2-pytorch/config/config_common.py   | 6 ++++++
 training/kunlunxin/gpt2-pytorch/extern/.gitkeep           | 0
 6 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py
 create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py
 create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py
 create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_common.py
 create mode 100644 training/kunlunxin/gpt2-pytorch/extern/.gitkeep

diff --git a/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py
index e2f053054..fcd247ed8 100644
--- a/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py
+++ b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py
@@ -62,9 +62,9 @@ def clip_grad_norm_fp32(parameters, grads_for_norm,
             # Multi-tensor applier takes a function and a list of list
             # and performs the operation on that list all in one kernel.
             if grads_for_norm:
-                grad_norm = torch.cuda.FloatTensor([item.norm() for item in grads_for_norm]).norm()
+                grad_norm = torch.FloatTensor([item.norm() for item in grads_for_norm]).norm().cuda()
             else:
-                grad_norm = torch.cuda.FloatTensor([0])
+                grad_norm = torch.FloatTensor([0]).cuda()
             # Since we will be summing across data parallel groups,
             # we need the pow(norm-type).
             total_norm = grad_norm ** norm_type
diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py
new file mode 100644
index 000000000..7dbc0150e
--- /dev/null
+++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 2
diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py
new file mode 100644
index 000000000..aaf855643
--- /dev/null
+++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py
@@ -0,0 +1,4 @@
+from config_common import *
+
+train_batch_size = 2
+max_steps = 46140
\ No newline at end of file
diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py
new file mode 100644
index 000000000..7dbc0150e
--- /dev/null
+++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 2
diff --git a/training/kunlunxin/gpt2-pytorch/config/config_common.py b/training/kunlunxin/gpt2-pytorch/config/config_common.py
new file mode 100644
index 000000000..25d1c8984
--- /dev/null
+++ b/training/kunlunxin/gpt2-pytorch/config/config_common.py
@@ -0,0 +1,6 @@
+vendor = 'kunlunxin'
+
+# disable fp16
+fp16 = False
+
+dist_backend = "xccl"
\ No newline at end of file
diff --git a/training/kunlunxin/gpt2-pytorch/extern/.gitkeep b/training/kunlunxin/gpt2-pytorch/extern/.gitkeep
new file mode 100644
index 000000000..e69de29bb

From e4ef5fe549d2bdce580f6d6d40e0c71f0c5a98fe Mon Sep 17 00:00:00 2001
From: root <root@szzj-isa-ai-chip0.szzj.baidu.com>
Date: Wed, 27 Sep 2023 11:20:48 +0800
Subject: [PATCH 2/5] Add kunlunxin readme

---
 training/kunlunxin/gpt2-pytorch/README.md | 46 +++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 training/kunlunxin/gpt2-pytorch/README.md

diff --git a/training/kunlunxin/gpt2-pytorch/README.md b/training/kunlunxin/gpt2-pytorch/README.md
new file mode 100644
index 000000000..f1c0455b4
--- /dev/null
+++ b/training/kunlunxin/gpt2-pytorch/README.md
@@ -0,0 +1,46 @@
+### 测试数据集下载
+[测试数据集下载](../../benchmarks/gpt2/README.md#测试数据集下载)
+
+### 昆仑芯XPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+  - 机器型号: 昆仑芯AI加速器组R480-X8
+  - 加速卡型号: 昆仑芯AI加速卡R300
+  - 多机网络类型、带宽: InfiniBand，200Gb/s
+
+- ##### 软件环境
+  - OS版本：Ubuntu 20.04
+  - OS kernel版本: 5.4.0-26-generic
+  - 加速卡驱动版本：4.0.25
+  - Docker镜像和版本：pytorch1.12.1-cpu-ubuntu20.04:v0.01
+  - 训练框架版本：xmlir
+  - 训练编译器版本：xacc
+  - 依赖软件版本：pytorch-1.12.1+cpu
+
+### 运行情况
+
+* 通用指标
+
+| 指标名称       | 指标值                  | 特殊说明                                    |
+| -------------- | ----------------------- | ------------------------------------------- |
+| 任务类别       | 自然语言编码            |                                             |
+| 模型           | megatron-gpt2-345m      |                                             |
+| 数据集         | lambada               |                                             |
+| 数据精度       | precision,见“性能指标”  | 可选fp32/amp/fp16                           |
+| 超参修改       | fix_hp,见“性能指标”     | 跑满硬件设备评测吞吐量所需特殊超参          |
+| 硬件设备简称   | R300             |                                             |
+| 硬件存储使用   | mem,见“性能指标”        | 通常称为“显存”,单位为GiB                    |
+| 端到端时间     | e2e_time,见“性能指标”   | 总时间+Perf初始化等时间                     |
+| 总吞吐量       | p_whole,见“性能指标”    | 实际训练序列数除以总时间(performance_whole) |
+| 训练吞吐量     | p_train,见“性能指标”    | 不包含每个epoch末尾的评估部分耗时           |
+| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1)，单位为samples/s(seq_length=1024)|
+| 训练结果       | lambada_acc,见“性能指标”    | lambada任务准确率                         |                                      |
+
+* 性能指标
+
+| 配置                | precision | fix_hp           | e2e_time | p_whole | p_train | p_core | lambada_acc | mem       |
+| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ------- | --------- |
+| R300单机单卡（1x1） |      |    |          |         |         |         |        |         |
+| R300单机8卡（1x8）  |  fp32      | bs=32,lr=0.00015 |     |     |     |  |  0.60 | 20.7/32.0 |
+| R300两机8卡（2x8）  |       |    |           |         |         |         |       |  |
+

From 933b288168d9f586c667c685fe172865611dab4c Mon Sep 17 00:00:00 2001
From: root <root@szzj-isa-ai-chip0.szzj.baidu.com>
Date: Wed, 27 Sep 2023 11:28:38 +0800
Subject: [PATCH 3/5] Refine task kind in kunlunxin and nvidia READMEs

---
 training/kunlunxin/gpt2-pytorch/README.md | 2 +-
 training/nvidia/gpt2-pytorch/README.md    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/kunlunxin/gpt2-pytorch/README.md b/training/kunlunxin/gpt2-pytorch/README.md
index f1c0455b4..07f41544e 100644
--- a/training/kunlunxin/gpt2-pytorch/README.md
+++ b/training/kunlunxin/gpt2-pytorch/README.md
@@ -23,7 +23,7 @@
 
 | 指标名称       | 指标值                  | 特殊说明                                    |
 | -------------- | ----------------------- | ------------------------------------------- |
-| 任务类别       | 自然语言编码            |                                             |
+| 任务类别       | Text2Text Generation            |                                             |
 | 模型           | megatron-gpt2-345m      |                                             |
 | 数据集         | lambada               |                                             |
 | 数据精度       | precision,见“性能指标”  | 可选fp32/amp/fp16                           |
diff --git a/training/nvidia/gpt2-pytorch/README.md b/training/nvidia/gpt2-pytorch/README.md
index 2b981c65b..a52d11a79 100644
--- a/training/nvidia/gpt2-pytorch/README.md
+++ b/training/nvidia/gpt2-pytorch/README.md
@@ -21,7 +21,7 @@
 
 | 指标名称       | 指标值                  | 特殊说明                                    |
 | -------------- | ----------------------- | ------------------------------------------- |
-| 任务类别       | 自然语言编码            |                                             |
+| 任务类别       | Text2Text Generation            |                                             |
 | 模型           | megatron-gpt2-345m      |                                             |
 | 数据集         | lambada               |                                             |
 | 数据精度       | precision,见“性能指标”  | 可选fp32/amp/fp16                           |

From a7d6b3ce85e183ff1a71d5f372b0d80814475e8a Mon Sep 17 00:00:00 2001
From: root <root@szzj-isa-ai-chip0.szzj.baidu.com>
Date: Sat, 7 Oct 2023 10:59:07 +0800
Subject: [PATCH 4/5] Fix unit of p_whole in README.md

---
 training/kunlunxin/gpt2-pytorch/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/kunlunxin/gpt2-pytorch/README.md b/training/kunlunxin/gpt2-pytorch/README.md
index 07f41544e..0e7f6b27b 100644
--- a/training/kunlunxin/gpt2-pytorch/README.md
+++ b/training/kunlunxin/gpt2-pytorch/README.md
@@ -31,7 +31,7 @@
 | 硬件设备简称   | R300             |                                             |
 | 硬件存储使用   | mem,见“性能指标”        | 通常称为“显存”,单位为GiB                    |
 | 端到端时间     | e2e_time,见“性能指标”   | 总时间+Perf初始化等时间                     |
-| 总吞吐量       | p_whole,见“性能指标”    | 实际训练序列数除以总时间(performance_whole) |
+| 总吞吐量       | p_whole,见“性能指标”    | 实际训练样本数除以总时间(performance_whole) |
 | 训练吞吐量     | p_train,见“性能指标”    | 不包含每个epoch末尾的评估部分耗时           |
 | **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1)，单位为samples/s(seq_length=1024)|
 | 训练结果       | lambada_acc,见“性能指标”    | lambada任务准确率                         |                                      |

From 7b97382e5682d2504b86ddc2241b9491b8d0a7fa Mon Sep 17 00:00:00 2001
From: root <root@szzj-isa-ai-chip0.szzj.baidu.com>
Date: Sat, 7 Oct 2023 14:35:51 +0800
Subject: [PATCH 5/5] Refine 1x1 config

---
 training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py
index 7dbc0150e..0d955b5c4 100644
--- a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py
+++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py
@@ -1,3 +1,5 @@
 from config_common import *
 
 train_batch_size = 2
+max_steps = 369120
+gradient_accumulation_steps = 8
\ No newline at end of file