From 98b2d468d0028f1646d164a262df280ce5ca455f Mon Sep 17 00:00:00 2001 From: root Date: Wed, 27 Sep 2023 10:53:30 +0800 Subject: [PATCH 1/5] Fit gpt2 on kunlunxin --- training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py | 4 ++-- training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py | 3 +++ training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py | 4 ++++ training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py | 3 +++ training/kunlunxin/gpt2-pytorch/config/config_common.py | 6 ++++++ training/kunlunxin/gpt2-pytorch/extern/.gitkeep | 0 6 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py create mode 100644 training/kunlunxin/gpt2-pytorch/config/config_common.py create mode 100644 training/kunlunxin/gpt2-pytorch/extern/.gitkeep diff --git a/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py index e2f053054..fcd247ed8 100644 --- a/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py +++ b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py @@ -62,9 +62,9 @@ def clip_grad_norm_fp32(parameters, grads_for_norm, # Multi-tensor applier takes a function and a list of list # and performs the operation on that list all in one kernel. if grads_for_norm: - grad_norm = torch.cuda.FloatTensor([item.norm() for item in grads_for_norm]).norm() + grad_norm = torch.FloatTensor([item.norm() for item in grads_for_norm]).norm().cuda() else: - grad_norm = torch.cuda.FloatTensor([0]) + grad_norm = torch.FloatTensor([0]).cuda() # Since we will be summing across data parallel groups, # we need the pow(norm-type). total_norm = grad_norm ** norm_type diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py new file mode 100644 index 000000000..7dbc0150e --- /dev/null +++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py @@ -0,0 +1,3 @@ +from config_common import * + +train_batch_size = 2 diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py new file mode 100644 index 000000000..aaf855643 --- /dev/null +++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x8.py @@ -0,0 +1,4 @@ +from config_common import * + +train_batch_size = 2 +max_steps = 46140 \ No newline at end of file diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py new file mode 100644 index 000000000..7dbc0150e --- /dev/null +++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x2x8.py @@ -0,0 +1,3 @@ +from config_common import * + +train_batch_size = 2 diff --git a/training/kunlunxin/gpt2-pytorch/config/config_common.py b/training/kunlunxin/gpt2-pytorch/config/config_common.py new file mode 100644 index 000000000..25d1c8984 --- /dev/null +++ b/training/kunlunxin/gpt2-pytorch/config/config_common.py @@ -0,0 +1,6 @@ +vendor = 'kunlunxin' + +# disable fp16 +fp16 = False + +dist_backend = "xccl" \ No newline at end of file diff --git a/training/kunlunxin/gpt2-pytorch/extern/.gitkeep b/training/kunlunxin/gpt2-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb From e4ef5fe549d2bdce580f6d6d40e0c71f0c5a98fe Mon Sep 17 00:00:00 2001 From: root Date: Wed, 27 Sep 2023 11:20:48 +0800 Subject: [PATCH 2/5] Add kunlunxin readme --- training/kunlunxin/gpt2-pytorch/README.md | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 training/kunlunxin/gpt2-pytorch/README.md diff --git a/training/kunlunxin/gpt2-pytorch/README.md b/training/kunlunxin/gpt2-pytorch/README.md new file mode 100644 index 000000000..f1c0455b4 --- /dev/null +++ b/training/kunlunxin/gpt2-pytorch/README.md @@ -0,0 +1,46 @@ +### 测试数据集下载 +[测试数据集下载](../../benchmarks/gpt2/README.md#测试数据集下载) + +### 昆仑芯XPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: 昆仑芯AI加速器组R480-X8 + - 加速卡型号: 昆仑芯AI加速卡R300 + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:4.0.25 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本:xmlir + - 训练编译器版本:xacc + - 依赖软件版本:pytorch-1.12.1+cpu + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 自然语言编码 | | +| 模型 | megatron-gpt2-345m | | +| 数据集 | lambada | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练序列数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=1024)| +| 训练结果 | lambada_acc,见“性能指标” | lambada任务准确率 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | lambada_acc | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ------- | --------- | +| R300单机单卡(1x1) | | | | | | | | | +| R300单机8卡(1x8) | fp32 | bs=32,lr=0.00015 | | | | | 0.60 | 20.7/32.0 | +| R300两机8卡(2x8) | | | | | | | | | + From 933b288168d9f586c667c685fe172865611dab4c Mon Sep 17 00:00:00 2001 From: root Date: Wed, 27 Sep 2023 11:28:38 +0800 Subject: [PATCH 3/5] Refine task kind kunlunxin readme --- training/kunlunxin/gpt2-pytorch/README.md | 2 +- training/nvidia/gpt2-pytorch/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/gpt2-pytorch/README.md b/training/kunlunxin/gpt2-pytorch/README.md index f1c0455b4..07f41544e 100644 --- a/training/kunlunxin/gpt2-pytorch/README.md +++ b/training/kunlunxin/gpt2-pytorch/README.md @@ -23,7 +23,7 @@ | 指标名称 | 指标值 | 特殊说明 | | -------------- | ----------------------- | ------------------------------------------- | -| 任务类别 | 自然语言编码 | | +| 任务类别 | Text2Text Generation | | | 模型 | megatron-gpt2-345m | | | 数据集 | lambada | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | diff --git a/training/nvidia/gpt2-pytorch/README.md b/training/nvidia/gpt2-pytorch/README.md index 2b981c65b..a52d11a79 100644 --- a/training/nvidia/gpt2-pytorch/README.md +++ b/training/nvidia/gpt2-pytorch/README.md @@ -21,7 +21,7 @@ | 指标名称 | 指标值 | 特殊说明 | | -------------- | ----------------------- | ------------------------------------------- | -| 任务类别 | 自然语言编码 | | +| 任务类别 | Text2Text Generation | | | 模型 | megatron-gpt2-345m | | | 数据集 | lambada | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | From a7d6b3ce85e183ff1a71d5f372b0d80814475e8a Mon Sep 17 00:00:00 2001 From: root Date: Sat, 7 Oct 2023 10:59:07 +0800 Subject: [PATCH 4/5] Fix unit of p_whole in README.md --- training/kunlunxin/gpt2-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/gpt2-pytorch/README.md b/training/kunlunxin/gpt2-pytorch/README.md index 07f41544e..0e7f6b27b 100644 --- a/training/kunlunxin/gpt2-pytorch/README.md +++ b/training/kunlunxin/gpt2-pytorch/README.md @@ -31,7 +31,7 @@ | 硬件设备简称 | R300 | | | 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | -| 总吞吐量 | p_whole,见“性能指标” | 实际训练序列数除以总时间(performance_whole) | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | | 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | | **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=1024)| | 训练结果 | lambada_acc,见“性能指标” | lambada任务准确率 | | From 7b97382e5682d2504b86ddc2241b9491b8d0a7fa Mon Sep 17 00:00:00 2001 From: root Date: Sat, 7 Oct 2023 14:35:51 +0800 Subject: [PATCH 5/5] Refine 1x1 config --- training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py index 7dbc0150e..0d955b5c4 100644 --- a/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/gpt2-pytorch/config/config_R300x1x1.py @@ -1,3 +1,5 @@ from config_common import * train_batch_size = 2 +max_steps = 369120 +gradient_accumulation_steps = 8 \ No newline at end of file