From f1c8c3ba7f0e138e201b7a61d4fcdbb459269e98 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Mon, 11 Sep 2023 13:20:30 +0800 Subject: [PATCH 01/18] Fix kunlunxin-glm training. (#242) * Fix kunlunxin GLM training configs * Relocate xacc install logic * Modify max_steps for config 1x1 and 2x8 --- training/kunlunxin/docker_image/pytorch/pytorch_install.sh | 2 ++ training/kunlunxin/glm-pytorch/config/config_R300x1x1.py | 2 +- training/kunlunxin/glm-pytorch/config/config_R300x2x8.py | 2 +- .../kunlunxin/glm-pytorch/config/environment_variables.sh | 2 ++ training/kunlunxin/glm-pytorch/config/requirements.txt | 1 + training/kunlunxin/glm-pytorch/extern/trainer_adapter.py | 5 +++-- 6 files changed, 10 insertions(+), 4 deletions(-) diff --git a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh index 850a304b4..2a96fe267 100644 --- a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh +++ b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh @@ -4,3 +4,5 @@ set -xe pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl + +python -m xacc.install diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index 8a4c96915..cd9afdd40 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -14,4 +14,4 @@ lr_decay_iters = 4338 log_freq = 1 seed = 4096 -max_samples_termination = 925510 +max_samples_termination = 4000 diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index 840259660..fa79a403e 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -14,4 +14,4 @@ lr_decay_iters = 4338 log_freq = 1 seed = 4096 -max_samples_termination = 2776540 +max_samples_termination = 20000 diff --git a/training/kunlunxin/glm-pytorch/config/environment_variables.sh b/training/kunlunxin/glm-pytorch/config/environment_variables.sh index 9c9f20b8e..b527e7873 100755 --- a/training/kunlunxin/glm-pytorch/config/environment_variables.sh +++ b/training/kunlunxin/glm-pytorch/config/environment_variables.sh @@ -17,3 +17,5 @@ export ALLREDUCE_FUSION=0 export XMLIR_F_XPU_FC_GEMM_MODE=float export XMLIR_F_FAST_INDEX_PUT=true + +export XACC_ENABLE=1 diff --git a/training/kunlunxin/glm-pytorch/config/requirements.txt b/training/kunlunxin/glm-pytorch/config/requirements.txt index 8bac0066c..46109702b 100644 --- a/training/kunlunxin/glm-pytorch/config/requirements.txt +++ b/training/kunlunxin/glm-pytorch/config/requirements.txt @@ -2,3 +2,4 @@ h5sparse boto3 h5py numpy>=1.15.4 +psutil diff --git a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py index 3ac97e41a..719532ede 100644 --- a/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py +++ b/training/kunlunxin/glm-pytorch/extern/trainer_adapter.py @@ -1,13 +1,13 @@ import torch -import config - from torch import nn import torch.distributed as dist +import config from optimizers import get_optimizer_param_groups from optimizers.loss_scaler import DynamicLossScaler from driver.dist_pytorch import main_proc_print +import torch_xmlir 
import torch_xmlir.core.xpu_model as xm from torch_xmlir.optimizer import AdamW as Adam from torch_xmlir.nn.clip_grad import clip_grad_norm @@ -79,4 +79,5 @@ def _clip_grad(): if DynamicLossScaler._has_inf_or_nan(reduced_loss): main_proc_print("Found NaN loss, skip backward") + torch_xmlir.xpu.empty_cache() return reduced_loss From 9603550e72c037bbe60b5a641c53dc879920f2c6 Mon Sep 17 00:00:00 2001 From: Zhou Yu Date: Thu, 14 Sep 2023 11:40:06 +0800 Subject: [PATCH 02/18] glm: fix dataset url (#248) Co-authored-by: zhouyu --- training/benchmarks/glm/README.md | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/training/benchmarks/glm/README.md b/training/benchmarks/glm/README.md index 0481948e9..2dfa685bd 100644 --- a/training/benchmarks/glm/README.md +++ b/training/benchmarks/glm/README.md @@ -13,20 +13,38 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of ### 数据集 -- 数据集下载地址 -> `https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip` +- 数据集及checkpoint下载地址 +>`https://model.baai.ac.cn/model-detail/100097` +> 文件名:`glm_train_dataset.zip` - 预处理 -> 无需预处理 +- 无需预处理,解压缩数据集即可。 +```bash +unzip glm_train_dataset.zip +``` +解压后的目录结构 +```bash +├── ReCoRD +│ └── glm_train_eval_hdf5_sparse +│ ├── eval_hdf5 +│ │ └── eval_sparse.hdf5 +│ └── train_hdf5 +│ └── train_sparse.hdf5 +├── blocklm-large-blank +│ ├── 200000 +│ │ └── mp_rank_00_model_states.pt +│ └── latest_checkpointed_iteration.txt +``` + + -### 模型checkpoint -> `https://cloud.tsinghua.edu.cn/d/13f5b03da9594e5490c4/files/?p=%2Fglm-large-blank.tar.bz2` ### 框架与芯片支持情况 | | Pytorch |Paddle|TensorFlow2| | ---- | ---- | ---- | ---- | | Nvidia GPU | ✅ |N/A |N/A| | 昆仑芯 XPU | ✅ |N/A |N/A| +| 天数智芯 GPU | ✅ |N/A |N/A| From 4cd6d53d7adf7d9f8cef7d80cb9413dfa0261705 Mon Sep 17 00:00:00 2001 From: jinxiangshi <44688400+jinxiangshi@users.noreply.github.com> Date: Thu, 14 Sep 2023 11:41:15 +0800 Subject: [PATCH 03/18] kunlunxin berfLarge inference configs && results (#212) * kunlunxin inference : add bertLarge * Revert "kunlunxin inference : add bertLarge" This reverts commit cd9127c79de9c46f26c90edf09a6e5c65fe93054. * kunlunxin inference : add bertLarge * kunlunxin : remove re-install transformers * adjust env for bertlarge * kunlunxin: update bertLarge performance * Update BertLarge performance --------- Co-authored-by: zhaoyixuan02 Co-authored-by: Shi Jinxiang --- inference/benchmarks/bertLarge/README.md | 20 +++++++++- .../kunlunxin_configurations.yaml | 3 ++ inference/inference_engine/kunlunxin/xtcl.py | 40 +++++++++++++------ 3 files changed, 49 insertions(+), 14 deletions(-) create mode 100644 inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md index f84a474eb..349240525 100644 --- a/inference/benchmarks/bertLarge/README.md +++ b/inference/benchmarks/bertLarge/README.md @@ -40,6 +40,24 @@ bert_reference_results_text_md5.txt - TensorRT 8.5.1.7 +#### 2.2 昆仑芯R200 + +- ##### 硬件环境 + - 机器、加速卡型号: R200 + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.15.0-56-generic + - 加速卡驱动版本:4.0 + - Docker 版本:20.10.21 + - 依赖软件版本: + - pytorch: 1.13.0+cpu + - onnx: 1.14.0 + +- 推理工具包 + + - XTCL 2.1 + ### 4. 
运行情况(BERT-Large) * 指标列表 @@ -64,5 +82,5 @@ bert_reference_results_text_md5.txt | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 | | tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 | - +| kunlunxin_xtcl| W32A16 | 32 |3867.6 | None | None | 93.8 | 124.9 | None | 0.638/0.638| None| diff --git a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml new file mode 100644 index 000000000..c29b9c46b --- /dev/null +++ b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml @@ -0,0 +1,3 @@ +compiler: xtcl +no_validation: true +exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py index 396cc3ae9..2643f51d5 100755 --- a/inference/inference_engine/kunlunxin/xtcl.py +++ b/inference/inference_engine/kunlunxin/xtcl.py @@ -3,7 +3,8 @@ import tvm.relay as relay from tvm.contrib.download import download_testdata from tvm.relay import param_dict -from tvm.contrib import xpu_config +from tvm.contrib import graph_executor, xpu_config +from tvm.runtime.vm import VirtualMachine import torch import os import subprocess @@ -11,8 +12,10 @@ import numpy as np import time +USE_VM_COMPILE = False + class InferModel: - + def __init__(self, config , onnx_path, model): self.input_names = [] self.engine = self.build_engine(config, onnx_path) @@ -27,7 +30,7 @@ def build_engine(self, config, onnx_path): input_name = input.name #'inputs:0' self.input_names.append(input_name) shape_dict[input_name] = input_shape - + mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}' @@ -44,21 +47,32 @@ def build_engine(self, config, onnx_path): config_var_dtype_map=input_fp16, ).value() else: ## fp32 - os.environ['XTCL_USE_FP16'] = '0' - os.environ['XTCL_QUANTIZE_WEIGHT'] = '0' + os.environ['XTCL_USE_FP16'] = '1' + os.environ['XTCL_QUANTIZE_WEIGHT'] = '1' with tvm.transform.PassContext(opt_level=3, config=build_config): - vm_exec = relay.backend.vm.compile(mod, - target=target_host, - target_host=target_host, - params=params) - from tvm.runtime.vm import VirtualMachine - vm = VirtualMachine(vm_exec, ctx) - return vm + if USE_VM_COMPILE: + vm_exec = relay.backend.vm.compile(mod, + target=target_host, + target_host=target_host, + params=params) + + vm = VirtualMachine(vm_exec, ctx) + return vm + else: + graph, lib, params = relay.build(mod, + target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2", + params=params) + m = graph_executor.create(graph, lib, ctx) + m.set_input(**params) + return m def __call__(self, model_inputs: list): for index, input_name in enumerate(self.input_names): - self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index])) + if USE_VM_COMPILE: + self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index])) + else: + self.engine.set_input(input_name, tvm.nd.array(model_inputs[index])) self.engine.run() output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())] foo_time_start = time.time() From e8c406d6ebf2d86d85bc564a2d87c19f45fedb08 Mon Sep 17 00:00:00 2001 From: Zhou Yu Date: Thu, 14 Sep 2023 
11:41:59 +0800 Subject: [PATCH 04/18] update cpm 1x1 running stats (#238) Co-authored-by: zhouyu --- training/nvidia/cpm-pytorch/README.md | 2 +- training/nvidia/cpm-pytorch/config/config_A100x1x1.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/training/nvidia/cpm-pytorch/README.md b/training/nvidia/cpm-pytorch/README.md index 7f26bfccc..e16f10928 100644 --- a/training/nvidia/cpm-pytorch/README.md +++ b/training/nvidia/cpm-pytorch/README.md @@ -48,5 +48,5 @@ | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | | A100单机8卡(1x8) | fp16 | / | 1641 | 587 | 835 | 1059 | 0.92 | 12.9/40.0 | | A100单机8卡(1x8) | fp16 | bs=128,lr=0.002 | 5469 | 771 | 1090 | 1292 | 0.918 | 23.1/40.0 | -| A100单机单卡(1x1) | fp16 | bs=192,lr=0.0005 | | 78.4 | 111.9 | 127.2 | | 34.8/40.0 | +| A100单机单卡(1x1) | fp16 | bs=192,lr=0.0005 | | 98.8 | 143.8 | 168.8 | | 39.5/40.0 | | A100两机8卡(2x8) | fp16 | bs=192,lr=0.0005 | | 1583 | 2221 | 2583.8 | | 29.9/40.0 | \ No newline at end of file diff --git a/training/nvidia/cpm-pytorch/config/config_A100x1x1.py b/training/nvidia/cpm-pytorch/config/config_A100x1x1.py index ea439af5e..85374eb0e 100644 --- a/training/nvidia/cpm-pytorch/config/config_A100x1x1.py +++ b/training/nvidia/cpm-pytorch/config/config_A100x1x1.py @@ -3,13 +3,13 @@ fp16 = True dist_backend = "nccl" -target_embedding_average = 0.8 +target_embedding_average = 0.92 gradient_accumulation_steps = 1 train_batch_size = 32 eval_batch_size = train_batch_size -max_steps = 60000 +max_steps = 3000 max_samples_termination = 439126000 warmup = 0.2 From c9d87bb087a913307a63926ed39a73fd0bd62ec9 Mon Sep 17 00:00:00 2001 From: Zhou Yu Date: Thu, 14 Sep 2023 11:44:43 +0800 Subject: [PATCH 05/18] update data_dir for test_conf (#247) Co-authored-by: zhouyu --- training/run_benchmarks/config/test_conf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index fef579ed2..771b5cbbf 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -53,9 +53,9 @@ "model:framework:hardwareID:nnodes:nproc:repeat": "dataset path"} ''' CASES = { - "bert:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/bert/train/", - "glm:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/glm/train/", - "cpm:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/cpm/train/", + "bert:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/bert/train/", + "glm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/glm/train/", + "cpm:pytorch_1.8:A100:1:8:1": "/raid/home_datasets_ckpt/cpm/train/", # "mobilenetv2:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", # "vit:pytorch_1.13:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", @@ -72,6 +72,6 @@ # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", - # "transformer:pytorch_1.13:A100:1:8:1": "/home/datasets_ckpt/transformer/train/", + # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", } From e9ee7c7e3f6dceda757c088336aa775e3d961861 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Fri, 15 Sep 2023 15:32:27 +0800 Subject: [PATCH 06/18] Add DistilBERT model (#249) * Add DistilBert with training logic under developing * DistilBert for 1x1 GPU training * DistilBert for 1x8 GPU training * Add README and externel configs * Remove 
non-necessary files * Restore environment_varaibles.sh from kunlunxin-cpm * Update training configurations in _base.py update max_epoch and target_acc * Update README.md * Add nvidia pytorch1.12 docker * Update README.md * Add 1x1 2x8 cases * Add p_core unit name * Add p_core unit name * Update README.md --------- Co-authored-by: wangyakai --- training/benchmarks/distilbert/README.md | 49 +++++++ .../distilbert/pytorch/config/__init__.py | 2 + .../distilbert/pytorch/config/_base.py | 54 +++++++ .../pytorch/config/mutable_params.py | 6 + .../create_train_eval_data.py | 40 ++++++ .../pytorch/dataloaders/__init__.py | 1 + .../pytorch/dataloaders/dataloader.py | 134 ++++++++++++++++++ .../distilbert/pytorch/model/__init__.py | 22 +++ .../distilbert/pytorch/optimizers/__init__.py | 27 ++++ .../distilbert/pytorch/run_pretraining.py | 128 +++++++++++++++++ .../distilbert/pytorch/schedulers/__init__.py | 11 ++ .../distilbert/pytorch/train/__init__.py | 0 .../distilbert/pytorch/train/evaluator.py | 44 ++++++ .../distilbert/pytorch/train/trainer.py | 125 ++++++++++++++++ .../pytorch/train/trainer_adapter.py | 34 +++++ .../pytorch/train/training_state.py | 78 ++++++++++ training/nvidia/distilbert-pytorch/README.md | 45 ++++++ .../config/config_A100x1x1.py | 2 + .../config/config_A100x1x8.py | 1 + .../config/config_A100x2x8.py | 1 + .../config/requirements.txt | 2 + .../nvidia/distilbert-pytorch/extern/.gitkeep | 0 .../docker_image/pytorch_1.12/Dockerfile | 4 + .../pytorch_1.12/pytorch1.12_install.sh | 1 + training/run_benchmarks/config/test_conf.py | 1 + 25 files changed, 812 insertions(+) create mode 100644 training/benchmarks/distilbert/README.md create mode 100644 training/benchmarks/distilbert/pytorch/config/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/config/_base.py create mode 100644 training/benchmarks/distilbert/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py create mode 100644 training/benchmarks/distilbert/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/distilbert/pytorch/model/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/optimizers/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/run_pretraining.py create mode 100644 training/benchmarks/distilbert/pytorch/schedulers/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/train/__init__.py create mode 100644 training/benchmarks/distilbert/pytorch/train/evaluator.py create mode 100644 training/benchmarks/distilbert/pytorch/train/trainer.py create mode 100644 training/benchmarks/distilbert/pytorch/train/trainer_adapter.py create mode 100644 training/benchmarks/distilbert/pytorch/train/training_state.py create mode 100644 training/nvidia/distilbert-pytorch/README.md create mode 100644 training/nvidia/distilbert-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/distilbert-pytorch/config/config_A100x1x8.py create mode 100644 training/nvidia/distilbert-pytorch/config/config_A100x2x8.py create mode 100644 training/nvidia/distilbert-pytorch/config/requirements.txt create mode 100644 training/nvidia/distilbert-pytorch/extern/.gitkeep create mode 100644 training/nvidia/docker_image/pytorch_1.12/Dockerfile create mode 100644 training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh diff --git a/training/benchmarks/distilbert/README.md 
b/training/benchmarks/distilbert/README.md new file mode 100644 index 000000000..fa2aeb4ef --- /dev/null +++ b/training/benchmarks/distilbert/README.md @@ -0,0 +1,49 @@ +## Model Introduction +### DistilBERT base model (uncased) + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-uncased). It was +introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found +[here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation). This model is uncased: it does +not make a difference between english and English. + +## Model and Training Scripts source code +Pytorch case: +This repository includes software from https://github.com/huggingface/transformers/tree/v4.33.0 +licensed under the Apache License 2.0. + +Some of the files in this directory were modified by BAAI in 2023 to support FlagPerf. + +## Dataset and Model Checkpoints + +> Dataset website:https://huggingface.co/datasets/sst2 +https://huggingface.co/distilbert-base-uncased +> Model checkpoint website: https://huggingface.co/distilbert-base-uncased + +We have already preprocessed the dataset and the model checkpoint files(The preprocessing script is `training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py`). +The preprocessed can be downloaded directly from https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/distilbert_train.tar. +No additional preprocessing steps need to be conducted. + +After decompressing, the dataset and model checkpoint files are organized as the following: + +``` +distilbert +├── dataset # dataset files +│ ├── eval_dataset.npz +│ └── train_dataset.npz +└── model # model checkpoint and config files + ├── config.json + ├── pytorch_model.bin + ├── special_tokens_map.json + ├── tokenizer_config.json + └── vocab.txt +``` + +## Benchmark Task and Target Accuracy +This experiment is to finetune a text classification task on SST-2 dataset with DistilBERT-base-uncased pretrained checkpoints. +After finetuning 10 epoches, the DistilBERT-base-uncased model is able to achieve accuracy score of 90+, which matches the evaluation result on the [report](https://huggingface.co/distilbert-base-uncased). 
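For orientation, here is a minimal, self-contained sketch of the same fine-tuning objective (SST-2 sequence classification with `distilbert-base-uncased`). It is illustrative only: it is not the FlagPerf entry point, and the batch size, device handling, and epoch count are simplified assumptions drawn from the configs elsewhere in this patch.

```python
import torch
from datasets import load_dataset
from transformers import (DistilBertForSequenceClassification,
                          DistilBertTokenizer, get_scheduler)

# Tokenize SST-2 the same way the preprocessing script in this patch does.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
raw = load_dataset("sst2")
encoded = raw.map(lambda ex: tokenizer(ex["sentence"], padding="max_length",
                                       truncation=True), batched=True)
encoded = encoded.remove_columns(["idx", "sentence"]).rename_column("label", "labels")
encoded.set_format("torch")

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

loader = torch.utils.data.DataLoader(encoded["train"], batch_size=32, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 10  # assumed; matches max_epoch in config/_base.py below
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0,
                          num_training_steps=num_epochs * len(loader))

for _ in range(num_epochs):
    model.train()
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # HF sequence-classification models return the cross-entropy loss
        # when "labels" are passed in the batch.
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```

Note that the benchmark itself does not tokenize on the fly: it loads the preprocessed `train_dataset.npz`/`eval_dataset.npz` files and drives training through the FlagPerf `Trainer`/`trainer_adapter` defined later in this patch.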
+ +## AI Frameworks && Accelerators supports + +| | Pytorch | Paddle | TensorFlow2 | +| ---------- | ------- | ------ | ----------- | +| Nvidia GPU | [✅](../../nvidia/distilbert-pytorch/README.md) | N/A | N/A | diff --git a/training/benchmarks/distilbert/pytorch/config/__init__.py b/training/benchmarks/distilbert/pytorch/config/__init__.py new file mode 100644 index 000000000..d877a8e37 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/config/_base.py b/training/benchmarks/distilbert/pytorch/config/_base.py new file mode 100644 index 000000000..799d342db --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/config/_base.py @@ -0,0 +1,54 @@ +# DO NOT MODIFY THESE REQUIRED PARAMETERS + +# Required parameters +vendor: str = None +data_dir: str = None +name: str = "distilbert" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +# Optional parameters + +# ========================================================= +# loss scale +# ========================================================= +lr: float = 5e-5 +weight_decay = 0.0 + +# ========================================================= +# train && evaluate +# ========================================================= +train_batch_size: int = 4 +eval_batch_size: int = 4 + +max_epoch: int = 10 +target_acc: float = 0.91 + +do_train = True +distributed: bool = True + + +# ========================================================= +# utils +# ========================================================= +seed: int = 0 +dist_backend: str = 'nccl' +device: str = None + +# ========================================================= +# datasets +# ========================================================= +dataloader_drop_last: bool = False +dataloader_num_workers: int = 8 + +# ========================================================= +# for driver +# ========================================================= +local_rank: int = -1 +use_env: bool = True +log_freq: int = 1000 +print_freq: int = 1000 +n_device: int = 1 +sync_bn: bool = False +gradient_accumulation_steps: int = 1 + diff --git a/training/benchmarks/distilbert/pytorch/config/mutable_params.py b/training/benchmarks/distilbert/pytorch/config/mutable_params.py new file mode 100644 index 000000000..6a1879263 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/config/mutable_params.py @@ -0,0 +1,6 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'weight_decay', 'train_batch_size', + 'gradient_accumulation_steps', 'eval_batch_size', 'do_train', + 'distributed', 'dist_backend', 'device', 'cudnn_benchmark', + 'cudnn_deterministic' +] diff --git a/training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py b/training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py new file mode 100644 index 000000000..db5f0b327 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/data_preprocessing/create_train_eval_data.py @@ -0,0 +1,40 @@ +import os + +import numpy as np +from datasets import load_dataset +from transformers import DistilBertTokenizer + + +def save_dataset(ds, save_path): + np.savez(save_path, + idx=ds['idx'], + sentence=ds['sentence'], + label=ds['label'], + input_ids=ds['input_ids'], + attention_mask=ds['attention_mask'],) + + +def main(): + data_prefix = 'distilbert/dataset' + os.makedirs(data_prefix, exist_ok=True) + train_datapath = 
os.path.join(data_prefix, 'train_dataset.npz') + eval_datapath = os.path.join(data_prefix, 'eval_dataset.npz') + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + + raw_datasets = load_dataset("sst2") + + def tokenize_function(examples): + return tokenizer(examples["sentence"], padding="max_length", truncation=True) + + tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) + + train_dataset = tokenized_datasets["train"].with_format('numpy') + save_dataset(train_dataset, train_datapath) + + eval_dataset = tokenized_datasets["validation"].with_format('numpy') + save_dataset(eval_dataset, eval_datapath) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/dataloaders/__init__.py b/training/benchmarks/distilbert/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..83fa73435 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/dataloaders/__init__.py @@ -0,0 +1 @@ +from .dataloader import build_train_dataloader, build_eval_dataloader diff --git a/training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py b/training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py new file mode 100644 index 000000000..843176f80 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/dataloaders/dataloader.py @@ -0,0 +1,134 @@ +import os +import numpy as np +import torch +from torch.utils.data import Dataset +from torch.utils.data import DataLoader +from torch.utils.data.dataloader import default_collate +from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union +from collections.abc import Mapping +InputDataClass = NewType("InputDataClass", Any) + +class DistilBertDataset(Dataset): + def __init__(self, filepath): + origin_data = np.load(filepath) + self.idx = origin_data['idx'] + self.sentence = origin_data['sentence'] + self.label = origin_data['label'] + self.input_ids = origin_data['input_ids'] + self.attention_mask = origin_data['attention_mask'] + + def __len__(self): + return len(self.idx) + + def __getitem__(self, idx): + sample = { + 'sentence': self.sentence[idx], + 'label': self.label[idx], + 'input_ids': self.input_ids[idx], + 'attention_mask': self.attention_mask[idx], + } + return sample + + +def default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: + """ + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/data/data_collator.py#L105 + """ + if not isinstance(features[0], Mapping): + features = [vars(f) for f in features] + first = features[0] + batch = {} + + # Special handling for labels. + # Ensure that tensor is created with the correct type + # (it should be automatically the case, but let's make sure of it.) + if "label" in first and first["label"] is not None: + batch["labels"] = torch.tensor([f["label"] for f in features], dtype=torch.long) + elif "label_ids" in first and first["label_ids"] is not None: + if isinstance(first["label_ids"], torch.Tensor): + batch["labels"] = torch.stack([f["label_ids"] for f in features]) + else: + dtype = torch.long if type(first["label_ids"][0]) is int else torch.float + batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype) + + # Handling of all other possible keys. + # Again, we will use the first element to figure out which key/values are not None for this model. 
+ for k, v in first.items(): + if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): + if isinstance(v, torch.Tensor): + batch[k] = torch.stack([f[k] for f in features]) + elif isinstance(v, np.ndarray): + batch[k] = torch.tensor(np.stack([f[k] for f in features])) + else: + batch[k] = torch.tensor([f[k] for f in features]) + + return batch + + +def build_train_sampler(config, dataset): + if torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, seed = config.seed) + else: + generator = torch.Generator() + generator.manual_seed(config.seed) + sampler = torch.utils.data.RandomSampler(dataset, generator=generator) + return sampler + + +def build_train_dataloader(config): + train_dataset = DistilBertDataset( + os.path.join(config.data_dir, 'dataset', 'train_dataset.npz')) + + train_sampler = build_train_sampler(config, train_dataset) + data_loader = DataLoader( + train_dataset, + sampler=train_sampler, + batch_size=config.train_batch_size, + collate_fn=default_data_collator, + drop_last=config.dataloader_drop_last, + num_workers=config.dataloader_num_workers, + ) + return data_loader + + +def build_eval_sampler(dataset): + if torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank) + else: + sampler = None + return sampler + + +def build_eval_dataloader(config): + eval_dataset = DistilBertDataset( + os.path.join(config.data_dir, 'dataset', 'eval_dataset.npz')) + + eval_sampler = build_eval_sampler(eval_dataset) + data_loader = DataLoader( + eval_dataset, + sampler=eval_sampler, + batch_size=config.eval_batch_size, + collate_fn=default_data_collator, + drop_last=config.dataloader_drop_last, + num_workers=config.dataloader_num_workers, + ) + + return data_loader + + +if __name__ == '__main__': + from collections import namedtuple + Config = namedtuple( + 'Config', + ['data_dir', 'distributed', 'train_batch_size', 'eval_batch_size', 'dataloader_drop_last', 'dataloader_num_workers', 'seed']) + config = Config('distilbert', False, 4, 4, False, 8, 1234) + train_dataloader = build_train_dataloader(config) + for i, batch in enumerate(train_dataloader): + print(batch.keys()) + break \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/model/__init__.py b/training/benchmarks/distilbert/pytorch/model/__init__.py new file mode 100644 index 000000000..0b2938d19 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/model/__init__.py @@ -0,0 +1,22 @@ +import os +from collections import namedtuple + +from transformers import DistilBertConfig, DistilBertTokenizer +from transformers import DistilBertForSequenceClassification + + +def create_model(config): + model_path = os.path.join(config.data_dir, 'model') + hfconfig = DistilBertConfig.from_pretrained(model_path) + model = DistilBertForSequenceClassification.from_pretrained(model_path, + config=hfconfig) + tokenizer = DistilBertTokenizer.from_pretrained(model_path) + return model, hfconfig, tokenizer + + +if __name__ == '__main__': + + Config = namedtuple('Config', ['data_dir']) + config = Config('distilbert') + model, model_config, tokenizer = create_model(config) + import pdb; pdb.set_trace() \ No newline at end of file diff --git 
a/training/benchmarks/distilbert/pytorch/optimizers/__init__.py b/training/benchmarks/distilbert/pytorch/optimizers/__init__.py new file mode 100644 index 000000000..30d50d86f --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/optimizers/__init__.py @@ -0,0 +1,27 @@ +import torch + + +def create_optimizer(model, args): + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": + args.weight_decay, + }, + { + "params": [ + p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": + 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr) + return optimizer \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/run_pretraining.py b/training/benchmarks/distilbert/pytorch/run_pretraining.py new file mode 100644 index 000000000..ae32d9476 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/run_pretraining.py @@ -0,0 +1,128 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import os +import sys +import time +from typing import Any, Tuple + +# benchmarks目录 append到sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, + "../../"))) # benchmarks目录 +import config +from driver import Event, dist_pytorch +from driver.helper import InitHelper + +from train import trainer_adapter +from train.evaluator import Evaluator +from train.trainer import Trainer +from train.training_state import TrainingState +from dataloaders.dataloader import build_train_dataloader, build_eval_dataloader + +logger = None + + +def main() -> Tuple[Any, Any]: + global logger + global config + + # init + init_helper = InitHelper(config) + model_driver = init_helper.init_driver(globals(), locals()) + config = model_driver.config + dist_pytorch.init_dist_training_env(config) + dist_pytorch.barrier(config.vendor) + model_driver.event(Event.INIT_START) + + config.distributed = dist_pytorch.get_world_size() > 1 + # logger + logger = model_driver.logger + + train_dataloader = build_train_dataloader(config) + eval_dataloader = build_eval_dataloader(config) + + seed = config.seed + + init_helper.set_seed(seed, model_driver.config.vendor) + + # 创建TrainingState对象 + training_state = TrainingState() + + # 构建 trainer:依赖 evaluator、TrainingState对象 + evaluator = Evaluator(config, eval_dataloader) + trainer = Trainer(driver=model_driver, + adapter=trainer_adapter, + evaluator=evaluator, + training_state=training_state, + device=config.device, + config=config) + training_state._trainer = trainer + + # 设置分布式环境, trainer init() + dist_pytorch.barrier(config.vendor) + trainer.init(train_dataloader) + dist_pytorch.barrier(config.vendor) + + # evaluation统计 + init_evaluation_start = time.time() # evaluation起始时间,单位为秒 + + training_state.acc = evaluator.evaluate(trainer) + + init_evaluation_end = time.time() # evaluation结束时间,单位为秒 + + init_evaluation_info = dict(time=init_evaluation_end - + init_evaluation_start) + + model_driver.event(Event.INIT_EVALUATION, init_evaluation_info) + + if not config.do_train: + return config, training_state + + model_driver.event(Event.INIT_END) + + # TRAIN_START + dist_pytorch.barrier(config.vendor) + 
model_driver.event(Event.TRAIN_START) + train_start_time = time.time() + + # 训练过程 + epoch = 1 + while not training_state.end_training: + training_state.epoch = epoch + trainer.train_one_epoch(train_dataloader) + epoch += 1 + + # TRAIN_END事件 + training_state.train_time = time.time() - train_start_time + model_driver.event(Event.TRAIN_END) + + return config, training_state + + +if __name__ == "__main__": + start = time.time() + config_update, state = main() + if not dist_pytorch.is_main_process(): + sys.exit(0) + + # 训练信息写日志 + e2e_time = time.time() - start + if config_update.do_train: + + finished_info = { + "e2e_time": e2e_time, + "train_time": state.train_time, + "train_no_eval_time": state.no_eval_time, + "pure_training_computing_time": state.pure_compute_time, + "throughput(ips)_raw": state.num_trained_samples / state.train_time, + "throughput(ips)_no_eval": + state.num_trained_samples / state.no_eval_time, + "throughput(ips)_pure_compute": + state.num_trained_samples / state.pure_compute_time, + "converged": state.converged, + "acc": state.acc, + } + else: + finished_info = {"e2e_time": e2e_time} + logger.log(Event.FINISHED, message=finished_info, stacklevel=0) diff --git a/training/benchmarks/distilbert/pytorch/schedulers/__init__.py b/training/benchmarks/distilbert/pytorch/schedulers/__init__.py new file mode 100644 index 000000000..7ba78bbd8 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/schedulers/__init__.py @@ -0,0 +1,11 @@ +from transformers import get_scheduler + + +def create_scheduler(optimizer, train_dataloader, args): + lr_scheduler = get_scheduler( + name='linear', + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=len(train_dataloader) * args.max_epoch, + ) + return lr_scheduler \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/train/__init__.py b/training/benchmarks/distilbert/pytorch/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/distilbert/pytorch/train/evaluator.py b/training/benchmarks/distilbert/pytorch/train/evaluator.py new file mode 100644 index 000000000..27ae1c796 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/evaluator.py @@ -0,0 +1,44 @@ +import os + +import torch +from torch.types import Device + +from driver import dist_pytorch + +class Evaluator: + """Evaluator""" + def __init__(self, config, dataloader): + self.config = config + self.eval_dataloader = dataloader + self.device = config.device + + def process_batch(self, batch, device: Device): + """Process batch and produce inputs for the model.""" + for k, v in batch.items(): + batch[k] = v.to(device, non_blocking=True) + return batch + + def evaluate(self, trainer): + model = trainer.model + model.eval() + + total_output = 0.0 + num_examples = len(self.eval_dataloader.dataset) + with torch.no_grad(): + # For all the batches in the dataset. + for step, inputs in enumerate(self.eval_dataloader): + # Forward pass through the model. + inputs = self.process_batch(inputs, self.device) + output = model(**inputs) + # For accuracy, return the number of correctly predicted samples. + outputs = torch.argmax(output['logits'], -1) + correct = (outputs == inputs['labels']).float() + output = correct.sum() + + # Reduce across processes. 
+ if dist_pytorch.is_dist_avail_and_initialized(): + torch.distributed.all_reduce(output) + + total_output += output + acc = total_output / num_examples + return acc.item() \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/train/trainer.py b/training/benchmarks/distilbert/pytorch/train/trainer.py new file mode 100644 index 000000000..e3209b35d --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/trainer.py @@ -0,0 +1,125 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import time +import os +import sys +import math + +import torch +import torch.utils.data +from torch.types import Device + +from model import create_model +from optimizers import create_optimizer +from schedulers import create_scheduler +from train.evaluator import Evaluator +from train.training_state import TrainingState + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, Event, dist_pytorch + + +class Trainer: + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, config): + super(Trainer, self).__init__() + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.device = device + self.config = config + self.evaluator = evaluator + + def init(self, train_dataloader): + self.model, self.model_config, self.tokenizer = create_model( + self.config) + self.model.to(self.device) + + self.model = self.adapter.convert_model(self.model) + self.model = self.adapter.model_to_ddp(self.config, self.model) + + self.optimizer = create_optimizer(self.model, self.config) + self.lr_scheduler = create_scheduler(self.optimizer, train_dataloader, + self.config) + + + def process_batch(self, batch, device: Device): + """Process batch and produce inputs for the model.""" + for k, v in batch.items(): + batch[k] = v.to(device, non_blocking=True) + return batch + + + def train_one_epoch(self, dataloader): + state = self.training_state + driver = self.driver + driver.event(Event.EPOCH_BEGIN, state.epoch) + + no_eval_start = time.time() + for _, data in enumerate(dataloader): + data = self.process_batch(data, self.device) + + pure_compute_start = time.time() + state.global_steps += 1 + state.num_trained_samples = state.global_steps * dist_pytorch.global_batch_size( + self.config) + + driver.event(Event.STEP_BEGIN, step=state.global_steps) + self.train_one_step(data) + + train_end = time.time() + state.pure_compute_time += train_end - pure_compute_start + state.no_eval_time += train_end - no_eval_start + + other_state = dict() + if state.global_steps % self.config.gradient_accumulation_steps == 0: + sequences_per_second = state.num_trained_samples / state.no_eval_time + other_state["seq/s"] = sequences_per_second + + step_info = state.to_dict(**other_state) + driver.event(Event.STEP_END, + message=step_info, + step=state.global_steps, + loss=state.loss) + + no_eval_start = time.time() + + driver.event(Event.EPOCH_END, state.epoch) + eval_start = time.time() + state.acc = self.evaluator.evaluate(self) + eval_result = dict( + global_steps=state.global_steps, + acc=state.acc, + time=time.time() - eval_start) + driver.event(Event.EVALUATE, eval_result) + self.detect_training_status(state) + + + def train_one_step(self, data): + + state = self.training_state + self.model.train() + + outputs = self.model(**data) + #loss 为标量 + loss = 
outputs["loss"].item() + state.loss = loss + self.adapter.backward(self.config, state.global_steps, outputs["loss"], + self.optimizer) + self.lr_scheduler.step() + self.driver.event(Event.BACKWARD, state.global_steps, state.loss, + self.optimizer) + + + def detect_training_status(self, state: TrainingState): + if state.acc >= self.config.target_acc: + state.converged_success() + state.end_training = True + + if state.epoch >= self.config.max_epoch: + state.end_training = True + + return state.end_training + diff --git a/training/benchmarks/distilbert/pytorch/train/trainer_adapter.py b/training/benchmarks/distilbert/pytorch/train/trainer_adapter.py new file mode 100644 index 000000000..1dc597460 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/trainer_adapter.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import torch +from torch import nn +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel + +import config + +def convert_model(model: nn.Module) -> nn.Module: + """convert_model""" + return model + + +def model_to_ddp(config, model: nn.Module) -> nn.Module: + use_ddp = dist.is_initialized() + if use_ddp: + model = DistributedDataParallel( + model, + device_ids=[config.local_rank]) + + return model + + +def backward(config, step: int, loss: torch.Tensor, optimizer, **kwarg): + if config.gradient_accumulation_steps > 1: + loss = loss / config.gradient_accumulation_steps + + loss.backward() + + if step % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() \ No newline at end of file diff --git a/training/benchmarks/distilbert/pytorch/train/training_state.py b/training/benchmarks/distilbert/pytorch/train/training_state.py new file mode 100644 index 000000000..e97ead272 --- /dev/null +++ b/training/benchmarks/distilbert/pytorch/train/training_state.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") +from dataclasses import dataclass + +import torch +import inspect + +@dataclass +class TrainingState: + """TrainingState dataclass""" + _trainer = None + _status = 'aborted' # later set to 'success' if termination criteria met + + global_steps = 0 + + loss: float = 0.0 + acc: float = 0.0 + + epoch: int = 1 + + end_training: bool = False + converged: bool = False + + train_time = 0.0 + no_eval_time = 0.0 + pure_compute_time = 0.0 + + num_trained_samples = 0 + + def status(self): + """get status""" + if self.converged: + self._status = "success" + return self._status + + def converged_success(self): + """converged success""" + self.end_training = True + self.converged = True + + def _is_property(self, value): + status = [ + not callable(value), not inspect.isclass(value), + not inspect.ismodule(value), not inspect.ismethod(value), + not inspect.isfunction(value), not inspect.isbuiltin(value), + "classmethod object" not in str(value) + ] + return all(status) + + + def to_dict(self, **kwargs): + state_dict = dict() + + for var_name, value in self.__dict__.items(): + if not var_name.startswith("_") and self._is_property(value): + state_dict[var_name] = value + + lr = self._trainer.lr_scheduler.get_lr() + if isinstance(lr, (tuple, list)): + lr = lr[0] + state_dict["learning_rate"] = lr + + exclude = [ + "acc", "skipped_steps", + "converged", "init_time", "raw_train_time" + ] + for exkey in exclude: + if exkey in state_dict: + state_dict.pop(exkey) + + state_dict.update(kwargs) + + for k in state_dict.keys(): + if torch.is_tensor(state_dict[k]): + state_dict[k] = state_dict[k].item() + + return state_dict diff --git a/training/nvidia/distilbert-pytorch/README.md b/training/nvidia/distilbert-pytorch/README.md new file mode 100644 index 000000000..0ec6b5e89 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/README.md @@ -0,0 +1,45 @@ +### 1. 下载数据集和模型 +[下载链接](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/distilbert_train.tar) + +### 2. 设置test_conf.py + +为了使得`training/nvidia/distilbert-pytorch/config/requirements.txt`里的依赖库均能被下载,需要将`training/run_benchmarks/config/test_conf.py`里的`PIP_SOURCE`的值修改为`https://pypi.tuna.tsinghua.edu.cn/simple` + +### 3. 
Nvidia GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-1.12.0a0+bd13bc6 + - 依赖软件版本: + - cuda: 11.6 + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------- | +| 任务类别 | Summarization | | +| 模型 | distilbert | | +| 数据集 | SST-2 | | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际样本数数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=512) | +| 训练结果 | acc,见“性能指标” | 分类准确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| A100单机8卡(1x8) | fp32 | / | 361 | 1764.0 | 1861.9 | 1942.6 | 0.915 | 13.9 /40.0 | diff --git a/training/nvidia/distilbert-pytorch/config/config_A100x1x1.py b/training/nvidia/distilbert-pytorch/config/config_A100x1x1.py new file mode 100644 index 000000000..9d1621307 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/config_A100x1x1.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +gradient_accumulation_steps = 8 diff --git a/training/nvidia/distilbert-pytorch/config/config_A100x1x8.py b/training/nvidia/distilbert-pytorch/config/config_A100x1x8.py new file mode 100644 index 000000000..f85ba4108 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/config_A100x1x8.py @@ -0,0 +1 @@ +train_batch_size = 32 diff --git a/training/nvidia/distilbert-pytorch/config/config_A100x2x8.py b/training/nvidia/distilbert-pytorch/config/config_A100x2x8.py new file mode 100644 index 000000000..ed3891623 --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/config_A100x2x8.py @@ -0,0 +1 @@ +train_batch_size = 16 diff --git a/training/nvidia/distilbert-pytorch/config/requirements.txt b/training/nvidia/distilbert-pytorch/config/requirements.txt new file mode 100644 index 000000000..a772ae8fa --- /dev/null +++ b/training/nvidia/distilbert-pytorch/config/requirements.txt @@ -0,0 +1,2 @@ +datasets==2.14.4 +transformers==4.33.0 \ No newline at end of file diff --git a/training/nvidia/distilbert-pytorch/extern/.gitkeep b/training/nvidia/distilbert-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/nvidia/docker_image/pytorch_1.12/Dockerfile b/training/nvidia/docker_image/pytorch_1.12/Dockerfile new file mode 100644 index 000000000..e57eaa828 --- /dev/null +++ b/training/nvidia/docker_image/pytorch_1.12/Dockerfile @@ -0,0 +1,4 @@ +FROM nvcr.io/nvidia/pytorch:22.04-py3 +RUN /bin/bash -c "pip config set global.index-url https://mirror.baidu.com/pypi/simple" +RUN /bin/bash -c "uname -a" +RUN /bin/bash -c alias python3=python diff --git a/training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh b/training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh new file mode 100644 index 000000000..cc1f786e8 --- /dev/null +++ b/training/nvidia/docker_image/pytorch_1.12/pytorch1.12_install.sh @@ -0,0 +1 @@ +#!/bin/bash \ No newline at end of file diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 771b5cbbf..81bf848d5 100644 
--- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -71,6 +71,7 @@ # "wav2vec2:pytorch_1.13:A100:1:8:1": "/raid/dataset/wav2vec2_data/LibriSpeech", # "WaveGlow:pytorch_1.13:A100:1:8:1": "/raid/dataset/LJSpeech/", + # "distilbert:pytorch_1.12:A100:1:8:1": "/raid/dataset/distilbert/", # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", From c845525c015c6f0ea6122ae8f5d4215c0481cad6 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Fri, 15 Sep 2023 17:50:08 +0800 Subject: [PATCH 07/18] GPT2 (#205) * Add gpt2 model * Add gpt2 test case in test_conf.py * refine README and python files * Remove redundant codes and re-organize denpendency * remove redundancy files * refine gpt_dataset * "Refine traing job" * Refine README * fix typo in README.md * Update README.md * Add config for 1x1 2x8 * Update README.md 1x1 config * Update README.md --- training/benchmarks/gpt2/README.md | 49 + .../gpt2/pytorch/config/__init__.py | 2 + .../benchmarks/gpt2/pytorch/config/_base.py | 122 ++ .../gpt2/pytorch/config/mutable_params.py | 6 + .../gpt2/pytorch/dataloaders/__init__.py | 3 + .../gpt2/pytorch/dataloaders/dataloader.py | 33 + .../pytorch/dataloaders/gpt2_tokenization.py | 263 +++++ .../gpt2/pytorch/dataloaders/gpt_dataset.py | 302 +++++ .../pytorch/dataloaders/indexed_dataset.py | 344 ++++++ .../gpt2/pytorch/dataloaders/tokenizer.py | 137 +++ .../benchmarks/gpt2/pytorch/model/__init__.py | 13 + .../gpt2/pytorch/model/layers/__init__.py | 0 .../pytorch/model/layers/fused_bias_gelu.py | 43 + .../pytorch/model/layers/fused_softmax.py | 212 ++++ .../gpt2/pytorch/model/layers/layers.py | 288 +++++ .../gpt2/pytorch/model/layers/transformer.py | 985 ++++++++++++++++ .../gpt2/pytorch/model/layers/utils.py | 37 + .../gpt2/pytorch/model/losses/__init__.py | 0 .../pytorch/model/losses/cross_entropy.py | 32 + .../gpt2/pytorch/model/models/__init__.py | 20 + .../gpt2/pytorch/model/models/enums.py | 25 + .../gpt2/pytorch/model/models/gpt_model.py | 123 ++ .../pytorch/model/models/language_model.py | 502 ++++++++ .../gpt2/pytorch/model/models/module.py | 125 ++ .../gpt2/pytorch/model/models/utils.py | 54 + .../benchmarks/gpt2/pytorch/mpu/__init__.py | 11 + .../gpt2/pytorch/optimizer/__init__.py | 138 +++ .../gpt2/pytorch/optimizer/clip_grads.py | 86 ++ .../pytorch/optimizer/distrib_optimizer.py | 1011 +++++++++++++++++ .../gpt2/pytorch/optimizer/grad_scaler.py | 119 ++ .../gpt2/pytorch/optimizer/optimizer.py | 645 +++++++++++ .../gpt2/pytorch/run_pretraining.py | 144 +++ .../gpt2/pytorch/schedulers/__init__.py | 1 + .../gpt2/pytorch/schedulers/factory.py | 34 + .../schedulers/optimizer_param_scheduler.py | 214 ++++ .../benchmarks/gpt2/pytorch/train/__init__.py | 0 .../gpt2/pytorch/train/evaluator.py | 46 + .../benchmarks/gpt2/pytorch/train/trainer.py | 190 ++++ .../gpt2/pytorch/train/trainer_adapter.py | 57 + .../gpt2/pytorch/train/training_state.py | 79 ++ .../benchmarks/gpt2/pytorch/train/utils.py | 101 ++ training/nvidia/gpt2-pytorch/README.md | 42 + .../gpt2-pytorch/config/config_A100x1x1.py | 4 + .../gpt2-pytorch/config/config_A100x1x8.py | 1 + .../gpt2-pytorch/config/config_A100x2x8.py | 1 + .../gpt2-pytorch/config/config_common.py | 6 + training/nvidia/gpt2-pytorch/extern/.gitkeep | 0 training/run_benchmarks/config/test_conf.py | 1 + 48 files changed, 6651 insertions(+) create mode 100644 
training/benchmarks/gpt2/README.md create mode 100644 training/benchmarks/gpt2/pytorch/config/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/config/_base.py create mode 100644 training/benchmarks/gpt2/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py create mode 100644 training/benchmarks/gpt2/pytorch/dataloaders/tokenizer.py create mode 100644 training/benchmarks/gpt2/pytorch/model/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/layers.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/transformer.py create mode 100644 training/benchmarks/gpt2/pytorch/model/layers/utils.py create mode 100755 training/benchmarks/gpt2/pytorch/model/losses/__init__.py create mode 100755 training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/enums.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/gpt_model.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/language_model.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/module.py create mode 100644 training/benchmarks/gpt2/pytorch/model/models/utils.py create mode 100644 training/benchmarks/gpt2/pytorch/mpu/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py create mode 100644 training/benchmarks/gpt2/pytorch/optimizer/optimizer.py create mode 100644 training/benchmarks/gpt2/pytorch/run_pretraining.py create mode 100755 training/benchmarks/gpt2/pytorch/schedulers/__init__.py create mode 100755 training/benchmarks/gpt2/pytorch/schedulers/factory.py create mode 100644 training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py create mode 100644 training/benchmarks/gpt2/pytorch/train/__init__.py create mode 100644 training/benchmarks/gpt2/pytorch/train/evaluator.py create mode 100644 training/benchmarks/gpt2/pytorch/train/trainer.py create mode 100644 training/benchmarks/gpt2/pytorch/train/trainer_adapter.py create mode 100644 training/benchmarks/gpt2/pytorch/train/training_state.py create mode 100644 training/benchmarks/gpt2/pytorch/train/utils.py create mode 100644 training/nvidia/gpt2-pytorch/README.md create mode 100755 training/nvidia/gpt2-pytorch/config/config_A100x1x1.py create mode 100755 training/nvidia/gpt2-pytorch/config/config_A100x1x8.py create mode 100755 training/nvidia/gpt2-pytorch/config/config_A100x2x8.py create mode 100755 training/nvidia/gpt2-pytorch/config/config_common.py create mode 100644 training/nvidia/gpt2-pytorch/extern/.gitkeep diff --git a/training/benchmarks/gpt2/README.md 
b/training/benchmarks/gpt2/README.md new file mode 100644 index 000000000..f720d14a8 --- /dev/null +++ b/training/benchmarks/gpt2/README.md @@ -0,0 +1,49 @@ +### 模型信息 +- 模型介绍 + +GPT-2 Medium is the 345M parameter version of Megatron-GPT2, a transformer-based language model created and released by OpenAI. The model is a pretrained model on English language using a causal language modeling (CLM) objective. + +>[Language Models are Unsupervised Multitask Learners](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) + +- 模型代码来源 + +This case includes code from open source project at https://github.com/NVIDIA/Megatron-LM/tree/v3.0/megatron + +Some of the files in this directory were modified by BAAI in 2023 to support FlagPerf. + + +### 数据集 +- 数据集下载地址 +> Dataset website:https://huggingface.co/datasets/lambada + +> The training data should be downloaded from huggingface. First, download training data in a loose json format, with one json containing a text sample per line. For example in python interpreter: + +``` +from datasets import load_dataset + +train_data = load_dataset('lambada', split='train') +train_data.to_json("lambada.train.json", lines=True) +``` + +- 预处理 +> The training data requires preprocessing. +The loose json is then processed into a binary format for training. To convert the json into mmap format use preprocess_data.py. An example script to prepare data for GPT2 training is: + +``` bash +python tools/preprocess_data.py \ + --input lambada.train.json \ + --output-prefix lambada \ + --vocab gpt2-vocab.json \ + --dataset-impl mmap \ + --tokenizer-type GPT2BPETokenizer \ + --merge-file gpt2-merges.txt \ + --append-eod \ + --workers 32 \ + --chunk-size 25 \ +``` + + +### 框架与芯片支持情况 +| | Pytorch |Paddle|TensorFlow2| +| ---- | ---- | ---- | ---- | +| Nvidia GPU | ✅ |N/A |N/A| diff --git a/training/benchmarks/gpt2/pytorch/config/__init__.py b/training/benchmarks/gpt2/pytorch/config/__init__.py new file mode 100644 index 000000000..96e0aae70 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params diff --git a/training/benchmarks/gpt2/pytorch/config/_base.py b/training/benchmarks/gpt2/pytorch/config/_base.py new file mode 100644 index 000000000..ae9a91b57 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/config/_base.py @@ -0,0 +1,122 @@ +# Required parameters + +vendor: str = None +data_dir: str = None +name: str = "GPT2" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +use_env: bool = True +log_freq: int = 1 +device: str = None + +# ========================================================= +# train config +# ========================================================= + +seed: int = 1234 +gradient_accumulation_steps: int = 1 + +max_steps: int = 23070 +train_batch_size: int = 4 + +eval_iter_start_samples: int = 3200 +eval_interval_samples: int = 3200 + +target_acc: float = 0.60 + +# ========================================================= +# data +# ========================================================= + +train_data_prefix: str = "lambada_train_text_document" +test_data_prefix: str = "lambada_test.json" +vocab_file: str = "gpt2-vocab.json" +merge_file: str = "gpt2-merges.txt" + +# ========================================================= +# loss scale +# ========================================================= +clip_grad: float = 1.0 + +# ========================================================= +# 
optimizer & lr scheduler & weight decay +# ========================================================= +optimizer: str = "adam" +adam_beta1: float = 0.9 +adam_beta2: float = 0.999 +adam_eps: float = 1e-8 + +lr: float = 0.00015 +min_lr: float = 1e-05 +lr_warmup_fraction: float = 0.01 +lr_warmup_iters: int = 0 +lr_warmup_samples: int = 0 +lr_decay_style: str = "cosine" +lr_decay_samples: int=None + +weight_decay: float = 0.01 +start_weight_decay: float = 0.01 +end_weight_decay: float = 0.01 +weight_decay_incr_style: str = "constant" + +use_distributed_optimizer: bool = False +barrier_with_L1_time: bool = True + +# ========================================================= +# transformer +# ========================================================= + +num_layers: int = 24 +encoder_num_layers: str = 24 + +num_attention_heads: int = 16 +hidden_size: int = 1024 +ffn_hidden_size: int = 4096 +kv_channels: int = 64 +seq_length: int = 1024 +attention_dropout: float = 0.1 +hidden_dropout: float = 0.1 +transformer_impl: str = "local" +use_flash_attn: bool = False + +layernorm_epsilon: float = 1e-05 + +fp16: bool = False +bf16: bool = False + +init_method_std: float = 0.02 +import torch +params_dtype = torch.float32 +masked_softmax_fusion: bool = True +bias_gelu_fusion: bool = True +bias_dropout_fusion: bool = True +apply_residual_connection_post_layernorm: bool = False +apply_query_key_layer_scaling: bool = True +fp16_lm_cross_entropy: bool = False +fp32_residual_connection: bool = False +attention_softmax_in_fp32: bool = False + +# ========================================================= +# dataset +# ========================================================= + +tokenizer_type: str = "GPT2BPETokenizer" +num_workers: int = 2 +mmap_warmup: bool = False +padded_vocab_size: int = 0 +make_vocab_size_divisible_by: int = 128 +max_position_embeddings: int = 1024 + +reset_position_ids: bool = False +reset_attention_mask: bool = False +eod_mask_loss: bool = False + +# ========================================================= +# distributed parallel +# ========================================================= + +dist_backend: str = None +DDP_impl: str = "native" +gradient_accumulation_fusion: bool = False +use_contiguous_buffers_in_local_ddp: bool = False diff --git a/training/benchmarks/gpt2/pytorch/config/mutable_params.py b/training/benchmarks/gpt2/pytorch/config/mutable_params.py new file mode 100644 index 000000000..ecab0dd70 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/config/mutable_params.py @@ -0,0 +1,6 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'weight_decay', + "gradient_accumulation_steps", "max_steps", + "train_batch_size", "eval_iter_start_samples", "eval_interval_samples", + 'dist_backend', 'device', +] \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/__init__.py b/training/benchmarks/gpt2/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..3731c0321 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from .tokenizer import get_tokenizer diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py b/training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py new file mode 100644 index 000000000..c4ddef76f --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/dataloader.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
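# Illustrative sketch, not part of the patch: a back-of-the-envelope parameter
# count for the GPT-2 medium configuration defined in config/_base.py above
# (hidden_size=1024, num_layers=24, ffn_hidden_size=4096, seq_length=1024).
# The padded vocab size of 50304 (50257 rounded up to a multiple of 128) is an
# assumption; the real value is computed at runtime from the vocab file.
hidden, layers, ffn, vocab, positions = 1024, 24, 4096, 50304, 1024

embeddings = vocab * hidden + positions * hidden      # token + position tables
per_layer = (
    hidden * 3 * hidden + 3 * hidden                  # fused QKV projection
    + hidden * hidden + hidden                        # attention output projection
    + hidden * ffn + ffn + ffn * hidden + hidden      # MLP h -> 4h -> h
    + 4 * hidden                                      # two LayerNorms (gamma + beta)
)
total = embeddings + layers * per_layer + 2 * hidden  # plus the final LayerNorm
print(f"approx. {total / 1e6:.0f}M parameters")       # ~355M, i.e. the "GPT-2 345M" scale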
+ +"""Dataloaders.""" + +import torch + +from mpu import get_data_parallel_rank, get_data_parallel_world_size + + +def build_data_loader(dataset, train_batch_size, num_workers, drop_last, + task_collate_fn=None): + """Data loader. Note that batch-size is the local (per GPU) batch-size.""" + + # Sampler. + if torch.distributed.is_initialized(): + world_size = get_data_parallel_world_size() + rank = get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank) + else: + sampler = None + + # Data loader. Note that batch size is the per GPU batch size. + data_loader = torch.utils.data.DataLoader(dataset, + batch_size=train_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=task_collate_fn) + + return data_loader diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py b/training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py new file mode 100644 index 000000000..c164f5cdf --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/gpt2_tokenization.py @@ -0,0 +1,263 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for OpenAI GPT.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE + # tokenizer on python 2 right now. + def lru_cache(): + return lambda func: func + + +logger = logging.getLogger(__name__) + +PRETRAINED_VOCAB_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", +} +PRETRAINED_MERGES_ARCHIVE_MAP = { + 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", +} +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'gpt2': 1024, +} +VOCAB_NAME = 'vocab.json' +MERGES_NAME = 'merges.txt' +SPECIAL_TOKENS_NAME = 'special_tokens.txt' + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + _chr = unichr if sys.version_info[0] == 2 else chr + bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \ + list(range(ord("®"), ord("ÿ") + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(object): + """ + GPT-2 BPE tokenizer. Peculiarities: + - Byte-level BPE + """ + + def __init__(self, vocab_file, merges_file, errors='replace', + special_tokens=None, max_len=None): + self.max_len = max_len if max_len is not None else int(1e12) + self.encoder = json.load(open(vocab_file)) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for + # capitalized versions of contractions + self.pat = re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + self.special_tokens = {} + self.special_tokens_decoder = {} + self.set_special_tokens(special_tokens) + + def __len__(self): + return len(self.encoder) + len(self.special_tokens) + + def set_special_tokens(self, special_tokens): + """ Add a list of additional tokens to the encoder. + The additional tokens are indexed starting from the last index of the + current vocabulary in the order of the `special_tokens` list. + """ + if not special_tokens: + self.special_tokens = {} + self.special_tokens_decoder = {} + return + self.special_tokens = dict((tok, len(self.encoder) + i) + for i, tok in enumerate(special_tokens)) + self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()} + logger.info("Special tokens {}".format(self.special_tokens)) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except BaseException: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def tokenize(self, text): + """ Tokenize a string. 
""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = ''.join(self.byte_encoder[ord(b)] for b in token) + else: + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def convert_tokens_to_ids(self, tokens): + """ Converts a sequence of tokens into ids using the vocab. """ + ids = [] + if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.encoder.get(tokens, 0) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.encoder.get(token, 0)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this OpenAI GPT model ({} > {}). Running this" + " sequence through the model will result in indexing errors".format( + len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """Converts a sequence of ids in BPE tokens using the vocab.""" + tokens = [] + for i in ids: + if i in self.special_tokens_decoder: + if not skip_special_tokens: + tokens.append(self.special_tokens_decoder[i]) + else: + tokens.append(self.decoder[i]) + return tokens + + def encode(self, text): + return self.convert_tokens_to_ids(self.tokenize(text)) + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(vocab_path): + logger.error("Vocabulary path ({}) should be a directory".format(vocab_path)) + return + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + merge_file = os.path.join(vocab_path, MERGES_NAME) + special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + index = len(self.encoder) + with open(special_tokens_file, 'w', encoding='utf-8') as writer: + for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(special_tokens_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + + return vocab_file, merge_file, special_tokens_file diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py b/training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py new file mode 100644 index 000000000..4206b43e8 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/gpt_dataset.py @@ -0,0 +1,302 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
All rights reserved. + +"""GPT style dataset.""" + +import json + +import numpy as np +import torch + +from dataloaders.indexed_dataset import make_dataset as make_indexed_dataset +from dataloaders.dataloader import build_data_loader +from dataloaders import get_tokenizer + +import config + +def build_train_test_datasets(train_num_samples, + seq_length, seed, skip_warmup, + train_data_prefix=None, + test_data_prefix=None): + """Build train, valid, and test datasets.""" + # get the tokenizer + tokenizer = get_tokenizer() + + train_dataset, test_dataset = None, None + # Single dataset. + assert train_data_prefix is not None + train_dataset = build_dataset("train", train_data_prefix, + train_num_samples, seq_length, seed, + skip_warmup) + assert test_data_prefix is not None + test_dataset = _LambadaDataset(test_data_prefix, tokenizer.eod, tokenizer, + seq_length) + return (train_dataset, test_dataset) + + +def build_dataset(dataset_name, data_prefix, num_samples, seq_length, seed, skip_warmup): + """ + Build dataset. This method is called when individual + train, valid, test datasets are provided + """ + dataset = None + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + + documents = np.arange(start=0, stop=total_num_of_documents, + step=1, dtype=np.int32) + + dataset = GPTDataset(dataset_name, data_prefix, + documents, indexed_dataset, + num_samples, seq_length, seed) + + return dataset + + +def get_indexed_dataset_(data_prefix, skip_warmup): + """Build indexed dataset.""" + indexed_dataset = make_indexed_dataset(data_prefix, + "mmap", + skip_warmup) + return indexed_dataset + + +class GPTDataset(torch.utils.data.Dataset): + + def __init__(self, name, data_prefix, documents, indexed_dataset, + num_samples, seq_length, seed): + + self.name = name + self.indexed_dataset = indexed_dataset + + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + documents, self.indexed_dataset.sizes, seq_length, seed) + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def __getitem__(self, idx): + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. 
+ sample_list.append(self.indexed_dataset.get( + self.doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + + return {'text': np.array(sample, dtype=np.int64)} + + +def _build_index_mappings(documents, sizes, seq_length, seed): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # doc-idx. + doc_idx = _build_doc_idx(documents, np_rng) + # sample-idx. + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, + tokens_per_epoch) + # shuffle-idx. + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, + sample_idx.shape[0] - 1, np_rng) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _build_doc_idx(documents, np_rng): + """Build an array with length = number-of-epochs * number-of-dcuments. + Each index is mapped to a corresponding document.""" + doc_idx = np.mgrid[0:1, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + + +def _build_sample_idx(sizes, doc_idx, seq_length, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) + + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Begining offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += (remaining_seq_length + doc_length - 1) + remaining_seq_length = 0 + else: + # Otherwise, start from the begining of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. 
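# Illustrative sketch, not part of the patch: what the (document index, offset)
# pairs built by _build_sample_idx above represent. With seq_length = 4 every
# training sample needs seq_length + 1 consecutive tokens, so a sample can start
# in one document and finish in the next. This is a simplified walk; the real
# code additionally overlaps consecutive samples by one token.
doc_sizes = [5, 3, 8]                                # tokens per shuffled document
seq_length = 4
num_samples = (sum(doc_sizes) - 1) // seq_length     # 3 samples for 16 tokens

sample_idx = [(0, 0)]                                # (doc index, offset) per sample start
doc, offset = 0, 0
for _ in range(num_samples):
    remaining = seq_length + 1
    while remaining > 0:
        available = doc_sizes[doc] - offset
        if available > remaining:
            offset += remaining                      # sample ends inside this document
            remaining = 0
        else:
            remaining -= available                   # consume the rest, move to next doc
            doc, offset = doc + 1, 0
    sample_idx.append((doc, offset))
print(sample_idx)                                    # [(0, 0), (1, 0), (2, 2), (2, 7)]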
+ sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + + +def build_train_test_data_dataloaders( + build_train_test_datasets_provider): + """XXX""" + + (train_dataloader, test_dataloader) = (None, None) + + # Number of train/valid/test samples. + train_samples = config.max_steps* config.global_batch_size + + # Build the datasets. + train_ds, test_ds = build_train_test_datasets_provider( + train_num_samples=train_samples) + + # Build dataloders. + train_dataloader = build_data_loader(train_ds, config.train_batch_size, + config.num_workers, drop_last=False) + + test_dataloader = build_data_loader(test_ds, config.train_batch_size, + config.num_workers, drop_last=False) + + # Flags to know if we need to do training/validation/testing. + config.do_train = train_dataloader is not None and config.max_steps> 0 + + return train_dataloader, test_dataloader + + +class _LambadaDataset(torch.utils.data.Dataset): + + def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False): + self.seq_len = seq_len + self.pad_idx = pad_idx + self.tokenizer = tokenizer + self.strict = strict + + self.tokens = [] + self.labels = [] + with open(path, 'r') as f: + for line in f.readlines(): + text = json.loads(line)['text'] + tokens, labels = self.get_tokens(text) + self.tokens.append(tokens) + self.labels.append(labels) + + def get_tokens(self, text): + if not self.strict: + tokens = self.tokenizer.tokenize(text) + return tokens[:-1], [tokens[-1]] + last_token = text.split()[-1] + start_idx = text.rfind(last_token) + beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip()) + last_token = self.tokenizer.tokenize(' ' + last_token) + return beginning_tokens, last_token + + def __len__(self): + return len(self.tokens) + + def __getitem__(self, idx): + tokens = self.tokens[idx] + num_tokens = len(tokens) + pad_mask = [0] * num_tokens + labels = self.labels[idx] + pad_mask += [1] * len(labels) + tokens = tokens + labels + num_tokens = len(tokens) + if num_tokens < self.seq_len + 1: + num_pad = (self.seq_len + 1 - num_tokens) + pad_mask += [0] * (num_pad) + tokens += [self.pad_idx] * num_pad + pad_mask = np.array(pad_mask[1:]) + + return {'text': np.array(tokens), 'pad_mask': pad_mask} + diff --git a/training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py b/training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py new file mode 100644 index 000000000..661022a39 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/dataloaders/indexed_dataset.py @@ -0,0 +1,344 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
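# Illustrative sketch, not part of the patch: how the strict branch of
# _LambadaDataset.get_tokens() above splits a LAMBADA passage into context
# tokens and the single target word for last-word (cloze) accuracy. A plain
# whitespace tokenizer stands in for the real GPT2BPETokenizer, so the
# "token ids" here are just strings.
class WhitespaceTokenizer:
    def tokenize(self, text):
        return text.split()

def get_tokens_strict(text, tokenizer):
    last_word = text.split()[-1]
    start_idx = text.rfind(last_word)
    context = tokenizer.tokenize(text[:start_idx].strip())
    target = tokenizer.tokenize(" " + last_word)
    return context, target

ctx, tgt = get_tokens_strict("the cat sat on the mat", WhitespaceTokenizer())
print(ctx, tgt)   # ['the', 'cat', 'sat', 'on', 'the'] ['mat']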
+ + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. + +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) + + +def make_dataset(path, impl='mmap', skip_warmup=False): + if not MMapIndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: float, + 7: np.double, + 8: np.uint16 +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +def _warmup_mmap_file(path): + with open(path, 'rb') as stream: + while stream.read(100 * 1024 * 1024): + pass + + +class MMapIndexedDataset(torch.utils.data.Dataset): + class Index(object): + _HDR_MAGIC = b'MMIDIDX\x00\x00' + + @classmethod + def writer(cls, path, dtype): + class _Writer(object): + def __enter__(self): + self._file = open(path, 'wb') + + self._file.write(cls._HDR_MAGIC) + self._file.write(struct.pack(''] + + @property + def vocab_size(self): + return len(self.tokenizer.encoder) + + @property + def vocab(self): + return self.tokenizer.encoder + + @property + def inv_vocab(self): + return self.tokenizer.decoder + + def tokenize(self, text): + return self.tokenizer.encode(text) + + def detokenize(self, token_ids): + return self.tokenizer.decode(token_ids) + + @property + def eod(self): + return self.eod_id + diff --git a/training/benchmarks/gpt2/pytorch/model/__init__.py b/training/benchmarks/gpt2/pytorch/model/__init__.py new file mode 100644 index 000000000..8a4b8c43c --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/__init__.py @@ -0,0 +1,13 @@ +from model.models.gpt_model import GPTModel + +def create_model(args): + # config.resume_step = 0 + + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True, + ) + + return None, model, diff --git a/training/benchmarks/gpt2/pytorch/model/layers/__init__.py b/training/benchmarks/gpt2/pytorch/model/layers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py b/training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py new file mode 100644 index 000000000..29222db02 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/fused_bias_gelu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch + + +###### BIAS GELU FUSION/ NO AUTOGRAD ################ +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + +@torch.jit.script +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + +bias_gelu_impl = GeLUFunction.apply diff --git a/training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py b/training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py new file mode 100644 index 000000000..2f359d166 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/fused_softmax.py @@ -0,0 +1,212 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +import torch +import torch.nn as nn + +from model.models.enums import AttnMaskType + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_upper_triang_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_cuda.forward( + inputs, scale_t[0] + ) + + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_upper_triang_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + + return input_grads, None + + +class ScaledMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply the mask. + 3. Perform softmax. 
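# Illustrative sketch, not part of the patch: numerically comparing the tanh
# form used by bias_gelu() above with the exact erf-based GELU it approximates.
# The two agree to within a few 1e-4 over typical activation ranges, which is
# why the fused kernel can use the cheaper tanh expression.
import torch

x = torch.linspace(-6, 6, steps=1001)
gelu_exact = x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
gelu_tanh = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
print(f"max abs difference: {(gelu_exact - gelu_tanh).abs().max():.2e}")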
+ """ + + @staticmethod + def forward(ctx, inputs, mask, scale): + import scaled_masked_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_masked_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_masked_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class ScaledSoftmax(torch.autograd.Function): + """ + Fused operation which performs following two operations in sequence + 1. Scale the tensor. + 2. Perform softmax. + """ + + @staticmethod + def forward(ctx, inputs, scale): + import scaled_softmax_cuda + + scale_t = torch.tensor([scale]) + + softmax_results = scaled_softmax_cuda.forward( + inputs, scale_t[0] + ) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + import scaled_softmax_cuda + + softmax_results, scale_t = ctx.saved_tensors + + input_grads = scaled_softmax_cuda.backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None, None + + +class FusedScaleMaskSoftmax(nn.Module): + """ + fused operation: scaling + mask + softmax + + Arguments: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. + """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super(FusedScaleMaskSoftmax, self).__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + assert not ( + self.input_in_fp16 and self.input_in_bf16 + ), "both fp16 and bf16 flags cannot be active at the same time." 
+ self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + assert ( + self.scale is None or softmax_in_fp32 + ), "softmax should be in fp32 when scaled" + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + # if self.is_kernel_available(mask, *input.size()): + # return self.forward_fused_softmax(input, mask) + # else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and 16 < sk <= 4096 # sk must be 16 ~ 2048 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 4096: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + b, np, sq, sk = input.size() + scale = self.scale if self.scale is not None else 1.0 + + if self.attn_mask_type == AttnMaskType.causal: + assert sq == sk, "causal mask is only for self attention" + + # input is 3D tensor (attn_batches, sq, sk) + input = input.view(-1, sq, sk) + probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale) + return probs.view(b, np, sq, sk) + else: + # input is 4D tensor (b, np, sq, sk) + if mask is not None: + return ScaledMaskedSoftmax.apply(input, mask, scale) + else: + return ScaledSoftmax.apply(input, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + import scaled_masked_softmax_cuda + + return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np) diff --git a/training/benchmarks/gpt2/pytorch/model/layers/layers.py b/training/benchmarks/gpt2/pytorch/model/layers/layers.py new file mode 100644 index 000000000..cc7cf6194 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/layers.py @@ -0,0 +1,288 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +# Parts of the code here are adapted from PyTorch +# repo: https://github.com/pytorch/pytorch + +import torch +import torch.nn.functional as F +import torch.nn.init as init +from torch.nn.parameter import Parameter + +from model.layers.utils import divide + + +def _initialize_affine_weight_cpu(weight, output_size, input_size, + per_partition_size, partition_dim, + init_method, stride=1, + return_master_weight=False, + *, params_dtype=torch.float32): + """Initialize affine weight for model parallel. 
+ + Build the master weight on all processes and scatter + the relevant chunk.""" + + # Initialize master weight + master_weight = torch.empty(output_size, input_size, + dtype=torch.float, + requires_grad=False) + init_method(master_weight) + master_weight = master_weight.to(dtype=params_dtype) + + # Split and copy + per_partition_per_stride_size = divide(per_partition_size, stride) + weight_list = torch.split(master_weight, per_partition_per_stride_size, + dim=partition_dim) + my_weight_list = weight_list + + with torch.no_grad(): + torch.cat(my_weight_list, dim=partition_dim, out=weight) + if return_master_weight: + return master_weight + return None + + +class VocabParallelEmbedding(torch.nn.Module): + """Embedding parallelized in the vocabulary dimension. + + This is mainly adapted from torch.nn.Embedding and all the default + values are kept. + Arguments: + num_embeddings: vocabulary size. + embedding_dim: size of hidden state. + + Keyword Arguments: + init_method: method to initialize weights. + params_dtype + use_cpu_initialization + perform_initialization + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, *, + init_method=init.xavier_normal_, + params_dtype: torch.dtype=torch.float32, + use_cpu_initialization: bool=False, + perform_initialization: bool=True): + super(VocabParallelEmbedding, self).__init__() + # Keep the input dimensions. + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + # Set the detauls for compatibility. + self.padding_idx = None + self.max_norm = None + self.norm_type = 2. + self.scale_grad_by_freq = False + self.sparse = False + self._weight = None + # Divide the weight matrix along the vocaburaly dimension. + self.vocab_start_index = 0 + self.vocab_end_index = self.num_embeddings + + self.num_embeddings_per_partition = self.vocab_end_index - \ + self.vocab_start_index + + # Allocate weights and initialize. + self.weight = Parameter(torch.empty( + self.num_embeddings_per_partition, self.embedding_dim, + dtype=params_dtype)) + _initialize_affine_weight_cpu( + self.weight, self.num_embeddings, self.embedding_dim, + self.num_embeddings_per_partition, 0, init_method, + params_dtype=params_dtype) + self.weight.data = self.weight.data.cuda() + + def forward(self, input_): + masked_input = input_ + # Get the embeddings. + output = F.embedding(masked_input, self.weight, + self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, + self.sparse) + return output + + +class ColumnParallelLinear(torch.nn.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + + Keyword Arguments + bias: If true, add bias + gather_output: If true, call all-gather on output and make Y available + to all GPUs, otherwise, every GPU will have its output + which is Y_i = XA_i + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: This was added to enable performance optimations where bias + can be fused with other elementwise operations. we skip + adding bias but instead return it. 
+ params_dtype: + use_cpu_initialization: + gradient_accumulation_fusion: + sequence_parallel_enabled: + """ + + def __init__(self, input_size, output_size, *, + bias=True, gather_output=True, + init_method=init.xavier_normal_, stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + params_dtype=torch.float32, + use_cpu_initialization=False, + perform_initialization=True, + gradient_accumulation_fusion=False, + sequence_parallel_enabled: bool = False, + ): + super(ColumnParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + # Divide the weight matrix along the last dimension. + self.skip_bias_add = skip_bias_add + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + self.weight = Parameter(torch.empty(self.output_size, + self.input_size, + dtype=params_dtype)) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, self.output_size, self.input_size, + self.output_size, 0, init_method, + stride=stride, return_master_weight=keep_master_weight_for_test) + if bias: + self.bias = Parameter(torch.empty( + self.output_size, dtype=params_dtype)) + + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + + def forward(self, input_): + """Forward of ColumnParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + Returns: + - output + - bias + """ + bias = self.bias if not self.skip_bias_add else None + output = F.linear(input_, self.weight, bias) + output_bias = self.bias if self.skip_bias_add else None + return output, output_bias + + +class RowParallelLinear(torch.nn.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + Arguments: + input_size: first dimension of matrix A. + output_size: second dimension of matrix A. + + Keyword Arguments: + bias: If true, add bias. Note that bias is not parallelized. + input_is_parallel: If true, we assume that the input is already + split across the GPUs and we do not split + again. + init_method: method to initialize weights. Note that bias is always set + to zero. + stride: For the strided linear layers. + keep_master_weight_for_test: This was added for testing and should be + set to False. It returns the master weights + used for initialization. + skip_bias_add: This was added to enable performance optimization where bias + can be fused with other elementwise operations. We skip + adding bias but instead return it. + params_dtype: + use_cpu_initialization: + perform_initialization: + gradient_accumulation_fusion: + sequence_parallel_enabled: + """ + + def __init__(self, input_size, output_size, *, + bias=True, input_is_parallel=False, + init_method=init.xavier_normal_, stride=1, + keep_master_weight_for_test=False, + skip_bias_add=False, + params_dtype=torch.float32, + use_cpu_initialization=False, + perform_initialization=True, + gradient_accumulation_fusion=False, + sequence_parallel_enabled: bool = False, + ): + super(RowParallelLinear, self).__init__() + + # Keep input parameters + self.input_size = input_size + self.output_size = output_size + # Divide the weight matrix along the last dimension. 
+ self.skip_bias_add = skip_bias_add + + # Parameters. + # Note: torch.nn.functional.linear performs XA^T + b and as a result + # we allocate the transpose. + # Initialize weight. + self.weight = Parameter(torch.empty(self.output_size, + self.input_size, + dtype=params_dtype)) + self.master_weight = _initialize_affine_weight_cpu( + self.weight, self.output_size, self.input_size, + self.input_size, 1, init_method, + stride=stride, return_master_weight=keep_master_weight_for_test, + params_dtype=params_dtype) + + if bias: + self.bias = Parameter(torch.empty(self.output_size, + dtype=params_dtype)) + setattr(self.bias, 'sequence_parallel', sequence_parallel_enabled) + + # Always initialize bias to zero. + with torch.no_grad(): + self.bias.zero_() + else: + self.register_parameter('bias', None) + + + + def forward(self, input_): + """Forward of RowParallelLinear + + Args: + input_: 3D tensor whose order of dimension is [sequence, batch, hidden] + + Returns: + - output + - bias + """ + if not self.skip_bias_add: + output = F.linear(input_, self.weight, self.bias) + output_bias = None + else: + output = F.linear(input_, self.weight, None) + output_bias = self.bias + return output, output_bias diff --git a/training/benchmarks/gpt2/pytorch/model/layers/transformer.py b/training/benchmarks/gpt2/pytorch/model/layers/transformer.py new file mode 100644 index 000000000..ded57e471 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/transformer.py @@ -0,0 +1,985 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Transformer.""" +import math +from typing import Optional, List +from contextlib import nullcontext + +import numpy as np +import torch +import torch.nn.functional as F +from torch.nn import LayerNorm + +from model.models.module import MegatronModule +from model.layers import layers +from model.models.enums import AttnMaskType, LayerType, AttnType +from model.layers.fused_softmax import FusedScaleMaskSoftmax +from model.layers.fused_bias_gelu import bias_gelu_impl +from model.layers.utils import attention_mask_func +import config + +rearrange = None + +flash_attn_unpadded_func = None + + +def split_tensor_along_last_dim( + tensor: torch.Tensor, + num_partitions: int, + contiguous_split_chunks: bool = False, +) -> List[torch.Tensor]: + """ Split a tensor along its last dimension. + + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + + Returns: + A list of Tensors + """ + # Get the size and dimension. + last_dim = tensor.dim() - 1 + last_dim_size = tensor.size()[last_dim] // num_partitions + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + +def _kernel_make_viewless_tensor(inp, requires_grad): + '''Make a viewless tensor. + + View tensors have the undesirable side-affect of retaining a reference + to the originally-viewed tensor, even after manually setting the '.data' + field. This method creates a new tensor that links to the old tensor's + data, without linking the viewed tensor, referenced via the '._base' + field. 
+ ''' + out = torch.empty( + (1,), + dtype = inp.dtype, + device = inp.device, + requires_grad = requires_grad, + ) + out.data = inp.data + return out + +class MakeViewlessTensor(torch.autograd.Function): + ''' + Autograd function to make a viewless tensor. + + This function should be used in cases where the computation graph needs + to be propagated, but we only want a viewless tensor (e.g., + ParallelTransformer's hidden_states). Call this function by passing + 'keep_graph = True' to 'make_viewless_tensor()'. + ''' + @staticmethod + def forward(ctx, inp, requires_grad): + return _kernel_make_viewless_tensor(inp, requires_grad) + @staticmethod + def backward(ctx, grad_output): + return grad_output, None + +def make_viewless_tensor(inp, requires_grad, keep_graph): + ''' + Entry-point for creating viewless tensors. + + This method should be used, rather than calling 'MakeViewlessTensor' + or '_kernel_make_viewless_tensor' directly. This method acts as a + switch for determining if an autograd function or a regular method + should be used to create the tensor. + ''' + + # return tensor as-is, if not a 'view' + if inp._base is None: + return inp + + # create viewless tensor + if keep_graph: + return MakeViewlessTensor.apply(inp, requires_grad) + else: + return _kernel_make_viewless_tensor(inp, requires_grad) + + +""" We use the following notation throughout this file: + h: hidden size + n: number of attention heads + p: number of model parallel partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + l: number of layers + Transformer takes input of size [s, b, h] and returns a + tensor of the same size. We use the following arguments: + hyperparameters: transformer hyperparameters +""" + +class DropPath(MegatronModule): + """Drop paths (Stochastic Depth) per sample + (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=0.): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_state): + if self.drop_prob == 0. or not self.training: + return hidden_state + keep_prob = 1 - self.drop_prob + # work with diff dim tensors, not just 2D ConvNets + # hidden_state: [s, b, h] + shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2) + random_tensor = keep_prob + \ + torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device) + random_tensor.floor_() # binarize + output = hidden_state.div(keep_prob) * random_tensor + return output + +def _args_to_kwargs(): + + common_kwargs = { + "params_dtype": config.params_dtype, + "use_cpu_initialization": True, + "perform_initialization": True, + "gradient_accumulation_fusion": False, + "sequence_parallel_enabled": False, + } + return common_kwargs + +class ParallelMLP(MegatronModule): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, init_method, output_layer_init_method): + super(ParallelMLP, self).__init__() + + # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = layers.ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + gather_output=False, + init_method=init_method, + skip_bias_add=True, + **_args_to_kwargs()) + + self.bias_gelu_fusion = False + self.activation_func = None + + self.bias_gelu_fusion = config.bias_gelu_fusion + self.activation_func = F.gelu + + # Project back to h. 
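# Illustrative sketch, not part of the patch: with a single model-parallel
# partition (as in this port), the ParallelMLP above computes the same function
# as a plain h -> 4h -> GELU -> h feed-forward block. Sizes follow
# config/_base.py (hidden_size=1024, ffn_hidden_size=4096).
import torch
from torch import nn

mlp = nn.Sequential(
    nn.Linear(1024, 4096),           # dense_h_to_4h
    nn.GELU(approximate="tanh"),     # matches the fused bias-gelu approximation
    nn.Linear(4096, 1024),           # dense_4h_to_h
)
out = mlp(torch.randn(8, 4, 1024))   # [s, b, h] in, same shape out
print(out.shape)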
+ self.dense_4h_to_h = layers.RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + **_args_to_kwargs()) + + def forward(self, hidden_states): + + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) + + if self.bias_gelu_fusion: + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + intermediate_parallel = self.activation_func(intermediate_parallel + bias_parallel) + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + return output, output_bias + +class CoreAttention(MegatronModule): + + def __init__(self, layer_number, + attn_mask_type=AttnMaskType.padding): + super(CoreAttention, self).__init__() + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.sequence_parallel = False + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_partition = projection_size + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + self.fp16, self.bf16, + self.attn_mask_type, + config.masked_softmax_fusion, + attention_mask_func, + self.attention_softmax_in_fp32, + coeff) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, + value_layer, attention_mask): + + # =================================== + # Raw attention scores. [b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), + query_layer.size(2), + query_layer.size(0), + key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = torch.zeros( + (output_size[0]*output_size[1], output_size[2], output_size[3]), + dtype=query_layer.dtype, device=key_layer.device) + + # Raw attention scores. 
[b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, alpha=(1.0/self.norm_factor)) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), + output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class FlashSelfAttention(torch.nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, + device=None, dtype=None): + super().__init__() + assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, ' + 'e.g., with pip install flash-attn') + assert rearrange is not None, 'Please install einops first, e.g., with pip install einops' + self.causal = causal + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, q, k, v): + """Implements the multihead softmax attention. + Arguments + --------- + q, k, v: The tensor containing the query, key, and value. (B, S, H, D) + """ + + assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v))) + assert all((i.is_cuda for i in (q,k,v))) + + batch_size, seqlen_q = q.shape[0], q.shape[1] + seqlen_k = k.shape[1] + + q, k, v = [rearrange(x, 'b s ... 
-> (b s) ...') for x in [q, k, v]] + cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, + device=q.device) + + if self.training: + # during training q,k,v always have same seqlen + assert seqlen_k == seqlen_q + + is_causal = self.causal + cu_seqlens_k = cu_seqlens_q + else: + # turn off FA causal mask after first inference autoregressive iteration + # only on first autoregressive step q,k,v have same seqlen + is_causal = seqlen_q == seqlen_k + cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, + device=q.device) + self.dropout_p = 0 + + output = flash_attn_unpadded_func( + q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, + self.dropout_p, + softmax_scale=self.softmax_scale, causal=is_causal + ) + + output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + return output + + +class ParallelAttention(MegatronModule): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, init_method, + output_layer_init_method, layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.padding): + super(ParallelAttention, self).__init__() + self.layer_number = max(1, layer_number) + self.attention_type = attention_type + self.attn_mask_type = attn_mask_type + self.params_dtype = config.params_dtype + self.sequence_parallel = False + + self.use_flash_attn = config.use_flash_attn \ + and attention_type == AttnType.self_attn \ + and self.attn_mask_type == AttnMaskType.causal + if self.use_flash_attn: + if flash_attn_unpadded_func is None: + raise ImportError('FlashAttention is not installed, please install with ' + 'pip install flash-attn') + assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports ' + 'self-attention for now') + assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only ' + 'supports causal mask for now') + if rearrange is None: + raise ImportError('einops is not installed, please install with pip install einops') + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads + + # Strided linear layer. + if attention_type == AttnType.self_attn: + self.query_key_value = layers.ColumnParallelLinear( + config.hidden_size, + 3 * projection_size, + gather_output=False, + init_method=init_method, + **_args_to_kwargs()) + else: + assert attention_type == AttnType.cross_attn + self.query = layers.ColumnParallelLinear( + config.hidden_size, + projection_size, + gather_output=False, + init_method=init_method, + **_args_to_kwargs()) + + + self.key_value = layers.ColumnParallelLinear( + config.hidden_size, + 2 * projection_size, + gather_output=False, + init_method=init_method, + **_args_to_kwargs()) + + self.core_attention = CoreAttention(self.layer_number, + self.attn_mask_type) + + if self.use_flash_attn: + self.core_attention_flash = FlashSelfAttention( + causal=True, attention_dropout=config.attention_dropout + ) + + # Output. 
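# Illustrative sketch, not part of the patch: the core computation CoreAttention
# above performs with baddbmm/bmm, written with einsum for readability. Inputs
# follow the [sq, b, np, hn] layout used throughout the file; query-key layer
# scaling and attention dropout from the real code are omitted here.
import torch

def reference_attention(q, k, v, causal_mask, norm_factor):
    # q, k, v: [s, b, np, hn]; causal_mask: [s, s] bool, True = masked out
    scores = torch.einsum("sbnh,tbnh->bnst", q, k) / norm_factor   # [b, np, sq, sk]
    scores = scores.masked_fill(causal_mask, float("-inf"))
    probs = torch.softmax(scores, dim=-1)
    context = torch.einsum("bnst,tbnh->sbnh", probs, v)            # [sq, b, np, hn]
    s, b, n, h = context.shape
    return context.reshape(s, b, n * h)                            # [sq, b, hp]

s, b, n, h = 5, 2, 16, 64
q = k = v = torch.randn(s, b, n, h)
mask = torch.triu(torch.ones(s, s, dtype=torch.bool), diagonal=1)
print(reference_attention(q, k, v, mask, norm_factor=h ** 0.5).shape)   # [5, 2, 1024]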
+ self.dense = layers.RowParallelLinear( + projection_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True, + **_args_to_kwargs()) + + def _allocate_memory(self, inference_max_sequence_len, batch_size): + return torch.empty( + inference_max_sequence_len, + batch_size, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + dtype=self.params_dtype, + device=torch.cuda.current_device()) + + def forward(self, hidden_states, attention_mask, + encoder_output=None, inference_params=None, + rotary_pos_emb=None): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + if inference_params: + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_len = inference_params.max_sequence_len + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_value_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, inference_value_memory) + else: + inference_key_memory, inference_value_memory = \ + inference_params.key_value_memory_dict[self.layer_number] + + # ===================== + # Query, Key, and Value + # ===================== + + if self.attention_type == AttnType.self_attn: + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, + key_layer, + value_layer) = torch.split(mixed_x_layer, mixed_x_layer.size()[-1] // 3, mixed_x_layer.dim() - 1) + + else: + # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] + mixed_kv_layer, _ = self.key_value(encoder_output) + + # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn] + new_tensor_shape = mixed_kv_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 2 * self.hidden_size_per_attention_head) + mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape) + + # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn] + (key_layer, + value_layer) = torch.split(mixed_kv_layer, mixed_kv_layer.size()[-1] // 2, mixed_kv_layer.dim() - 1) + + # Attention head [sq, b, h] --> [sq, b, hp] + query_layer, _ = self.query(hidden_states) + # [sq, b, hp] --> [sq, b, np, hn] + new_tensor_shape = query_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + query_layer = query_layer.view(*new_tensor_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + if inference_params: + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] 
= key_layer + inference_value_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[ + :sequence_end, batch_start:batch_end, ...] + value_layer = inference_value_memory[ + :sequence_end, batch_start:batch_end, ...] + + # ================================== + # core attention computation + # ================================== + + if not self.use_flash_attn: + context_layer = self.core_attention( + query_layer, key_layer, value_layer, attention_mask) + else: + q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() + for x in (query_layer, key_layer, value_layer)] + if not self.sequence_parallel: + context_layer = self.core_attention_flash(q, k, v) + else: + context_layer = self.core_attention_flash(q, k, v) + context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + + # ================= + # Output. [sq, b, h] + # ================= + + output, bias = self.dense(context_layer) + return output, bias + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x: torch.Tensor, + bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x: torch.Tensor, + bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +class ParallelTransformerLayer(MegatronModule): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, init_method, output_layer_init_method, + layer_number, layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + drop_path_rate=0.): + + super(ParallelTransformerLayer, self).__init__() + self.layer_number = layer_number + self.layer_type = layer_type + + self.apply_residual_connection_post_layernorm \ + = config.apply_residual_connection_post_layernorm + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + + # Layernorm on the input data. + # self.input_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.input_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + # eps=config.layernorm_epsilon, + # no_persist_layer_norm=config.no_persist_layer_norm, + # sequence_parallel=config.sequence_parallel) + + # Self attention. 
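# Illustrative aside (not part of the patch): the bias+dropout+add helper defined
# above, checked standalone with p=0 so the result is deterministic and equals
# residual + x + bias.
import torch

def bias_dropout_add(x, bias, residual, prob, training):
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    return residual + out

x, bias, residual = torch.randn(4, 3), torch.randn(3), torch.randn(4, 3)
out = bias_dropout_add(x, bias.expand_as(residual), residual, prob=0.0, training=True)
assert torch.allclose(out, residual + x + bias)
# (end of aside)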
+ self.self_attention = ParallelAttention( + init_method, + output_layer_init_method, + layer_number, + attention_type=AttnType.self_attn, + attn_mask_type=self_attn_mask_type) + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None + + # Layernorm on the attention output + # self.post_attention_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.post_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + # eps=config.layernorm_epsilon, + # no_persist_layer_norm=config.no_persist_layer_norm, + # sequence_parallel=config.sequence_parallel) + + if self.layer_type == LayerType.decoder: + self.inter_attention = ParallelAttention( + init_method, + output_layer_init_method, + layer_number, + attention_type=AttnType.cross_attn) + # Layernorm on the attention output. + # self.post_inter_attention_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.post_inter_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + # eps=config.layernorm_epsilon, + # no_persist_layer_norm=config.no_persist_layer_norm, + # sequence_parallel=config.sequence_parallel) + + # MLP + self.mlp = ParallelMLP(init_method, output_layer_init_method) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = \ + nullcontext if use_nvfuser else torch.enable_grad + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, attention_bias = \ + self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + if self.drop_path is None: + # jit scripting for a nn.module (with dropout) is not + # trigerring the fusion kernel. For now, we use two + # different nn.functional routines to account for varying + # dropout semantics during training and inference phases. + if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, + attention_bias.expand_as(residual), + residual, + self.hidden_dropout) + else: + out = torch.nn.functional.dropout(attention_output + attention_bias, + p=self.hidden_dropout, + training=self.training) + layernorm_input = residual + self.drop_path(out) + + # Layer norm post the self attention. 
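# Illustrative aside (not part of the patch): the version gate above picks the
# execution context for the fused bias+dropout+add path -- nullcontext on
# torch >= 1.10, otherwise torch.enable_grad. A quick way to see what it resolves
# to on the current install:
import torch
from contextlib import nullcontext

major, minor = (int(v) for v in torch.__version__.split('.')[:2])
handler = nullcontext if (major > 1 or (major == 1 and minor >= 10)) else torch.enable_grad
print(torch.__version__, handler)
# (end of aside)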
+ layernorm_output = self.post_attention_layernorm(layernorm_input) + + if self.layer_type == LayerType.decoder: + attention_output, attention_bias = \ + self.inter_attention(layernorm_output, + enc_dec_attn_mask, + encoder_output=encoder_output) + # residual connection + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, + attention_bias.expand_as(residual), + residual, + self.hidden_dropout) + + # Layer norm post the decoder attention + layernorm_output = self.post_inter_attention_layernorm(layernorm_input) + + # MLP. + mlp_output, mlp_bias = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + if self.drop_path is None: + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func( + mlp_output, + mlp_bias.expand_as(residual), + residual, + self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = make_viewless_tensor(inp = output, + requires_grad = output.requires_grad, + keep_graph = True) + + else: + out = torch.nn.functional.dropout(mlp_output + mlp_bias, + p=self.hidden_dropout, + training=self.training) + output = residual + self.drop_path(out) + + return output + + +class NoopTransformerLayer(MegatronModule): + """A single 'no-op' transformer layer. + + The sole purpose of this layer is for when a standalone embedding layer + is used (i.e., args.standalone_embedding_stage == True). In this case, + zero transformer layers are assigned when pipeline rank == 0. Additionally, + when virtual pipeline rank >= 1, zero total model parameters are created + (virtual rank 0 contains the input embedding). This results in the model's + input and output tensors being the same, which causes an error when + performing certain memory optimiations on the output tensor (e.g., + deallocating it). Thus, this layer disconnects the input from the output + via a clone. Since ranks containing a no-op layer are generally under- + utilized (both compute and memory), there's no worry of any performance + degredation. 
+ """ + + def __init__(self, layer_number): + super().__init__() + self.layer_number = layer_number + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + return hidden_states.clone() + + +def _get_num_layers(is_decoder=False): + """Compute the number of transformer layers resident on the current rank.""" + if not is_decoder: + num_layers = config.encoder_num_layers + else: + num_layers = config.decoder_num_layers + return num_layers + + +class ParallelTransformer(MegatronModule): + """Transformer class.""" + + def __init__(self, init_method, output_layer_init_method, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.padding, + post_layer_norm=True, + pre_process=True, post_process=True, + drop_path_rate=0.0): + super(ParallelTransformer, self).__init__() + + self.layer_type = layer_type + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + self.input_tensor = None + self.drop_path_rate = drop_path_rate + self.transformer_impl = config.transformer_impl + + self.sequence_parallel = False + + self.microbatch_count = 0 + + # Number of layers. + self.num_layers = _get_num_layers( + layer_type == LayerType.decoder) + + self.drop_path_rates = [rate.item() for rate in torch.linspace(0, self.drop_path_rate, config.num_layers)] + + # Transformer layers. + def build_layer(layer_number): + return ParallelTransformerLayer( + init_method, + output_layer_init_method, + layer_number, + layer_type=layer_type, + self_attn_mask_type=self_attn_mask_type, + drop_path_rate=self.drop_path_rates[layer_number - 1]) + + offset = 0 + + if self.num_layers == 0: + # When a standalone embedding stage is used (e.g., + # args.standalone_embedding_stage == True), virtual pipeline ranks + # on pipeline rank 0 will have zero transformer layers assigned to + # them. This results in the model's input and output tensors to be + # the same, which will cause failure for certain output tensor + # optimizations (e.g., pipeline output deallocation). To remedy + # this, we assign a 'no-op' layer on these ranks, which will + # disconnect the input tensor from the output tensor. + self.num_layers = 1 + self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ]) + else: + self.layers = torch.nn.ModuleList( + [build_layer(i + 1 + offset) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + # self.final_layernorm = LayerNorm( + # config.hidden_size, + # eps=config.layernorm_epsilon) + self.final_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + dtype=torch.float) + #no_persist_layer_norm=config.no_persist_layer_norm, + #sequence_parallel=config.sequence_parallel) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def set_input_tensor(self, input_tensor): + """Set input tensor to be used instead of forward()'s input. + + When doing pipeline parallelism the input from the previous + stage comes from communication, not from the input, so the + model's forward_step_func won't have it. 
This function is thus + used by internal code to bypass the input provided by the + forward_step_func""" + self.input_tensor = input_tensor + + def forward(self, hidden_states, attention_mask, + encoder_output=None, enc_dec_attn_mask=None, + inference_params=None): + # hidden_states: [s, b, h] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. + # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = make_viewless_tensor( + hidden_states, + requires_grad=True, + keep_graph=True, + ) + + rng_context = nullcontext() + + with rng_context: + # The fp8_autocast context manager is a no-op when enabled=True + # The if...else serves to short circuit name resolution for fp8_autocast + with nullcontext(): + + # Forward pass. + forward_kwargs = { + 'encoder_output': encoder_output, + 'enc_dec_attn_mask': enc_dec_attn_mask, + 'inference_params': inference_params, + } + + for index in range(self.num_layers): + layer = self._get_layer(index) + hidden_states = layer( + hidden_states, + attention_mask, + **forward_kwargs) + + # Skip counter update for eval and activation checkpointing + if torch.is_grad_enabled() and self.training: + self.microbatch_count += 1 + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/model/layers/utils.py b/training/benchmarks/gpt2/pytorch/model/layers/utils.py new file mode 100644 index 000000000..c1e5e39a8 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/layers/utils.py @@ -0,0 +1,37 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
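# Illustrative aside (not part of the patch): the per-layer drop-path rates built in
# the ParallelTransformer constructor above form a linear ramp from 0 to
# drop_path_rate over num_layers entries. Values below are made up; the patch reads
# them from config.
import torch

drop_path_rate, num_layers = 0.1, 5
rates = [r.item() for r in torch.linspace(0, drop_path_rate, num_layers)]
print(rates)   # ~[0.0, 0.025, 0.05, 0.075, 0.1]
# (end of aside)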
+ +"""Utilities for layers.""" + +import math + +import torch + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * + (1.0 + 0.044715 * x * x))) +def openai_gelu(x): + return gelu_impl(x) + +#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter +@torch.jit.script +def erf_gelu(x): + return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, "{} is not divisible by {}".format( + numerator, denominator + ) + + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/model/losses/__init__.py b/training/benchmarks/gpt2/pytorch/model/losses/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py b/training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py new file mode 100755 index 000000000..0722a9cde --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/losses/cross_entropy.py @@ -0,0 +1,32 @@ +import torch + + +def cross_entropy(outputs, target): + """ + Compute the cross entropy loss of output and target. + + para: outputs, [b, s, vocab_size] + target, [b, s] + return: loss, [b, s] + """ + + logits = outputs.clone() + # logits = outputs + logits_max = torch.max(logits, dim=-1)[0] + + # Subtract the maximum value. + logits.sub_(logits_max.unsqueeze(dim=-1)) + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = logits.exp() + sum_exp_logits = exp_logits.sum(dim=-1) + + logits_2d = logits.view(-1, logits.size()[-1]) + target_1d = target.view(-1) + arange_1d = torch.arange(start=0, + end=logits_2d.size()[0], + device=logits_2d.device) + predit_ligits_1d = logits_2d[arange_1d, target_1d] + predit_ligits = predit_ligits_1d.view_as(target) + + loss = torch.log(sum_exp_logits) - predit_ligits + return loss diff --git a/training/benchmarks/gpt2/pytorch/model/models/__init__.py b/training/benchmarks/gpt2/pytorch/model/models/__init__.py new file mode 100644 index 000000000..a00a56fac --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ +try: + from ..layers.fused_layer_norm import MixedFusedLayerNorm as LayerNorm +except Exception: + # Make LayerNorm has the same parameters as FusedLayerNorm + from torch.nn import LayerNorm as TorchLayerNorm + class LayerNorm(TorchLayerNorm): + """Inherit from torch.nn.LayerNorm but eliminate extra kwargs""" + def __init__(self, normalized_shape, eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_layernorm_1p=False): + super().__init__( + normalized_shape, eps = eps) + +from .utils import RMSNorm +from .gpt_model import GPTModel +from .language_model import get_language_model +from .module import Float16Module diff --git a/training/benchmarks/gpt2/pytorch/model/models/enums.py b/training/benchmarks/gpt2/pytorch/model/models/enums.py new file mode 100644 index 000000000..2ea483051 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/enums.py @@ -0,0 +1,25 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +import enum + +class LayerType(enum.Enum): + encoder = 1 + decoder = 2 + retro_encoder = 3 + retro_decoder = 4 + retro_decoder_with_retriever = 5 + +class AttnType(enum.Enum): + self_attn = 1 + cross_attn = 2 + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + +# For backward compatibility with old model checkpoints +class ModelType(enum.Enum): + encoder_or_decoder = 1 + encoder_and_decoder = 2 + retro_encoder = 3 + retro_decoder = 4 diff --git a/training/benchmarks/gpt2/pytorch/model/models/gpt_model.py b/training/benchmarks/gpt2/pytorch/model/models/gpt_model.py new file mode 100644 index 000000000..d43ee2f22 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/gpt_model.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""GPT-2 model.""" + +import torch + +import config +from model.models.module import MegatronModule +from model.losses.cross_entropy import cross_entropy +from model.models.enums import AttnMaskType +from model.models.language_model import get_language_model +from model.models.utils import init_method_normal,scaled_init_method_normal +import math + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +def post_language_model_processing(lm_output, labels, logit_weights, fp16_lm_cross_entropy): + # Output. 
Format [s b h] + output = torch.nn.functional.linear(lm_output, logit_weights, None) + + if labels is None: + # [s b h] => [b s h] + return output.transpose(0,1).contiguous() + else: + # [b s] => [s b] + labels = labels.transpose(0,1).contiguous() + if fp16_lm_cross_entropy: + assert output.dtype == torch.half + loss = cross_entropy(output, labels) + else: + loss = cross_entropy(output.float(), labels) + # [s b] => [b, s] + loss = loss.transpose(0,1).contiguous() + return loss + + +class GPTModel(MegatronModule): + """GPT-2 Language model.""" + + def __init__(self, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True): + super(GPTModel, self).__init__() + + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = config.fp16_lm_cross_entropy + + self.language_model, self._language_model_key = get_language_model( + num_tokentypes=num_tokentypes, + add_pooler=False, + encoder_attn_mask_type=AttnMaskType.causal, + init_method=init_method_normal(config.init_method_std), + scaled_init_method=scaled_init_method_normal(config.init_method_std, + config.num_layers), + pre_process=self.pre_process, + post_process=self.post_process) + + self.initialize_word_embeddings(init_method_normal) + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def forward(self, input_ids, position_ids, attention_mask, labels=None, + tokentype_ids=None, inference_params=None): + + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + inference_params=inference_params) + + return post_language_model_processing( + lm_output, labels, + self.word_embeddings_weight(), + self.fp16_lm_cross_entropy) + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + # Save word_embeddings. + if self.post_process and not self.pre_process: + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Load word_embeddings. + if self.post_process and not self.pre_process: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) + if self._language_model_key in state_dict: + state_dict = state_dict[self._language_model_key] + self.language_model.load_state_dict(state_dict, strict=strict) + + diff --git a/training/benchmarks/gpt2/pytorch/model/models/language_model.py b/training/benchmarks/gpt2/pytorch/model/models/language_model.py new file mode 100644 index 000000000..d61b7cb19 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/language_model.py @@ -0,0 +1,502 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
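# Illustrative aside (not part of the patch): shape walk-through of
# post_language_model_processing above. lm_output is [s, b, h]; the tied
# word-embedding weight acts as a [vocab, h] output head; with labels=None the
# result is transposed to [b, s, vocab]. Sizes below are made up.
import torch

s, b, h, vocab = 4, 2, 8, 50
lm_output = torch.randn(s, b, h)
logit_weights = torch.randn(vocab, h)
output = torch.nn.functional.linear(lm_output, logit_weights)    # [s, b, vocab]
print(output.shape, output.transpose(0, 1).contiguous().shape)   # -> [b, s, vocab]
# (end of aside)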
+ +"""Transformer based language model.""" + +import torch +import torch.nn.functional as F + +from .enums import AttnMaskType, LayerType, ModelType +from .module import MegatronModule +from ..layers.transformer import ParallelTransformer +from model.layers.layers import VocabParallelEmbedding +from .utils import get_linear_layer +from .utils import init_method_normal, scaled_init_method_normal + +import config + +def get_language_model(num_tokentypes, add_pooler, + encoder_attn_mask_type, init_method=None, + scaled_init_method=None, add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + pre_process=True, post_process=True): + """Build language model and return along with the key to save.""" + + if init_method is None: + init_method = init_method_normal(config.init_method_std) + + if scaled_init_method is None: + scaled_init_method = scaled_init_method_normal(config.init_method_std, + config.num_layers) + + # Language model. + language_model = TransformerLanguageModel( + init_method, + scaled_init_method, + encoder_attn_mask_type, + num_tokentypes=num_tokentypes, + add_encoder=add_encoder, + add_decoder=add_decoder, + decoder_attn_mask_type=decoder_attn_mask_type, + add_pooler=add_pooler, + pre_process=pre_process, + post_process=post_process + ) + # key used for checkpoints. + language_model_key = 'language_model' + + return language_model, language_model_key + + +class Pooler(MegatronModule): + """Pooler layer. + + Pool hidden states of a specific token (for example start of the + sequence) and add a linear transformation followed by a tanh. + + Arguments: + hidden_size: hidden size + init_method: weight initialization method for the linear layer. + bias is set to zero. + """ + + def __init__(self, hidden_size, init_method): + super(Pooler, self).__init__() + self.dense = get_linear_layer(hidden_size, hidden_size, init_method) + + def forward(self, hidden_states, sequence_index=0): + # hidden_states: [s, b, h] + # sequence_index: index of the token to pool. + + # gather data along sequence dimensions + # same pooler is run on all tensor parallel nodes + + pooled = hidden_states[sequence_index, :, :] + pooled = self.dense(pooled) + pooled = torch.tanh(pooled) + return pooled + + +class Embedding(MegatronModule): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + hidden_size, + vocab_size, + max_sequence_length, + embedding_dropout_prob, + init_method, + num_tokentypes=0): + super(Embedding, self).__init__() + + self.hidden_size = hidden_size + self.init_method = init_method + self.num_tokentypes = num_tokentypes + + # Word embeddings (parallel). + self.word_embeddings = VocabParallelEmbedding( + vocab_size, self.hidden_size, + init_method=self.init_method, + params_dtype=config.params_dtype, + use_cpu_initialization=True, + perform_initialization=True + ) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding (serial). + self.position_embeddings = torch.nn.Embedding( + max_sequence_length, self.hidden_size) + self._position_embeddings_key = 'position_embeddings' + # Initialize the position embeddings. 
+ self.init_method(self.position_embeddings.weight) + + # Token type embedding. + # Add this as an optional field that can be added through + # method call so we can load a pretrain model without + # token types and add them as needed. + self._tokentype_embeddings_key = 'tokentype_embeddings' + if self.num_tokentypes > 0: + self.tokentype_embeddings = torch.nn.Embedding(self.num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + else: + self.tokentype_embeddings = None + + self.fp32_residual_connection = config.fp32_residual_connection + self.sequence_parallel = False + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + if self.num_tokentypes > 0: + self.tokentype_embeddings.weight.data.fill_(0) + self.tokentype_embeddings.weight.shared = True + + def add_tokentype_embeddings(self, num_tokentypes): + """Add token-type embedding. This function is provided so we can add + token-type embeddings in case the pretrained model does not have it. + This allows us to load the model normally and then add this embedding. + """ + if self.tokentype_embeddings is not None: + raise Exception('tokentype embeddings is already initialized') + if torch.distributed.get_rank() == 0: + print('adding embedding for {} tokentypes'.format(num_tokentypes), + flush=True) + self.num_tokentypes = num_tokentypes + self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes, + self.hidden_size) + # Initialize the token-type embeddings. + self.init_method(self.tokentype_embeddings.weight) + + def forward(self, input_ids, position_ids, tokentype_ids=None): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + if tokentype_ids is not None: + assert self.tokentype_embeddings is not None + embeddings = embeddings + self.tokentype_embeddings(tokentype_ids) + else: + assert self.tokentype_embeddings is None + + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + embeddings = self.embedding_dropout(embeddings) + + return embeddings + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._word_embeddings_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.num_tokentypes > 0: + state_dict_[self._tokentype_embeddings_key] \ + = self.tokentype_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Word embedding. + if self._word_embeddings_key in state_dict: + state_dict_ = state_dict[self._word_embeddings_key] + else: + # for backward compatibility. 
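# Illustrative aside (not part of the patch): the embedding sum in Embedding.forward
# above -- word plus position embeddings in [b, s, h], then transposed to the
# [s, b, h] layout the transformer expects. Sizes below are made up.
import torch

b, s, h, vocab, max_pos = 2, 5, 8, 100, 16
word_emb = torch.nn.Embedding(vocab, h)
pos_emb = torch.nn.Embedding(max_pos, h)
input_ids = torch.randint(0, vocab, (b, s))
position_ids = torch.arange(s).unsqueeze(0).expand(b, s)
embeddings = (word_emb(input_ids) + pos_emb(position_ids)).transpose(0, 1).contiguous()
print(embeddings.shape)   # torch.Size([5, 2, 8])
# (end of aside)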
+ state_dict_ = {} + for key in state_dict.keys(): + if 'word_embeddings' in key: + state_dict_[key.split('word_embeddings.')[1]] \ + = state_dict[key] + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + + # Position embedding. + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) + + # Tokentype embedding. + if self.num_tokentypes > 0: + state_dict_ = {} + if self._tokentype_embeddings_key in state_dict: + state_dict_ = state_dict[self._tokentype_embeddings_key] + else: + # for backward compatibility. + for key in state_dict.keys(): + if 'tokentype_embeddings' in key: + state_dict_[key.split('tokentype_embeddings.')[1]] \ + = state_dict[key] + if len(state_dict_.keys()) > 0: + self.tokentype_embeddings.load_state_dict(state_dict_, + strict=strict) + else: + print('***WARNING*** expected tokentype embeddings in the ' + 'checkpoint but could not find it', flush=True) + + +class TransformerLanguageModel(MegatronModule): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, + init_method, + output_layer_init_method, + encoder_attn_mask_type, + num_tokentypes=0, + add_encoder=True, + add_decoder=False, + decoder_attn_mask_type=AttnMaskType.causal, + add_pooler=False, + pre_process=True, + post_process=True): + super(TransformerLanguageModel, self).__init__() + + self.pre_process = pre_process + self.post_process = post_process + self.hidden_size = config.hidden_size + self.num_tokentypes = num_tokentypes + self.init_method = init_method + self.add_encoder = add_encoder + self.encoder_attn_mask_type = encoder_attn_mask_type + self.add_decoder = add_decoder + self.decoder_attn_mask_type = decoder_attn_mask_type + self.add_pooler = add_pooler + self.encoder_hidden_state = None + + # Embeddings. + if self.pre_process: + self.embedding = Embedding(self.hidden_size, + config.padded_vocab_size, + config.max_position_embeddings, + config.hidden_dropout, + self.init_method, + self.num_tokentypes) + self._embedding_key = 'embedding' + + # Transformer. + # Encoder (usually set to True, False if part of an encoder-decoder + # architecture and in encoder-only stage). + if self.add_encoder: + self.encoder = ParallelTransformer( + self.init_method, + output_layer_init_method, + self_attn_mask_type=self.encoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process + ) + self._encoder_key = 'encoder' + else: + self.encoder = None + + # Decoder (usually set to False, True if part of an encoder-decoder + # architecture and in decoder-only stage). + if self.add_decoder: + self.decoder = ParallelTransformer( + self.init_method, + output_layer_init_method, + layer_type=LayerType.decoder, + self_attn_mask_type=self.decoder_attn_mask_type, + pre_process=self.pre_process, + post_process=self.post_process) + self._decoder_key = 'decoder' + else: + self.decoder = None + + if self.post_process: + # Pooler. 
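# Illustrative aside (not part of the patch): the backward-compatibility branch in
# Embedding.load_state_dict above strips everything up to 'word_embeddings.' from
# old checkpoint keys. A minimal reproduction with made-up keys:
old = {'module.word_embeddings.weight': 'W', 'module.other.weight': 'X'}
new = {k.split('word_embeddings.')[1]: v for k, v in old.items() if 'word_embeddings' in k}
print(new)   # {'weight': 'W'}
# (end of aside)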
+ if self.add_pooler: + self.pooler = Pooler(self.hidden_size, self.init_method) + self._pooler_key = 'pooler' + + def set_input_tensor(self, input_tensor): + """ See megatron.model.transformer.set_input_tensor()""" + + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + + if self.add_encoder and self.add_decoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with both encoder and decoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + assert len(input_tensor) == 1, \ + 'input_tensor should only be length 1 for stage with only encoder' + self.encoder.set_input_tensor(input_tensor[0]) + elif self.add_decoder: + if len(input_tensor) == 2: + self.decoder.set_input_tensor(input_tensor[0]) + self.encoder_hidden_state = input_tensor[1] + elif len(input_tensor) == 1: + self.decoder.set_input_tensor(None) + self.encoder_hidden_state = input_tensor[0] + else: + raise Exception('input_tensor must have either length 1 or 2') + else: + raise Exception('Stage must have at least either encoder or decoder') + + def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask, + dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None, + enc_dec_attn_mask=None, tokentype_ids=None, + inference_params=None, + pooling_sequence_index=0, + enc_hidden_states=None, output_enc_hidden=False): + + # Encoder embedding. + if self.pre_process: + encoder_input = self.embedding(enc_input_ids, enc_position_ids, + tokentype_ids=tokentype_ids) + else: + encoder_input = None + + # Run encoder. + if enc_hidden_states is None: + if self.encoder is not None: + encoder_output = self.encoder( + encoder_input, + enc_attn_mask, + inference_params=inference_params) + else: + encoder_output = self.encoder_hidden_state + else: + encoder_output = enc_hidden_states.to(encoder_input.dtype) + + if self.post_process: + if self.add_pooler: + pooled_output = self.pooler(encoder_output, + pooling_sequence_index) + + # output_enc_hidden refers to when we just need the encoder's + # output. For example, it is helpful to compute + # similarity between two sequences by average pooling + if not self.add_decoder or output_enc_hidden: + if self.add_pooler and self.post_process: + return encoder_output, pooled_output + else: + return encoder_output + + # Decoder embedding. + if self.pre_process: + decoder_input = self.embedding(dec_input_ids, + dec_position_ids) + else: + decoder_input = None + + # Run decoder. 
+ decoder_output = self.decoder( + decoder_input, + dec_attn_mask, + encoder_output=encoder_output, + enc_dec_attn_mask=enc_dec_attn_mask, + inference_params=inference_params) + + if self.add_pooler and self.post_process: + return decoder_output, encoder_output, pooled_output + else: + return decoder_output, encoder_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """For easy load.""" + + state_dict_ = {} + if self.pre_process: + state_dict_[self._embedding_key] \ + = self.embedding.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.add_encoder: + state_dict_[self._encoder_key] \ + = self.encoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.post_process: + if self.add_pooler: + state_dict_[self._pooler_key] \ + = self.pooler.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + if self.add_decoder: + state_dict_[self._decoder_key] \ + = self.decoder.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Embedding. + if self.pre_process: + if self._embedding_key in state_dict: + state_dict_ = state_dict[self._embedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.embedding.load_state_dict(state_dict_, strict=strict) + + # Encoder. + if self.add_encoder: + if self._encoder_key in state_dict: + state_dict_ = state_dict[self._encoder_key] + # For backward compatibility. + elif 'transformer' in state_dict: + state_dict_ = state_dict['transformer'] + else: + # For backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + + # For backward compatibility. + state_dict_self_attention = {} + for key in state_dict_.keys(): + if '.attention.' in key: + state_dict_self_attention[key.replace(".attention.", + ".self_attention.")] = state_dict_[key] + else: + state_dict_self_attention[key] = state_dict_[key] + state_dict_ = state_dict_self_attention + + self.encoder.load_state_dict(state_dict_, strict=strict) + + # Pooler. + if self.post_process: + if self.add_pooler: + assert 'pooler' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.pooler.load_state_dict(state_dict[self._pooler_key], + strict=strict) + # Decoder. + if self.add_decoder: + assert 'decoder' in state_dict, \ + 'could not find data for pooler in the checkpoint' + self.decoder.load_state_dict(state_dict[self._decoder_key], + strict=strict) diff --git a/training/benchmarks/gpt2/pytorch/model/models/module.py b/training/benchmarks/gpt2/pytorch/model/models/module.py new file mode 100644 index 000000000..8b09c46a1 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/module.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
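# Illustrative aside (not part of the patch): the legacy-key rename in
# TransformerLanguageModel.load_state_dict above maps '.attention.' to
# '.self_attention.' so older checkpoints load into the current module names.
# With made-up keys:
old = {'layers.0.attention.query_key_value.weight': 'W', 'layers.0.mlp.dense.weight': 'M'}
new = {(k.replace('.attention.', '.self_attention.') if '.attention.' in k else k): v
       for k, v in old.items()}
print(sorted(new))   # ['layers.0.mlp.dense.weight', 'layers.0.self_attention.query_key_value.weight']
# (end of aside)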
+ +"""Megatron Module""" + +import torch +from torch.autograd import Variable +from torch.nn.parameter import Parameter + +_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) +_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) +_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor) + + +def param_is_not_shared(param): + return not hasattr(param, 'shared') or not param.shared + + +class MegatronModule(torch.nn.Module): + """Megatron specific extensions of torch Module with support + for pipelining.""" + + def __init__(self, share_word_embeddings=True): + super(MegatronModule, self).__init__() + self.share_word_embeddings = share_word_embeddings + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + """Use this function to override the state dict for + saving checkpoints.""" + return self.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def word_embeddings_weight(self): + if self.pre_process: + return self.language_model.embedding.word_embeddings.weight + else: + if not self.share_word_embeddings: + raise Exception('word_embeddings_weight() called for last ' + 'stage, but share_word_embeddings is false') + return self.word_embeddings.weight + + + def initialize_word_embeddings(self, init_method_normal): + if not self.share_word_embeddings: + raise Exception('initialize_word_embeddings() was called but ' + 'share_word_embeddings is false') + return + +def conversion_helper(val, conversion): + """Apply conversion to val. Recursively apply conversion if `val` + #is a nested tuple/list structure.""" + if not isinstance(val, (tuple, list)): + return conversion(val) + rtn = [conversion_helper(v, conversion) for v in val] + if isinstance(val, tuple): + rtn = tuple(rtn) + return rtn + + +def fp32_to_float16(val, float16_convertor): + """Convert fp32 `val` to fp16/bf16""" + def half_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, _FLOAT_TYPES): + val = float16_convertor(val) + return val + return conversion_helper(val, half_conversion) + + +def float16_to_fp32(val): + """Convert fp16/bf16 `val` to fp32""" + def float_conversion(val): + val_typecheck = val + if isinstance(val_typecheck, (Parameter, Variable)): + val_typecheck = val.data + if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)): + val = val.float() + return val + return conversion_helper(val, float_conversion) + + + +class Float16Module(MegatronModule): + + def __init__(self, module, args): + super(Float16Module, self).__init__() + + if args.fp16: + self.add_module('module', module.half()) + def float16_convertor(val): + return val.half() + elif args.bf16: + self.add_module('module', module.bfloat16()) + def float16_convertor(val): + return val.bfloat16() + else: + raise Exception('should not be here') + + self.float16_convertor = float16_convertor + + + def set_input_tensor(self, input_tensor): + return self.module.set_input_tensor(input_tensor) + + + def forward(self, *inputs, **kwargs): + inputs = fp32_to_float16(inputs, self.float16_convertor) + outputs = self.module(*inputs, **kwargs) + outputs = float16_to_fp32(outputs) + return outputs + + + def state_dict(self, prefix='', keep_vars=False): + return self.module.state_dict(prefix=prefix, keep_vars=keep_vars) + + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + return self.module.state_dict_for_save_checkpoint(prefix=prefix, + keep_vars=keep_vars) + + + def load_state_dict(self, state_dict, strict=True): + 
self.module.load_state_dict(state_dict, strict=strict) diff --git a/training/benchmarks/gpt2/pytorch/model/models/utils.py b/training/benchmarks/gpt2/pytorch/model/models/utils.py new file mode 100644 index 000000000..a32740c7f --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/model/models/utils.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Utilities for models.""" + +import math + +import torch +import config + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + + +def get_linear_layer(rows, columns, init_method): + """Simple linear layer with weight initialization.""" + layer = torch.nn.Linear(rows, columns) + if get_args().perform_initialization: + init_method(layer.weight) + with torch.no_grad(): + layer.bias.zero_() + return layer + + +class RMSNorm(torch.nn.Module): + """An alternate to layer normalization, without mean centering and the learned bias + paper: [Root mean square layer normalization](https://arxiv.org/abs/1910.07467) + code: https://github.com/facebookresearch/llama/blob/main/llama/model.py#L33 + """ + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = torch.nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight diff --git a/training/benchmarks/gpt2/pytorch/mpu/__init__.py b/training/benchmarks/gpt2/pytorch/mpu/__init__.py new file mode 100644 index 000000000..e889e1e7a --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/mpu/__init__.py @@ -0,0 +1,11 @@ +import torch + +_DATA_PARALLEL_GLOBAL_RANKS = None +if torch.distributed.is_initialized(): + _DATA_PARALLEL_GLOBAL_RANKS = [i for i in range(torch.distributed.get_world_size())] + +def get_data_parallel_rank(): + return torch.distributed.get_rank() + +def get_data_parallel_world_size(): + return torch.distributed.get_world_size() \ No newline at end of file diff --git a/training/benchmarks/gpt2/pytorch/optimizer/__init__.py b/training/benchmarks/gpt2/pytorch/optimizer/__init__.py new file mode 100644 index 000000000..1e8edec9e --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/__init__.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +from torch.optim import SGD +from torch.optim import AdamW as Adam + +from optimizer.distrib_optimizer import DistributedOptimizer +from optimizer.grad_scaler import ConstantGradScaler, DynamicGradScaler +from optimizer.optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer +import config + +def get_param_groups(module, + no_weight_decay_cond, + scale_lr_cond, + lr_mult): + """creates param groups based on weight decay condition (regularized vs non regularized) + and learning rate scale condition (args.lr vs lr_mult * args.lr) + scale_lr_cond is used during finetuning where head of the network requires a scaled + version of the base learning rate. 
+ """ + wd_no_scale_lr = [] + wd_scale_lr = [] + no_wd_no_scale_lr = [] + no_wd_scale_lr = [] + for name, param in module.named_parameters(): + if not param.requires_grad: + continue + + if no_weight_decay_cond is not None: + no_wd = no_weight_decay_cond(name, param) + else: + # do not regularize biases nor Norm parameters + no_wd = name.endswith(".bias") or len(param.shape) == 1 + + if scale_lr_cond is not None: + scale_lr = scale_lr_cond(name, param) + else: + scale_lr = False + + if not no_wd and not scale_lr: + wd_no_scale_lr.append(param) + elif not no_wd and scale_lr: + wd_scale_lr.append(param) + elif no_wd and not scale_lr: + no_wd_no_scale_lr.append(param) + else: + no_wd_scale_lr.append(param) + + param_groups = [] + if len(wd_no_scale_lr): + param_groups.append({'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0}) + if len(wd_scale_lr): + param_groups.append({'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult}) + if len(no_wd_no_scale_lr): + param_groups.append({'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0}) + if len(no_wd_scale_lr): + param_groups.append({'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult}) + + return param_groups + +def get_megatron_optimizer(model, + no_weight_decay_cond=None, + scale_lr_cond=None, + lr_mult=1.0): + # Base optimizer. + param_groups = get_param_groups(model, + no_weight_decay_cond, + scale_lr_cond, + lr_mult) + + if config.optimizer == 'adam': + optimizer = Adam(param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + betas=(config.adam_beta1, config.adam_beta2), + eps=config.adam_eps) + elif config.optimizer == 'sgd': + optimizer = SGD(param_groups, + lr=config.lr, + weight_decay=config.weight_decay, + momentum=config.sgd_momentum) + else: + raise Exception('{} optimizer is not supported.'.format( + config.optimizer)) + + # Determine whether the params have main-grad field. + params_have_main_grad = False + if config.DDP_impl == 'local': + params_have_main_grad = True + + # Mixed precision optimizer. + # - Note: both the Float16Optimizer and the DistributedOptimizer inherit + # from the MixedPrecisionOptimizer, which manages any optimizer where + # the model params and main params are distinct. + if config.fp16 or config.bf16 or config.use_distributed_optimizer: + + # Grad scaler: + # if loss-scale is provided, instantiate the constant scaler. + # if we are using fp16 and loss-scale is not present, use a + # dynamic scaler. + # otherwise we are running in bf16 with no loss-scale so + # leave it as None. + grad_scaler = None + + # Constant loss scale. + if config.loss_scale: + grad_scaler = ConstantGradScaler(config.loss_scale) + + # Dynamic loss scale. + else: + if config.fp16: + grad_scaler = DynamicGradScaler( + initial_scale=config.initial_loss_scale, + min_scale=config.min_loss_scale, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=config.loss_scale_window, + hysteresis=config.hysteresis) + + # Megatron optimizer. + opt_ty = DistributedOptimizer \ + if config.use_distributed_optimizer else \ + Float16OptimizerWithFloat16Params + return opt_ty(optimizer, + config.clip_grad, + params_have_main_grad, + config.use_contiguous_buffers_in_local_ddp, + config.fp16, + config.bf16, + config.params_dtype, + grad_scaler, + model) + + # FP32. 
+ + return FP32Optimizer(optimizer, config.clip_grad, + params_have_main_grad, + config.use_contiguous_buffers_in_local_ddp, + model) diff --git a/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py new file mode 100644 index 000000000..e2f053054 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/clip_grads.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Gradient clipping.""" + +import torch +from torch import inf + +from model.models.module import param_is_not_shared + + +def clip_grad_norm_fp32(parameters, grads_for_norm, + max_norm, norm_type=2): + """Clips gradient norm of an iterable of parameters whose gradients + are in fp32. + + This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and + added functionality to handle model parallel parameters. Note that + the gradients are modified in place. + + Arguments: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + grads_for_norm (Iterable[Tensor]): an iterable of Tensors or a single + Tensor that will be used for calculating the grad norm. + max_norm (float or int): max norm of the gradients + norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + model_parallel_group (group): given the nature of the distributed + optimizer, this is passed as an argument. + + Returns: + Total norm of the parameters (viewed as a single vector). + """ + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + if isinstance(grads_for_norm, torch.Tensor): + grads_for_norm = [grads_for_norm] + + # Grads. + grads = [] + for param in parameters: + if param.grad is not None: + assert param.grad.type() in ['torch.xpu.FloatTensor', 'torch.cuda.FloatTensor'] + grads.append(param.grad.detach()) + + # Norm parameters. + max_norm = float(max_norm) + norm_type = float(norm_type) + total_norm = 0.0 + + # Calculate norm. + if norm_type == inf: + total_norm = max(grad.abs().max() for grad in grads_for_norm) + total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) + total_norm = total_norm_cuda[0].item() + + else: + if norm_type == 2.0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + # Use apex's multi-tensor applier for efficiency reasons. + # Multi-tensor applier takes a function and a list of list + # and performs the operation on that list all in one kernel. + if grads_for_norm: + grad_norm = torch.cuda.FloatTensor([item.norm() for item in grads_for_norm]).norm() + else: + grad_norm = torch.cuda.FloatTensor([0]) + # Since we will be summing across data parallel groups, + # we need the pow(norm-type). + total_norm = grad_norm ** norm_type + + else: + for grad in grads_for_norm: + grad_norm = torch.norm(grad, norm_type) + total_norm += grad_norm ** norm_type + + # Sum across all model-parallel GPUs. + total_norm = total_norm.item() ** (1.0 / norm_type) + + # Scale. + clip_coeff = max_norm / (total_norm + 1.0e-6) + if clip_coeff < 1.0: + dummy_overflow_buf = torch.cuda.IntTensor([0]) + grads = [item * clip_coeff for item in grads] + + return total_norm diff --git a/training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py b/training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py new file mode 100644 index 000000000..50125b83c --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/distrib_optimizer.py @@ -0,0 +1,1011 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
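# Illustrative aside (not part of the patch): the clipping math in clip_grad_norm_fp32
# above, reduced to a CPU-only sketch -- a global 2-norm over all grads (norm of
# per-tensor norms), then scaling by max_norm / (total_norm + 1e-6) when that
# coefficient is below 1.0.
import torch

grads = [torch.randn(10), torch.randn(5)]
total_norm = torch.norm(torch.stack([g.norm(2.0) for g in grads]), 2.0).item()
max_norm = 1.0                                   # made-up clip threshold
clip_coeff = max_norm / (total_norm + 1.0e-6)
if clip_coeff < 1.0:
    grads = [g * clip_coeff for g in grads]
print(total_norm, torch.norm(torch.stack([g.norm(2.0) for g in grads]), 2.0).item())
# (end of aside)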
+ +"""Megatron distributed optimizer.""" +import math + +import torch +from torch.optim import AdamW as Adam + +from mpu import get_data_parallel_world_size, get_data_parallel_rank +from optimizer.optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper +import config + +class Range: + """ + A range represents a start and end points for indexing a shard + from a full tensor. + """ + def __init__(self, start, end): + self.start = start + self.end = end + self.size = end - start + def normalize(self, start = 0): + return Range(start, start + self.size) + def __str__(self): + return "%d,%d [%d]" % (self.start, self.end, self.size) + def __len__(self): + return self.end - self.start + + +class DistributedOptimizer(MixedPrecisionOptimizer): + """Distributed optimizer, for all data types (fp16, bf16, and fp32). + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. + """ + + @classmethod + def build_model_gbuf_param_range_map(cls, model, dtype, gbuf_world_range): + """ + Build mapping from param reference to grad buffer shard ranges. + + This method builds a mapping from parameter references to grad + buffer shard ranges, specific to each data-parallel (DP) rank's + set of 'owned' parameters. Each grad buffer (padded to be an even + multiple of DP-world-size) is conceptually divided into DP-world-size + contiguous regions, where each DP rank 'owns' a contiguous regions. + Ownership in this sense means DP rank is responsible for reducing + the relevant subset of grads, and updating the relevant subset of + params. + + This conceptual partitioning of the grad buffer does NOT respect + parameter boundaries, and as such it is assumed that each created + range references a shard (or subset) of the full parameter. It is + easiest to think of each DP rank as operating (i.e., reducing, + gathering) purely on views into the grad buffer, for all model-to- + main & main-to-model operations. + + This method creates three ranges: + - The param's range within the entire grad buffer (i.e., world index). + - The param's range within the DP rank's local view of the grad buffer. + - The param's range within itself (i.e., its shard). + """ + + # Param range map. 
+ param_world_index_map = model._grad_buffer_param_index_map[dtype] + param_range_map = {} + for param, param_world_indexes in param_world_index_map.items(): + + # Param range. + param_world_start, param_world_end = param_world_indexes + param_local_start = max( + 0, + param_world_start - gbuf_world_range.start) + param_local_end = min( + gbuf_world_range.size, + param_world_end - gbuf_world_range.start) + + # Add param, if within local gbuf range. + if param_local_end > param_local_start: + param_local_range = Range(param_local_start, param_local_end) + param_world_range = param_local_range.normalize( + param_local_start + gbuf_world_range.start) + sub_param_start = max(0, gbuf_world_range.start-param_world_start) + sub_param_range = param_local_range.normalize(sub_param_start) + param_range_map[param] = { + "gbuf_world" : param_world_range, + "gbuf_local" : param_local_range, + "param" : sub_param_range, + } + + return param_range_map + + + @classmethod + def build_model_gbuf_range(cls, model, dtype): + """ + Build mapping between params and their grad buffers. + + This method does the initial setup for the method above. This setup + includes determining the shard ranges into the DDP's grad buffer for + each data-parallel (DP) rank. Each DP rank keeps range info for + all other DP ranks, for the purpose of creating args for + reduce-scatter and all-gather. + """ + + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_world_size = mpu.get_data_parallel_world_size() + + # Grad buffer range. + grad_buffer = model._grad_buffers[dtype] + gbuf_size = grad_buffer.numel + max_gbuf_range_size = int(math.ceil(gbuf_size / data_parallel_world_size)) + + # All world ranges. (i.e., across all data parallel ranks) + gbuf_world_all_ranges = [] + for r in range(data_parallel_world_size): + gbuf_world_start = r * max_gbuf_range_size + gbuf_world_end = min(gbuf_size, gbuf_world_start+max_gbuf_range_size) + gbuf_world_range = Range(gbuf_world_start, gbuf_world_end) + gbuf_world_all_ranges.append(gbuf_world_range) + + # Local DP's ranges. + gbuf_world_range = gbuf_world_all_ranges[data_parallel_rank] + gbuf_local_range = gbuf_world_range.normalize() + + # Get each param's ranges. + param_range_map = cls.build_model_gbuf_param_range_map(model, + dtype, + gbuf_world_range) + + # Group into dict. + data = { + "local" : gbuf_local_range, + "world" : gbuf_world_range, + "world_all" : gbuf_world_all_ranges, + "param_map" : param_range_map, + "max_range_size" : max_gbuf_range_size, + } + + return data + + + @classmethod + def build_model_gbuf_range_map(cls, model): + """ + Create param-to-grad-buffer mappings, for grad buffer data types + within a specific virtual model. + """ + return { + dtype : cls.build_model_gbuf_range(model, dtype) + for dtype in model._grad_buffers + } + + + @classmethod + def build_model_param_gbuf_map(cls, model_gbuf_ranges): + """ + Create a reverse of the model_gbuf_ranges, for referencing in + opposite direction. + """ + param_gbuf_map = {} + for model_index, model_gbuf_range_map in enumerate(model_gbuf_ranges): + for dtype, gbuf_range_map in model_gbuf_range_map.items(): + for param, param_range_map in gbuf_range_map["param_map"].items(): + param_gbuf_map[param] = (model_index, dtype) + return param_gbuf_map + + + @classmethod + def build_optimizer_group_ranges(cls, param_groups, model_gbuf_ranges): + """ + Create optimizer groups. 
+ + Given the set of parameter shard ranges that are owned by the current + data-parallel (DP) rank, gather the set of parameters that will be + used (in the method below) to create the current DP's optimizer + groups. + """ + + num_groups = len(param_groups) + + # Param group map. + # World param group map. + # - Store a mapping of for all parameters + # across all DP ranks. This is necessary because it is our first + # cross reference between the DDP mappings and the optimizer group + # parameters. This mapping only for use in the next step of building + # the local mapping over this DP rank's parameters. + world_param_group_map = {} + for group_index, group in enumerate(param_groups): + for param in group["params"]: + assert param.requires_grad + world_param_group_map[param] = group_index + + # Optimizer group ranges & param-group mapping. + # - Build a mapping from groups to their contained parameters, and also + # from parameters to their containing group index and order within + # the group. The group index and order are particularly important for + # saving and loading checkpoints. + local_param_group_map = {} + group_ranges = [ {"params": []} for _ in param_groups ] + for model_gbuf_range_map in model_gbuf_ranges: + for dtype, gbuf_range_map in model_gbuf_range_map.items(): + for param in gbuf_range_map["param_map"]: + group_index = world_param_group_map[param] + group_range = group_ranges[group_index] + group_range["params"].append(param) + local_param_group_map[param] = \ + (group_index, len(group_range["params"]) - 1) + + # Squeeze zero-size group ranges. + for group_index, group_range in enumerate(group_ranges): + group_range["orig_group"] = param_groups[group_index] + group_range["orig_group_idx"] = param_groups[group_index] + + return local_param_group_map, group_ranges + + + @classmethod + def build_model_and_main_param_groups(cls, + model_gbuf_ranges, + param_gbuf_map, + opt_group_ranges): + """ + Create main parameter groups needed for the optimizer step. + + These groups encompass both: 1) groups used by this class, for + reducing/gather, and 2) groups used by the inner optimizer for the + parameter update. Given that the conceptual grad buffer partitioning + (created in earlier method) doesn't respect parameter boundaries, + the optimizer operates on shards of the model parameters, rather than + the full parameters. + """ + + # Parameter groups: + # model_float16_groups: original float16 parameters + # model_fp32_groups: original fp32 parameters + # shard_float16_groups: shards of original float16 parameters + # shard_fp32_groups: shards of original fp32 parameters + # shard_fp32_from_float16_groups: fp32 copy of float16 parameters + model_float16_groups = [] + model_fp32_groups = [] + shard_float16_groups = [] + shard_fp32_groups = [] + shard_fp32_from_float16_groups = [] + + # Allocate (or slice) each group's param shard. + for group_index, group_range in enumerate(opt_group_ranges): + + # Params of this group. 
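# Illustration of the sharding pattern used in the loop below (hypothetical shapes, not
# from this patch): for a 10x4 fp16 parameter flattened to 40 elements with
# param_range = Range(8, 24), the rank keeps
#     shard_model_param = model_param.detach().view(-1)[8:24]   # fp16 view, 16 elements
#     shard_main_param  = shard_model_param.clone().float()     # fp32 master shard
# and only the fp32 shards are handed to the inner optimizer further down.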
+ model_float16_params_this_group = [] + model_fp32_params_this_group = [] + shard_float16_params_this_group = [] + shard_fp32_params_this_group = [] + shard_fp32_from_float16_params_this_group = [] + model_float16_groups.append(model_float16_params_this_group) + model_fp32_groups.append(model_fp32_params_this_group) + shard_float16_groups.append(shard_float16_params_this_group) + shard_fp32_groups.append(shard_fp32_params_this_group) + shard_fp32_from_float16_groups.append( + shard_fp32_from_float16_params_this_group) + + for model_param in group_range["params"]: + + assert model_param.requires_grad + + model_index, dtype = param_gbuf_map[model_param] + gbuf_range = model_gbuf_ranges[model_index][dtype] + param_range = gbuf_range["param_map"][model_param]["param"] + + # fp16, bf16 params. + if model_param.type() in ['torch.cuda.HalfTensor', + 'torch.cuda.BFloat16Tensor', + 'torch.xpu.HalfTensor', + 'torch.xpu.BFloat16Tensor']: + + # Clone model -> main. + shard_model_param = model_param.detach().view(-1) \ + [param_range.start:param_range.end] + shard_main_param = shard_model_param.clone().float() + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + shard_main_param.shared = model_param.shared + + # Add to group. + model_float16_params_this_group.append(model_param) + shard_float16_params_this_group.append(shard_model_param) + shard_fp32_from_float16_params_this_group.append(shard_main_param) + + # fp32 params. + elif model_param.type() in ['torch.cuda.FloatTensor', + 'torch.xpu.FloatTensor']: + shard_model_param = model_param.view(-1) \ + [param_range.start:param_range.end] + model_fp32_params_this_group.append(model_param) + shard_fp32_params_this_group.append(shard_model_param) + if hasattr(model_param, 'shared'): + shard_model_param.shared = model_param.shared + + else: + raise TypeError('Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor, or ' + 'torch.xpu.FloatTensor, or ' + 'torch.xpu.HalfTensor, or ' + 'torch.xpu.BFloat16Tensor. ' + 'Received {}'.format(model_param.type())) + + # Update optimizer's params. + group_range["orig_group"]["params"] = [ + *shard_fp32_params_this_group, + *shard_fp32_from_float16_params_this_group, + ] + + return ( + model_float16_groups, + model_fp32_groups, + shard_float16_groups, + shard_fp32_groups, + shard_fp32_from_float16_groups, + ) + + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models): + """ + See top of class definition for argument descriptions. + + The steps in this method create the core mapping between DDP grad + buffers, parameters, and parameter shard ranges, that is needed for + converting between model param indexes and main parameter shard + indexes. This method also updates the optimizer parameter groups + with the newly created shards. + """ + + super().__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models) + + # Verify that contiguous buffers are being used. + # - Note: this should already be checked in arguments.py. + assert use_contiguous_buffers_in_local_ddp + assert isinstance(optimizer, Adam), \ + "Only Adam currently supported, due to checkpointing requirements." + + # Model grad buffer ranges. 
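# The remainder of __init__ wires the pieces together in order:
#   1. per-model grad-buffer shard ranges        (build_model_gbuf_range_map)
#   2. reverse param -> (model, dtype) lookup    (build_model_param_gbuf_map)
#   3. optimizer group ranges over owned shards  (build_optimizer_group_ranges)
#   4. model/main shard parameter groups         (build_model_and_main_param_groups)
#   5. per-dtype param buffers sized like the grad buffers (see the FIXME about
#      torch.frombuffer), after which the inner optimizer's param_groups are
#      re-pointed at the shard groups.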
+ self.model_gbuf_ranges = [] + for model_index, model in enumerate(self.models): + self.model_gbuf_ranges.append(self.build_model_gbuf_range_map(model)) + self.model_param_gbuf_map = \ + self.build_model_param_gbuf_map(self.model_gbuf_ranges) + + # Optimizer ranges. + self.model_param_group_index_map, self.opt_group_ranges = \ + self.build_optimizer_group_ranges(self.optimizer.param_groups, + self.model_gbuf_ranges) + + # Allocate main param shards. + ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, + self.shard_fp32_groups, + self.shard_fp32_from_float16_groups, + ) = self.build_model_and_main_param_groups(self.model_gbuf_ranges, + self.model_param_gbuf_map, + self.opt_group_ranges) + + # Initialize param buffers. + # - These are views on the DDP model's grad buffers, that share + # storage & have their own dtype. This is safe because the param + # dtype size is always <= grad dtype size. + self.param_buffers = [] + for model_index, model in enumerate(self.models): + current_param_buffers = {} + for dtype, grad_buffer in model._grad_buffers.items(): + + # Handle older/newer method for getting untyped storage. + try: + storage = grad_buffer.data.storage()._untyped() + except: + storage = grad_buffer.data.storage().untyped() + + # FIXME: xpu do not support init torch.Tensor from XPU-typed tensor, use torch.frombuffer instead + # Typed param buffer. + #param_buffer = torch.tensor( + # storage, + # dtype = params_dtype, + # device = grad_buffer.data.device) + param_buffer = torch.frombuffer(grad_buffer.data.cpu().numpy(), dtype = params_dtype).to(grad_buffer.data.device) + param_buffer = param_buffer[:grad_buffer.numel_padded] + current_param_buffers[dtype] = param_buffer + self.param_buffers.append(current_param_buffers) + + # Update optimizer groups. + # - Also, leverage state_dict() and load_state_dict() to + # recast preexisting per-param state tensors. + self.optimizer.param_groups = \ + [ g["orig_group"] for g in self.opt_group_ranges ] + self.optimizer.load_state_dict(self.optimizer.state_dict()) + + + def get_model_param_range_map(self, param): + """ + Given a model param, get the index sub-range of the param that this + data-parallel rank owns. + """ + model_index, dtype = self.model_param_gbuf_map[param] + gbuf_range_map = self.model_gbuf_ranges[model_index][dtype] + param_range_map = gbuf_range_map["param_map"][param] + return param_range_map + + + def get_model_parallel_group(self): + """ + With the distributed optimizer, the model parallel group is the + entire world. + """ + return None + + + def state_dict(self): + """ + The state dict contains all non-DP-rank-dependent (i.e., non-parameter- + related) optimizer variables. The returned state dict can be stored in + the standard model/RNG checkpoint file. The parameter and dependent + optimizer state (e.g., exp_avg, exp_avg_sq) are stored in a separate + checkpoint file by calling 'save_parameter_state()'. + """ + + state_dict = {} + + # Optimizer state (do not store parameter state here). + state_dict['optimizer'] = { + k : v + for k, v in self.optimizer.state_dict().items() + if k != "state" + } + for param_group in state_dict["optimizer"]["param_groups"]: + del param_group["params"] + + # Grad scaler state. + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + + return state_dict + + + def load_state_dict(self, state_dict): + """Load the state dict. + + As detailed in state_dict(), the state dict contains all non- + parameter-related variables. 
This method is notably longer than + state_dict(), because the Torch optimizers state has yet to be + allocated at this point, and so we must do a cross referencing between + the optimizers state (and the ordering it expects for parameter state) + and this DP rank's shards. The optimizer at this point does not contain + any tensor dimension information, so we must get these dimensions from + the DP shards mapped during DistributedOptimizer.__init__(). + + The tensor parameter state is loaded via load_parameter_state(), and + so this method also must populate the loaded state dict with dummy + tensor data (i.e., via torch.empty() below). This will be overwritten + during load_parameter_state(). + + ** Note: Torch optimizer's state structure. ** + The Torch optimizer stores its state in two levels. The top level is a + list of groups, where each group contains a list of integer indexes + (corresponding to parameters) that index into a master parameter list + that is shared by all groups. As such, three values are necessary for + maintaining this ordering: + + - group_index : The group to which a parameter belongs. + - group_order : The index of a parameter within its group. + - state_order : The index of a parameter within the shared parameter + list. + """ + + # Get the Torch optimizer's state dict. + # - This 'inner' optimizer at this point is unallocated, and only + # contains an integer odering of parameters within each group, and + # the ordering of parameters within its flattened parameter state + # list. + inner_state_dict = self.optimizer.state_dict() + state_dict_param_groups = [{ + **group, + "params" : list(inner_state_dict["param_groups"][idx]["params"]), + } for idx, group in enumerate(state_dict["optimizer"]["param_groups"])] + + # Allocate 'dummy' data for optimizer state (i.e., torch.empty() below) + # - Real data is overwritten during load_parameter_state(). + state_dict_state = [] + for gbuf_range_maps in self.model_gbuf_ranges: + for gbuf_range_map in gbuf_range_maps.values(): + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Get parameter ordering information (see method docstring + # for details). + group_index, group_order = \ + self.model_param_group_index_map[model_param] + state_order = inner_state_dict["param_groups"] \ + [group_index]["params"][group_order] + + # Allocate dummy tensors. + numel = len(param_range_map["gbuf_world"]) + init_shard = lambda : torch.empty( + (numel,), + dtype=torch.float32, + device=torch.cuda.current_device()) + + state_dict_state.append((state_order, { + "exp_avg" : init_shard(), + "exp_avg_sq" : init_shard(), + })) + + # Sort by state order (see method docstring for details). + state_dict_state.sort(key = lambda s : s[0]) + state_dict_state = {s[0]:s[1] for s in state_dict_state} + + # Optimizer. + self.optimizer.load_state_dict({ + "state" : state_dict_state, + "param_groups" : state_dict_param_groups, + }) + + # Grad scaler. + if 'grad_scaler' in state_dict: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + + + def save_parameter_state(self, filename): + """Save parameter state (i.e., parameter & optimizer tensors). + + This method performs three steps: + - For each DP rank, copy param & optimizer shards to contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + - Gather contiguous buffers on DP rank 0 and concatenate to world + buffers. + - Save world buffers to disk (i.e., distrib_opt.pt). 
+ """ + + # Data parallelism variables. + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() + data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) + + # Collect param states. + state = {} + for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): + + # Iterate grad buffers (by data type). + dtype_state = {} + assert len(gbuf_range_maps) == 1, "single dtype supported, for now." + for dtype, gbuf_range_map in gbuf_range_maps.items(): + + # Compute local DP contiguous shard's size. + model = self.models[model_idx] + gbuf_world_numel = model._grad_buffers[dtype].numel_padded + gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) + local_shards = {key:torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq")} + + # Build contiguous DP rank shards (for param + optim states). + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = \ + self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups \ + [group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param" : main_param, + **optim_state, + } + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + local_shards[key][gbuf_local_start:gbuf_local_end] \ + .data.copy_(tensors[key].detach().cpu()) + + # Gather contiguous shards on DP rank 0. + world_tensors = {} + for key, send_tensor in local_shards.items(): + + # Gather tensor list. + if data_parallel_rank == 0: + recv_tensors = [torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for _ in range(data_parallel_world_size)] + else: + recv_tensors = None + + # Gather. + torch.distributed.gather( + send_tensor, + recv_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Concatenate. + if data_parallel_rank == 0: + world_tensors[key] = torch.cat(recv_tensors) + + # Collect world state. + dtype_state[dtype] = world_tensors + state[model_idx] = dtype_state + + # Save param state. + if data_parallel_rank == 0: + torch.save(state, filename) + + + def load_parameter_state(self, filename): + """Load parameter state (i.e., parameter & optimizer tensors). + + This method performs the reverse of save_parameter_state(): + - Load world buffers from disk (i.e., distrib_opt.pt). + - Scatter contiguous buffers from DP rank 0 to each DP rank (each DP + rank receives its relevant subset of the world buffers). + - For each DP rank, copy param & optimizer shards from contiguous CPU + buffers. (e.g., one buffer each for main_param, exp_avg, and + exp_avg_sq). + """ + + # Data parallelism variables. + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group_gloo = mpu.get_data_parallel_group_gloo() + data_parallel_global_ranks = list(mpu._DATA_PARALLEL_GLOBAL_RANKS) + + # Load on DP rank 0. + if data_parallel_rank == 0: + loaded_state = torch.load(filename) + + # Scatter tensors to all DP ranks. + for model_idx, gbuf_range_maps in enumerate(self.model_gbuf_ranges): + for dtype, gbuf_range_map in gbuf_range_maps.items(): + + # Compute local DP contiguous shard's size. 
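# Illustrative sizes (hypothetical, not from this patch): with a padded grad buffer of
# gbuf_world_numel = 1024 elements and data_parallel_world_size = 4, every rank receives
# a 256-element fp32 shard for each of "param", "exp_avg" and "exp_avg_sq"; the padding
# applied in distributed.py is what makes the division below exact.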
+ model = self.models[model_idx] + gbuf_world_numel = model._grad_buffers[dtype].numel_padded + gbuf_local_numel = int(gbuf_world_numel/data_parallel_world_size) + + # Contiguous local shards (received from DP rank 0). + local_shards = {key:torch.empty((gbuf_local_numel,), + dtype=torch.float32, + device="cpu") + for key in ("param", "exp_avg", "exp_avg_sq")} + + # Scatter local shards from DP rank 0. + for key, recv_tensor in local_shards.items(): + + # Scatter tensor list. + if data_parallel_rank == 0: + world_tensor = loaded_state[model_idx][dtype][key] + gbuf_start_idxs = \ + list(range(0, gbuf_world_numel, gbuf_local_numel)) + send_tensors = [world_tensor[i:(i+gbuf_local_numel)] + for i in gbuf_start_idxs] + else: + send_tensors = None + + # Scatter. + torch.distributed.scatter( + recv_tensor, + send_tensors, + data_parallel_global_ranks[0], + data_parallel_group_gloo, + ) + + # Copy local contiguous shards to param/optim shards. + for model_param, param_range_map in \ + gbuf_range_map["param_map"].items(): + + # Main param & optimizer states. + group_index, group_order = \ + self.model_param_group_index_map[model_param] + main_param = self.optimizer.param_groups \ + [group_index]["params"][group_order] + optim_state = self.optimizer.state[main_param] + + tensors = { + "param" : main_param, + **optim_state, + } + + # Copy states into contiguous shard. + gbuf_local_start = param_range_map["gbuf_local"].start + gbuf_local_end = param_range_map["gbuf_local"].end + for key in local_shards: + tensors[key].data.copy_( + local_shards[key][gbuf_local_start:gbuf_local_end]) + + + def zero_grad(self, set_to_none=True): + """ + Zero grads. + + We only need to zero the model related parameters, i.e., + model_float16_groups & model_fp32_groups. We additionally zero + the remaining groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point. + """ + for groups in ( + self.model_float16_groups, + self.model_fp32_groups, + self.shard_float16_groups, # grad empty/unused here? + self.shard_fp32_groups, # throws grad-access warning + self.shard_fp32_from_float16_groups): + for group in groups: + _zero_grad_group_helper(group, set_to_none) + + + @staticmethod + def get_model_buffer_dp_views(model_buffers): + """ + Get shard views of each of the DDP's param/grad buffers. + + In this nested list, the top level is grouped by the virtual model + index and the buffer's data type. The sub-level is a list of + shards of that buffer, where each shard in the list represents + a contiguous view of the buffer, that is owned by a data-parallel + rank. The shard boundary does not respect parameter boundaries, and + so the elements of some parameters are split across data parallel + ranks. + + Additionally, return references to the entire buffers, for use + in _reduce_scatter_base and _all_gather_base. + """ + + data_parallel_world_size = mpu.get_data_parallel_world_size() + + # Buffer views. 
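# Example of the view layout built below (hypothetical numbers): a 1024-element buffer
# with data_parallel_world_size = 4 gives shard_size = 256 and views
#     buf[0:256], buf[256:512], buf[512:768], buf[768:1024]
# where view r is the slice reduce-scattered into, or all-gathered from, by
# data-parallel rank r.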
+ view_items = [] + for model_index, buffers in enumerate(model_buffers): + for dtype, buf in buffers.items(): + + assert buf.numel() % data_parallel_world_size == 0 + shard_size = int(buf.numel() / data_parallel_world_size) + buf_views = [buf[(r*shard_size):((r+1)*shard_size)] + for r in range(data_parallel_world_size)] + view_items.append((model_index, dtype, buf, buf_views)) + + return view_items + + + def get_model_grad_buffer_dp_views(self): + return self.get_model_buffer_dp_views([ + {dtype : mem_buffer.data} + for model in self.models + for dtype, mem_buffer in model._grad_buffers.items()]) + + + def get_model_param_buffer_dp_views(self): + return self.get_model_buffer_dp_views(self.param_buffers) + + + def reduce_model_grads(self, args, timers): + """ + Reduce-scatter model grads. + + The DDP's grad buffer is used for the reduce-scatter, and thus no + tensors are dynamically allocated. + + Note: this is a different order of reduction, versus the non- + distributed optimizer, which reduces: 1) layernorm grads, 2) all + grads, 3) embedding grads. + """ + + # All-reduce layer-norm grads (for sequence parallelism). + timers('layernorm-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time) + self.allreduce_layernorm_grads(args) + timers('layernorm-grads-all-reduce').stop() + + # All-reduce embedding grads. + timers('embedding-grads-all-reduce', log_level=1).start( + barrier=config.barrier_with_L1_time) + self.allreduce_embedding_grads(args) + timers('embedding-grads-all-reduce').stop() + + # Reduce-scatter setup. + timers('grads-reduce-scatter', log_level=1).start( + barrier=config.barrier_with_L1_time) + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_world_size = mpu.get_data_parallel_world_size() + data_parallel_group = mpu.get_data_parallel_group() + + # Scale grad buffers by '1 / data_parallel_world_size'. + for model in self.models: + for dtype, gbuf in model._grad_buffers.items(): + gbuf.data /= data_parallel_world_size + + # Reduce-scatter all grads. + gbuf_view_items = self.get_model_grad_buffer_dp_views() + for index, (model_index, dtype, gbuf, gbuf_views) \ + in enumerate(gbuf_view_items): + + torch.distributed._reduce_scatter_base( + gbuf_views[data_parallel_rank], + gbuf, + group = data_parallel_group, + ) + + timers('grads-reduce-scatter').stop() + + + def gather_model_params(self, args, timers): + """ + All-gather updated model params. + + The DDP's param buffer is used for the all-gather, and thus no + tensors are dynamically allocated. After the all-gather, the params + can be copied from the param buffer to the param. + """ + + timers('params-all-gather', log_level=1).start( + barrier=config.barrier_with_L1_time) + + data_parallel_rank = mpu.get_data_parallel_rank() + data_parallel_group = mpu.get_data_parallel_group() + + # All-gather updated main params. + # - All param buffer views are guaranteed to have the same num elements + # across all data parallel ranks, due to grad buffer padding that is + # done in distributed.py, and extended to the param buffers. Thus, + # all sub-views will have consistent start/end indexes across data + # parallel ranks. + pbuf_view_items = self.get_model_param_buffer_dp_views() + for index, (model_index, dtype, pbuf, pbuf_views) \ + in enumerate(pbuf_view_items): + + torch.distributed._all_gather_base( + pbuf, + pbuf_views[data_parallel_rank], + group = data_parallel_group, + ) + + # Copy from param buffer to each param. 
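# After the all-gather above, every rank holds the complete updated parameter values in
# self.param_buffers, so the copy below is purely local: each parameter reads back its
# own (buf_start, buf_end) slice recorded in model._grad_buffer_param_index_map when the
# grad buffer was laid out.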
+ for model_id, model in enumerate(self.models): + for dtype, param_map in model._grad_buffer_param_index_map.items(): + for param, (buf_start, buf_end) in param_map.items(): + param_buf = self.param_buffers[model_id][dtype] + param_buf_shard = param_buf[buf_start:buf_end] + param.view(-1).detach().copy_(param_buf_shard) + + timers('params-all-gather').stop() + + + def _collect_main_grad_data_for_unscaling(self): + """ + Note: this should be equivalent to the float-16 optimizer's method, + but writtent differently, so the two should be combined. + """ + return [ + param.grad.data + for group in self.optimizer.param_groups + for param in group["params"] + ] + + + def _get_model_and_main_params_data_float16(self): + """ + Get aligned list of model and main params. + """ + model_data = [] + main_data = [] + for model_group, main_group in zip(self.shard_float16_groups, + self.shard_fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + + def _copy_model_grads_to_main_grads(self): + """ + Copy model grads to main grads. + + Since this step follows a reduce-scatter through the DDP's grad + buffer, this method is responsible for copying the updated grads + from the grad buffer to the main shard's grad field. + """ + + # Utility method for copying group grads. + def copy_group_grads(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, + shard_main_groups): + for model_param, shard_main_param in zip(model_group, + shard_main_group): + + param_range_map = self.get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + model_grad = model_param.main_grad + shard_model_grad = model_grad.view(-1) \ + [param_range.start:param_range.end] + shard_main_param.grad = shard_model_grad.float() + + # Copy model groups to shard groups. + copy_group_grads(self.model_float16_groups, + self.shard_fp32_from_float16_groups) + copy_group_grads(self.model_fp32_groups, + self.shard_fp32_groups) + + + def _copy_main_params_to_model_params(self): + """ + Copy main params to model params. + + Since this step is followed by an all-gather through the DDP's grad + buffer, this method is responsible for copying the updated params + from the main shards into the correct position in the grad buffer. + """ + + # Utility method for copying group params. + def copy_group_params(shard_main_groups, model_groups): + for shard_main_group, model_group in zip(shard_main_groups, + model_groups): + for shard_main_param, model_param in zip(shard_main_group, + model_group): + + param_range_map = self.get_model_param_range_map(model_param) + world_range = param_range_map["gbuf_world"] + + assert world_range.size == shard_main_param.nelement() + + model_id, dtype = self.model_param_gbuf_map[model_param] + model_param_buffer = self.param_buffers[model_id][dtype] + + shard_model_param = model_param_buffer.view(-1) \ + [world_range.start:world_range.end] + + shard_model_param.data.copy_(shard_main_param) + + # Copy shard groups to model groups. + copy_group_params(self.shard_fp32_from_float16_groups, + self.model_float16_groups) + copy_group_params(self.shard_fp32_groups, + self.model_fp32_groups) + + + def _copy_model_params_to_main_params(self): + """ + Copy model params to main params. + + During finetuning, this method is used to reload the main params from + the model params. 
This copy does not make use of the grad buffer as + an intermediary. + """ + + # Utility method for copying group params. + def copy_group_params(model_groups, shard_main_groups): + for model_group, shard_main_group in zip(model_groups, + shard_main_groups): + for model_param, shard_main_param in zip(model_group, + shard_main_group): + + param_range_map = self.get_model_param_range_map(model_param) + param_range = param_range_map["param"] + assert param_range.size == shard_main_param.nelement() + + shard_model_param = model_param.view(-1) \ + [param_range.start:param_range.end] + shard_main_param.data.copy_(shard_model_param) + + # Copy model groups to shard groups. + copy_group_params(self.model_float16_groups, + self.shard_fp32_from_float16_groups) + copy_group_params(self.model_fp32_groups, + self.shard_fp32_groups) diff --git a/training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py b/training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py new file mode 100644 index 000000000..a45225aed --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/grad_scaler.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Megatron grad scaler.""" + +from abc import ABC +from abc import abstractmethod + +import torch + +class MegatronGradScaler(ABC): + + def __init__(self, initial_scale): + """Initialize scale value with the input initial scale.""" + assert initial_scale > 0.0 + self._scale = torch.cuda.FloatTensor([initial_scale]) + + @property + def scale(self): + return self._scale + + @property + def inv_scale(self): + return self._scale.double().reciprocal().float() + + @abstractmethod + def update(self, found_inf): + pass + + @abstractmethod + def state_dict(self): + pass + + @abstractmethod + def load_state_dict(self, state_dict): + pass + + + +class ConstantGradScaler(MegatronGradScaler): + + def update(self, found_inf): + pass + + def state_dict(self): + return dict() + + def load_state_dict(self, state_dict): + pass + + + +class DynamicGradScaler(MegatronGradScaler): + + def __init__(self, initial_scale, min_scale, + growth_factor, backoff_factor, + growth_interval, hysteresis): + """"Grad scaler with dynamic scale that gets adjusted + during training.""" + super(DynamicGradScaler, self).__init__(initial_scale) + + # Lower bound on the scale. + assert min_scale > 0.0 + assert min_scale <= initial_scale + self.min_scale = torch.cuda.FloatTensor([min_scale]) + # Growth and backoff factors for the scale. + assert growth_factor > 1.0 + self.growth_factor = torch.cuda.FloatTensor([growth_factor]) + assert backoff_factor < 1.0 + assert backoff_factor > 0.0 + self.backoff_factor = torch.cuda.FloatTensor([backoff_factor]) + # Interval over which if we don't see any inf/nan, + # we will scale the grad scale by the growth factor. + assert growth_interval > 0 + self.growth_interval = growth_interval + # Number of inf/nans we should see before scaling down + # the grad scale by the backoff factor. + assert hysteresis > 0 + self.hysteresis = hysteresis + + # Trackers. + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + + + def update(self, found_inf): + + # If we have an inf/nan, growth tracker is set to 0 + # and hysterisis tracker is reduced by 1. + if found_inf: + self._growth_tracker = 0 + self._hysteresis_tracker -= 1 + # Now if we are out of hysteresis count, scale down the loss. 
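# Worked example (hypothetical values, not from this patch): with hysteresis = 2 and
# backoff_factor = 0.5, the first overflow only decrements the hysteresis tracker; a
# second overflow before the tracker is reset (it is only reset when the scale grows)
# halves the scale, clamped at min_scale, while growth_interval consecutive clean steps
# multiply it by growth_factor.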
+ if self._hysteresis_tracker <= 0: + self._scale = torch.max(self._scale * self.backoff_factor, + self.min_scale) + else: + # If there is no nan/inf, increment the growth tracker. + self._growth_tracker += 1 + # If we have had enough consequitive intervals with no nan/inf: + if self._growth_tracker == self.growth_interval: + # Reset the tracker and hysteresis trackers, + self._growth_tracker = 0 + self._hysteresis_tracker = self.hysteresis + # and scale up the loss scale. + self._scale = self._scale * self.growth_factor + + + def state_dict(self): + state_dict = {} + state_dict['scale'] = self._scale + state_dict['growth_tracker'] = self._growth_tracker + state_dict['hysteresis_tracker'] = self._hysteresis_tracker + return state_dict + + + def load_state_dict(self, state_dict): + self._scale = state_dict['scale'].cuda(torch.cuda.current_device()) + self._growth_tracker = state_dict['growth_tracker'] + self._hysteresis_tracker = state_dict['hysteresis_tracker'] diff --git a/training/benchmarks/gpt2/pytorch/optimizer/optimizer.py b/training/benchmarks/gpt2/pytorch/optimizer/optimizer.py new file mode 100644 index 000000000..2d8f68886 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/optimizer/optimizer.py @@ -0,0 +1,645 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + +"""Megatron optimizer.""" + +from abc import ABC +from abc import abstractmethod + +import torch + +from model.models.module import param_is_not_shared +from optimizer.clip_grads import clip_grad_norm_fp32 + +def _zero_grad_group_helper(group, set_to_none): + """Zero out the gradient for a group of parameters. + Note: copied from torch.optim.optimizer.""" + for param in group: + if param.grad is not None: + if set_to_none: + param.grad = None + else: + if param.grad.grad_fn is not None: + param.grad.detach_() + else: + param.grad.requires_grad_(False) + param.grad.zero_() + + +def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None): + """Use multi-tensor-applier to copy values from one list to another. + We don't have a blfoat16 implementation so for now if the overflow_buf + is not provided, we default back to simple loop copy to be compatible + with bfloat16.""" + if overflow_buf: + overflow_buf.fill_(0) + # Scaling with factor `1.0` is equivalent to copy. + multi_tensor_applier(amp_C.multi_tensor_scale, + overflow_buf, + [this, that], + 1.0) + else: + for this_, that_ in zip(this, that): + that_.copy_(this_) + + + +class MegatronOptimizer(ABC): + + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, + use_contiguous_buffers_in_local_ddp, + models): + + """Input optimizer is the base optimizer for example Adam.""" + self.optimizer = optimizer + assert self.optimizer, 'no optimizer is provided.' + # Set gradient clipping and logging params. + self.clip_grad = clip_grad + self.params_have_main_grad = params_have_main_grad + self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp + + # 'models' are retained for access to the contiguous grad buffers. 
+ # (see distributed optimizer) + self.models = models + + if self.use_contiguous_buffers_in_local_ddp: + assert self.params_have_main_grad, \ + "use of contiguous buffer requires that params have main grad" + + + def get_parameters(self): + params = [] + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + params.append(param) + return params + + + def get_main_grads_for_grad_norm(self): + + # Filter parameters based on: + # - grad should not be none + # - parameter should not be shared + # - should not be a replica due to tensor model parallelism + params = self.get_parameters() + grads_for_norm = [] + for param in params: + grad = param.grad + grad_not_none = grad is not None + is_not_shared = param_is_not_shared(param) + if grad_not_none and is_not_shared: + grads_for_norm.append(grad) + + return grads_for_norm + + + def clip_grad_norm(self, clip_grad): + params = self.get_parameters() + grads_for_norm = self.get_main_grads_for_grad_norm() + return clip_grad_norm_fp32( + params, grads_for_norm, clip_grad) + + + @abstractmethod + def zero_grad(self, set_to_none=True): + pass + + + @abstractmethod + def get_loss_scale(self): + """The output should be a cuda tensor of size 1.""" + pass + + + def scale_loss(self, loss): + """Simple scaling.""" + return self.get_loss_scale() * loss + + + @abstractmethod + def reload_model_params(self): + """Refreshes any internal state from the current model parameters. + Call whenever the parameters are changed outside of the optimizer. + For example, when we load a model from a checkpoint without loading + the optimizer, the model parameters are updated but for fp16 optimizer + with main parameters, the main parameters need to also be updated.""" + pass + + + @abstractmethod + def state_dict(self): + pass + + + @abstractmethod + def load_state_dict(self, state_dict): + pass + + + # Promote state so it can be retrieved or set via + # "optimizer_instance.state" + def _get_state(self): + return self.optimizer.state + + def _set_state(self, value): + self.optimizer.state = value + + state = property(_get_state, _set_state) + + + # Promote param_groups so it can be retrieved or set via + # "optimizer_instance.param_groups" + # (for example, to adjust the learning rate) + def _get_param_groups(self): + return self.optimizer.param_groups + + def _set_param_groups(self, value): + self.optimizer.param_groups = value + + param_groups = property(_get_param_groups, _set_param_groups) + + + @abstractmethod + def step(self, args, timers): + pass + + + def gather_model_params(self, args, timers): + """ + For the case of a non-distributed-optimizer, there is nothing to + do here. + """ + pass + + + def allreduce_word_embedding_grads(self, args): + """ + All-reduce word embedding grads. + + Reduce grads across first and last stages to ensure that word_embeddings + parameters stay in sync. This should only run for models that support + pipelined model parallelism (BERT and GPT-2). + """ + pass + + + def allreduce_position_embedding_grads(self, args): + """ + All-reduce position_embeddings grad across first (encoder) and + split (decoder) stages to ensure that position embeddings parameters + stay in sync. This should only run for T5 models with pipeline + parallelism. 
+ """ + pass + + + def allreduce_embedding_grads(self, args): + """All-reduce both word and position embeddings.""" + self.allreduce_word_embedding_grads(args) + self.allreduce_position_embedding_grads(args) + + + def allreduce_layernorm_grads(self, args): + """All-reduce layernorm grads (for sequence parallelism).""" + + # All-reduce layernorm parameters across model parallel nodes + # when sequence parallelism is used + pass + + + def reduce_model_grads(self, args, timers): + """All-reduce all grads, and all-reduce embeddings.""" + + self.allreduce_layernorm_grads(args) + + # All-reduce embedding grads. + self.allreduce_embedding_grads(args) + + +class MixedPrecisionOptimizer(MegatronOptimizer): + """Base class for both the float-16 and the distributed optimizer. + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + params_dtype: used by distributed optimizer. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. + """ + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, + models): + + super().__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + models) + + self.fp16 = fp16 + self.bf16 = bf16 + self.params_dtype = params_dtype + self.grad_scaler = grad_scaler + + # None grad scaler is only supported for bf16. + if self.grad_scaler is None: + assert not self.fp16, 'fp16 expects a grad scaler.' + + # Tensor used to determine if a nan/if has happend. + # Any non-zero value indicates inf/nan. + # Note that we keep this for the cases that grad scaler is none. + # We still record nan/inf if we have a bfloat16 with a grad scaler. + if self.grad_scaler: + self.found_inf = torch.cuda.FloatTensor([0.0]) + + # Dummy tensor needed for apex multi-apply tensor. + # For bfloat, we don't have multi-tensor apply and for now + # we set it to none so the multi-tensor apply gets ignored. + if bf16: + self._dummy_overflow_buf = None + else: + self._dummy_overflow_buf = torch.cuda.IntTensor([0]) + + # In case grad scaler is not passed, define the unity scale. 
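# When no grad scaler is supplied (the bf16-without-loss-scaling case), get_loss_scale()
# returns this constant 1.0 tensor, so scale_loss(loss) is a no-op multiplication and
# step() skips the unscale / inf-check path entirely.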
+ if self.grad_scaler is None: + self._scale_one = torch.cuda.FloatTensor([1.0]) + + + def get_loss_scale(self): + if self.grad_scaler is None: + return self._scale_one + return self.grad_scaler.scale + + + def reload_model_params(self): + self._copy_model_params_to_main_params() + + + def _unscale_main_grads_and_check_for_nan(self): + + # Collect main grads. + main_grads = self._collect_main_grad_data_for_unscaling() + + # Reset found inf. + self.found_inf.fill_(0.0) + + # Unscale and set found inf/nan + torch._amp_foreach_non_finite_check_and_unscale_( + main_grads, self.found_inf, self.grad_scaler.inv_scale) + + # Check for nan. + found_inf_flag = (self.found_inf.item() > 0) + + return found_inf_flag + + + @torch.no_grad() + def step(self, args, timers): + + # Copy gradients from model params to main params. + self._copy_model_grads_to_main_grads() + + # Do unscale, check for inf, and update grad scaler only for + # the case that grad scaler is provided. + if self.grad_scaler: + + # Unscale and check for inf/nan. + found_inf_flag = self._unscale_main_grads_and_check_for_nan() + + # We are done with scaling gradients + # so we can update the loss scale. + self.grad_scaler.update(found_inf_flag) + + # If we found inf/nan, skip the update. + if found_inf_flag: + return False, None, None + + # Clip the main gradients. + grad_norm = None + if self.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.clip_grad) + + # Count the zeros in the grads. + num_zeros_in_grad = None + + # Step the optimizer. + self.optimizer.step() + + # Update params from main params. + self._copy_main_params_to_model_params() + + # Successful update. + return True, grad_norm, num_zeros_in_grad + + +class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer): + """Float16 optimizer for fp16 and bf16 data types. + + Arguments: + optimizer: base optimizer such as Adam or SGD + clip_grad: clip gradeints with this global L2 norm. Note + that clipping is ignored if clip_grad == 0 + params_have_main_grad: flag indicating if parameters have + a `main_grad` field. If this is set, we are assuming + that the model parameters are store in the `main_grad` + field instead of the typical `grad` field. This happens + for the DDP cases where there is a continuous buffer + holding the gradients. For example for bfloat16, we want + to do gradient accumulation and all-reduces in float32 + and as a result we store those gradients in the main_grad. + Note that main grad is not necessarily in float32. + use_contiguous_buffers_in_local_ddp: if true, the local DDP model + is using a contiguous buffer to hold the model grads. + fp16: if true, the model is running in fp16. + bf16: if true, the model is running in bfloat16. + grad_scaler: used for scaling gradients. Note that this can be + None. This case happens when `bf16 = True` and we don't + use any loss scale. Note that for `bf16 = True`, we can have + a constnat gradient scaler. Also for `bf16 = False`, we + always require a grad scaler. + models: list of models (i.e., the virtual pipelining models). This + is used by the distributed optimizer for mapping parameters. 
+ """ + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models): + + super().__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + fp16, bf16, params_dtype, grad_scaler, models) + + # ====================== + # main parameter stuff + # ====================== + + # Three groups of parameters: + # float16_groups: original float16 parameters + # fp32_from_float16_groups: fp32 copy of float16 parameters + # fp32_from_fp32_groups: original fp32 parameters + self.float16_groups = [] + self.fp32_from_float16_groups = [] + self.fp32_from_fp32_groups = [] + + # For all the groups in the original optimizer: + for param_group in self.optimizer.param_groups: + float16_params_this_group = [] + fp32_params_this_group = [] + fp32_from_float16_params_this_group = [] + # For all the parameters in this group: + for i, param in enumerate(param_group['params']): + if param.requires_grad: + + # float16 params: + if param.type() in ['torch.cuda.HalfTensor', + 'torch.cuda.BFloat16Tensor', + 'torch.xpu.HalfTensor', + 'torch.xpu.BFloat16Tensor']: + float16_params_this_group.append(param) + # Create a copy + main_param = param.detach().clone().float() + + if hasattr(param, 'shared'): + main_param.shared = param.shared + # Replace the optimizer params with the new fp32 copy. + param_group['params'][i] = main_param + + fp32_from_float16_params_this_group.append(main_param) + # Reset existing state dict key to the new main param. + if param in self.optimizer.state: + self.optimizer.state[main_param] \ + = self.optimizer.state.pop(param) + # fp32 params. + elif param.type() in ['torch.cuda.FloatTensor', + 'torch.xpu.FloatTensor']: + fp32_params_this_group.append(param) + param_group['params'][i] = param + + else: + raise TypeError('Wrapped parameters must be one of ' + 'torch.cuda.FloatTensor, or ' + 'torch.cuda.HalfTensor, or ' + 'torch.cuda.BFloat16Tensor, or ' + 'torch.xpu.FloatTensor, or ' + 'torch.xpu.HalfTensor, or ' + 'torch.xpu.BFloat16Tensor. ' + 'Received {}'.format(param.type())) + + self.float16_groups.append(float16_params_this_group) + self.fp32_from_float16_groups.append( + fp32_from_float16_params_this_group) + self.fp32_from_fp32_groups.append(fp32_params_this_group) + + + def zero_grad(self, set_to_none=True): + """We only need to zero the model related parameters, i.e., + float16_groups & fp32_from_fp32_groups. We additionally zero + fp32_from_float16_groups as a memory optimization to reduce + fragmentation; in the case of set_to_none==True, the space + used by this field can be safely deallocated at this point.""" + for group in self.float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_float16_groups: + _zero_grad_group_helper(group, set_to_none) + for group in self.fp32_from_fp32_groups: + _zero_grad_group_helper(group, set_to_none) + + + def _collect_main_grad_data_for_unscaling(self): + + main_grads = [] + + # fp32 params from float16 ones. + for main_group in self.fp32_from_float16_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + # Append fp32 parameters. 
+ for main_group in self.fp32_from_fp32_groups: + for main_param in main_group: + if main_param.grad is not None: + main_grads.append(main_param.grad.data) + + return main_grads + + + def _get_model_and_main_params_data_float16(self): + model_data = [] + main_data = [] + for model_group, main_group in zip(self.float16_groups, + self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + model_data.append(model_param.data) + main_data.append(main_param.data) + return model_data, main_data + + + def _copy_model_grads_to_main_grads(self): + # This only needs to be done for the float16 group. + for model_group, main_group in zip(self.float16_groups, + self.fp32_from_float16_groups): + for model_param, main_param in zip(model_group, main_group): + if self.params_have_main_grad and hasattr(model_param, 'main_grad'): + main_param.grad = model_param.main_grad.float() + else: + if model_param.grad is not None: + main_param.grad = model_param.grad.float() + + # Safe to deallocate model's grad/main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + model_param.grad = None + if self.params_have_main_grad and \ + not self.use_contiguous_buffers_in_local_ddp: + model_param.main_grad = None + + # For fp32 grads, we need to reset the grads to main grad. + if self.params_have_main_grad: + for model_group in self.fp32_from_fp32_groups: + for model_param in model_group: + model_param.grad = model_param.main_grad + + # Safe to de-reference model's main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + if not self.use_contiguous_buffers_in_local_ddp: + model_param.main_grad = None + + + def _copy_main_params_to_model_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that(this=main_data, that=model_data, + overflow_buf=self._dummy_overflow_buf) + + + def _copy_model_params_to_main_params(self): + # Only needed for the float16 params. + model_data, main_data = self._get_model_and_main_params_data_float16() + _multi_tensor_copy_this_to_that(this=model_data, that=main_data, + overflow_buf=self._dummy_overflow_buf) + + + def state_dict(self): + state_dict = {} + state_dict['optimizer'] = self.optimizer.state_dict() + if self.grad_scaler: + state_dict['grad_scaler'] = self.grad_scaler.state_dict() + state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups + return state_dict + + + def load_state_dict(self, state_dict): + # Optimizer. + optimizer_key = 'optimizer' + if optimizer_key not in state_dict: + optimizer_key = 'optimizer_state_dict' + + self.optimizer.load_state_dict(state_dict[optimizer_key]) + + # Grad scaler. + if 'grad_scaler' in state_dict: + if self.grad_scaler: + self.grad_scaler.load_state_dict(state_dict['grad_scaler']) + + # Copy data for the main params. 
+ fp32_from_float16_params_key = 'fp32_from_fp16_params' + if fp32_from_float16_params_key not in state_dict: + fp32_from_float16_params_key = 'fp32_from_fp16' + for current_group, saved_group in zip( + self.fp32_from_float16_groups, + state_dict[fp32_from_float16_params_key]): + for current_param, saved_param in zip(current_group, saved_group): + current_param.data.copy_(saved_param.data) + + +class FP32Optimizer(MegatronOptimizer): + + def __init__(self, optimizer, clip_grad, + params_have_main_grad, + use_contiguous_buffers_in_local_ddp, + models): + + super(FP32Optimizer, self).__init__( + optimizer, clip_grad, + params_have_main_grad, use_contiguous_buffers_in_local_ddp, + models) + + self._scale = torch.cuda.FloatTensor([1.0]) + + + def zero_grad(self, set_to_none=True): + """Copied from torch.optim.optimizer""" + for group in self.optimizer.param_groups: + _zero_grad_group_helper(group['params'], set_to_none) + + + def get_loss_scale(self): + """FP32 optimizer does not do any scaling.""" + return self._scale + + + @torch.no_grad() + def step(self): + """Clip gradients (if needed) and step the base optimizer. + Always return successful since there is no overflow.""" + + # Copy main_grads to grads. + if self.params_have_main_grad: + for param_group in self.optimizer.param_groups: + for param in param_group['params']: + param.grad = param.main_grad + + # Safe to de-reference model's main_grad after copying. + # (If using contiguous buffers, main_grad's memory should + # persist and therefore should not be deallocated.) + if not self.use_contiguous_buffers_in_local_ddp: + param.main_grad = None + + # Clip gradients. + grad_norm = None + if self.clip_grad > 0.0: + grad_norm = self.clip_grad_norm(self.clip_grad) + + # count the zeros in the grads + num_zeros_in_grad = None + # Update parameters. + self.optimizer.step() + + # No overflow for FP32 optimizer. + return True, grad_norm, num_zeros_in_grad + + + def reload_model_params(self): + pass + + + def state_dict(self): + return self.optimizer.state_dict() + + + def load_state_dict(self, state_dict): + self.optimizer.load_state_dict(state_dict) diff --git a/training/benchmarks/gpt2/pytorch/run_pretraining.py b/training/benchmarks/gpt2/pytorch/run_pretraining.py new file mode 100644 index 000000000..e4c3d0738 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/run_pretraining.py @@ -0,0 +1,144 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") + +"""GPT2 Pretraining""" + +import argparse +import os +import random +import sys +import time +from functools import partial + +import numpy as np +import torch + +from train.trainer import Trainer +from train import trainer_adapter +from train.evaluator import Evaluator +from train.training_state import TrainingState +from dataloaders.gpt_dataset import build_train_test_datasets, build_train_test_data_dataloaders + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, Event, dist_pytorch, check + +logger = None + + +def main(): + import config + global logger + + if config.use_env and 'LOCAL_RANK' in os.environ: + config.local_rank = int(os.environ['LOCAL_RANK']) + + gpt2_driver = Driver(config, config.mutable_params) + gpt2_driver.setup_config(argparse.ArgumentParser("GPT2")) + gpt2_driver.setup_modules(globals(), locals()) + + logger = gpt2_driver.logger + dist_pytorch.init_dist_training_env(config) + + check.check_config(config) + + dist_pytorch.barrier(config.vendor) + gpt2_driver.event(Event.INIT_START) + init_start_time = logger.previous_log_time + + random.seed(config.seed) + np.random.seed(config.seed) + torch.manual_seed(config.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(config.seed) + + config.global_batch_size = config.train_batch_size * config.n_device * config.gradient_accumulation_steps + + train_data_path = os.path.join(config.data_dir, config.train_data_prefix) + test_data_path = os.path.join(config.data_dir, config.test_data_prefix) + build_train_test_dataset_fn = partial( + build_train_test_datasets, + seq_length=config.seq_length, + seed=config.seed, + skip_warmup=(not config.mmap_warmup), + train_data_prefix=train_data_path, + test_data_prefix=test_data_path, + ) + train_dataloader, eval_dataloader= build_train_test_data_dataloaders(build_train_test_dataset_fn) + + evaluator = Evaluator(config, eval_dataloader) + training_state = TrainingState() + trainer = Trainer(driver=gpt2_driver, + adapter=trainer_adapter, + evaluator=evaluator, + training_state=training_state, + device=config.device, + config=config) + + training_state._trainer = trainer + + dist_pytorch.barrier(config.vendor) + trainer.init() + + dist_pytorch.barrier(config.vendor) + init_evaluation_start = time.time() + training_state.eval_lambada_acc = evaluator.evaluate( + trainer) + init_evaluation_end = time.time() + init_evaluation_info = dict( + eval_lambada_acc=training_state.eval_lambada_acc, + time=init_evaluation_end - init_evaluation_start) + gpt2_driver.event(Event.INIT_EVALUATION, init_evaluation_info) + + if not config.do_train: + return config, training_state + + gpt2_driver.event(Event.INIT_END) + init_end_time = logger.previous_log_time + training_state.init_time = (init_end_time - init_start_time) / 1e+3 + + dist_pytorch.barrier(config.vendor) + epoch = -1 + gpt2_driver.event(Event.TRAIN_START) + raw_train_start_time = logger.previous_log_time + while training_state.global_steps < config.max_steps and not training_state.end_training: + epoch += 1 + training_state.epoch = epoch + trainer.train_one_epoch(train_dataloader) + gpt2_driver.event(Event.TRAIN_END) + raw_train_end_time = logger.previous_log_time + training_state.raw_train_time = (raw_train_end_time - + raw_train_start_time) / 1e+3 + return config, training_state + + +if __name__ == "__main__": + now = time.time() + config_updated, state = main() + + 
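# Only the main process reports results below. The throughput fields are samples per
# second over three windows, e.g. (hypothetical numbers) 20000 trained samples in 400 s
# of raw train time gives "throughput(sps)_raw" = 50.0, with the no-eval and
# pure-compute variants dividing by successively smaller time windows.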
if not dist_pytorch.is_main_process(): + exit() + + e2e_time = time.time() - now + trained_samples = state.num_trained_samples + if config_updated.do_train: + finished_info = { + "e2e_time": e2e_time, + "train_samples": trained_samples, + "train_time": state.raw_train_time, + "train_no_eval_time": state.no_eval_time, + "pure_training_computing_time": state.pure_compute_time, + "throughput(sps)_raw": + trained_samples / state.raw_train_time, + "throughput(sps)_no_eval": + trained_samples / state.no_eval_time, + "throughput(sps)_pure_compute": + trained_samples / state.pure_compute_time, + "converged": state.converged, + "final_accuracy": state.eval_lambada_acc, + } + else: + finished_info = {"e2e_time": e2e_time} + logger.log(Event.FINISHED, message=finished_info, stacklevel=0) + diff --git a/training/benchmarks/gpt2/pytorch/schedulers/__init__.py b/training/benchmarks/gpt2/pytorch/schedulers/__init__.py new file mode 100755 index 000000000..3e685cf5a --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/schedulers/__init__.py @@ -0,0 +1 @@ +from .factory import create_scheduler diff --git a/training/benchmarks/gpt2/pytorch/schedulers/factory.py b/training/benchmarks/gpt2/pytorch/schedulers/factory.py new file mode 100755 index 000000000..9cf37c048 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/schedulers/factory.py @@ -0,0 +1,34 @@ +from schedulers.optimizer_param_scheduler import OptimizerParamScheduler +import config + +def create_scheduler(optimizer): + """Build the learning rate scheduler.""" + + # Iteration-based training. + if config.max_steps: + config.lr_decay_iters = config.max_steps + lr_decay_steps = config.lr_decay_iters * config.global_batch_size + wd_incr_steps = config.max_steps* config.global_batch_size + if config.lr_warmup_fraction is not None: + lr_warmup_steps = config.lr_warmup_fraction * lr_decay_steps + else: + lr_warmup_steps = config.lr_warmup_iters * config.global_batch_size + else: + raise Exception( + 'either train-iters or train-samples should be provided.') + + opt_param_scheduler = OptimizerParamScheduler( + optimizer, + max_lr=config.lr, + min_lr=config.min_lr, + lr_warmup_steps=lr_warmup_steps, + lr_decay_steps=lr_decay_steps, + lr_decay_style=config.lr_decay_style, + start_wd=config.start_weight_decay, + end_wd=config.end_weight_decay, + wd_incr_steps=wd_incr_steps, + wd_incr_style=config.weight_decay_incr_style, + ) + + return opt_param_scheduler + diff --git a/training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py b/training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py new file mode 100644 index 000000000..aaa0a37f3 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/schedulers/optimizer_param_scheduler.py @@ -0,0 +1,214 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""Learning rate decay and weight decay incr functions.""" + +import math + +class OptimizerParamScheduler(object): + """Anneals learning rate and weight decay""" + + def __init__(self, optimizer, max_lr, min_lr, + lr_warmup_steps, lr_decay_steps, lr_decay_style, + start_wd, end_wd, wd_incr_steps, wd_incr_style, + use_checkpoint_opt_param_scheduler=False, + override_opt_param_scheduler=False): + + # Class values. 
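+        # Note: the warmup/decay horizons and wd_incr_steps are expressed in consumed
+        # samples rather than optimizer steps (see factory.py, which multiplies the
+        # iteration counts by global_batch_size); max_lr/min_lr bound the LR schedule
+        # and start_wd/end_wd bound the weight-decay ramp.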
+ self.optimizer = optimizer + + self.max_lr = float(max_lr) + self.min_lr = min_lr + assert self.min_lr >= 0.0 + assert self.max_lr >= self.min_lr + + self.lr_warmup_steps = lr_warmup_steps + self.num_steps = 0 + self.lr_decay_steps = lr_decay_steps + assert self.lr_decay_steps > 0 + assert self.lr_warmup_steps < self.lr_decay_steps + + self.lr_decay_style = lr_decay_style + + self.start_wd = start_wd + self.end_wd = end_wd + assert self.start_wd >= 0.0 + assert self.end_wd >= self.start_wd + self.wd_incr_steps = wd_incr_steps + self.wd_incr_style = wd_incr_style + + self.override_opt_param_scheduler = override_opt_param_scheduler + self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler + if self.override_opt_param_scheduler: + assert not self.use_checkpoint_opt_param_scheduler, 'both override and '\ + 'use-checkpoint are set.' + + # Set the learning rate + self.step(0) + + + def get_wd(self): + """ Weight decay incr functions""" + if self.num_steps > self.wd_incr_steps: + return self.end_wd + + if self.wd_incr_style == 'constant': + assert self.start_wd == self.end_wd + return self.end_wd + + incr_ratio = float(self.num_steps) / float(self.wd_incr_steps) + assert incr_ratio >= 0.0 + assert incr_ratio <= 1.0 + delta_wd = self.end_wd - self.start_wd + + if self.wd_incr_style == 'linear': + coeff = incr_ratio + elif self.wd_incr_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * (1 - incr_ratio)) + 1.0) + else: + raise Exception('{} weight decay increment style is not supported.'.format( + self.wd_incr_style)) + + return self.start_wd + coeff * delta_wd + + + def get_lr(self): + """Learning rate decay functions from: + https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + # Use linear warmup for the initial part. + if self.lr_warmup_steps > 0 and self.num_steps <= self.lr_warmup_steps: + return self.max_lr * float(self.num_steps) / \ + float(self.lr_warmup_steps) + + # If the learning rate is constant, just return the initial value. + if self.lr_decay_style == 'constant': + return self.max_lr + + # For any steps larger than `self.lr_decay_steps`, use `self.min_lr`. + if self.num_steps > self.lr_decay_steps: + return self.min_lr + + # If we are done with the warmup period, use the decay style. 
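+        # 'inverse-square-root' decays as max_lr * sqrt(warmup_steps / num_steps),
+        # clamped below by min_lr; 'linear' and 'cosine' (handled further down)
+        # interpolate between max_lr and min_lr over the remaining post-warmup steps.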
+ if self.lr_decay_style == 'inverse-square-root': + warmup_steps = max(self.lr_warmup_steps, 1) + num_steps = max(self.num_steps, 1) + lr = self.max_lr * warmup_steps ** 0.5 / (num_steps ** 0.5) + return max(self.min_lr, lr) + + num_steps_ = self.num_steps - self.lr_warmup_steps + decay_steps_ = self.lr_decay_steps - self.lr_warmup_steps + decay_ratio = float(num_steps_) / float(decay_steps_) + assert decay_ratio >= 0.0 + assert decay_ratio <= 1.0 + delta_lr = self.max_lr - self.min_lr + + if self.lr_decay_style == 'linear': + coeff = (1.0 - decay_ratio) + elif self.lr_decay_style == 'cosine': + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + else: + raise Exception('{} decay style is not supported.'.format( + self.lr_decay_style)) + + return self.min_lr + coeff * delta_lr + + + def step(self, increment): + """Set lr for all parameters groups.""" + self.num_steps += increment + new_lr = self.get_lr() + new_wd = self.get_wd() + for group in self.optimizer.param_groups: + group['lr'] = new_lr * group.get('lr_mult', 1.0) + group['weight_decay'] = new_wd * group.get('wd_mult', 1.0) + + + def state_dict(self): + state_dict = { + 'max_lr': self.max_lr, + 'lr_warmup_steps': self.lr_warmup_steps, + 'num_steps': self.num_steps, + 'lr_decay_style': self.lr_decay_style, + 'lr_decay_steps': self.lr_decay_steps, + 'min_lr': self.min_lr, + 'start_wd': self.start_wd, + 'end_wd': self.end_wd, + 'wd_incr_style': self.wd_incr_style, + 'wd_incr_steps': self.wd_incr_steps + } + return state_dict + + + def _check_and_set(self, cls_value, sd_value, name): + """Auxiliary function for checking the values in the checkpoint and + setting them.""" + if self.override_opt_param_scheduler: + return cls_value + + if not self.use_checkpoint_opt_param_scheduler: + assert cls_value == sd_value, \ + f'OptimizerParamScheduler: class input value {cls_value} and checkpoint' \ + f'value {sd_value} for {name} do not match' + return sd_value + + + def load_state_dict(self, sd): + + if 'start_lr' in sd: + max_lr_ = sd['start_lr'] + else: + max_lr_ = sd['max_lr'] + self.max_lr = self._check_and_set(self.max_lr, max_lr_, + 'learning rate') + + self.min_lr = self._check_and_set(self.min_lr, sd['min_lr'], + 'minimum learning rate') + + if 'warmup_iter' in sd: + lr_warmup_steps_ = sd['warmup_iter'] + elif 'warmup_steps' in sd: + lr_warmup_steps_ = sd['warmup_steps'] + else: + lr_warmup_steps_ = sd['lr_warmup_steps'] + self.lr_warmup_steps = self._check_and_set(self.lr_warmup_steps, + lr_warmup_steps_, + 'warmup iterations') + + if 'end_iter' in sd: + lr_decay_steps_ = sd['end_iter'] + elif 'decay_steps' in sd: + lr_decay_steps_ = sd['decay_steps'] + else: + lr_decay_steps_ = sd['lr_decay_steps'] + self.lr_decay_steps = self._check_and_set(self.lr_decay_steps, lr_decay_steps_, + 'total number of iterations') + + if 'decay_style' in sd: + lr_decay_style_ = sd['decay_style'] + else: + lr_decay_style_ = sd['lr_decay_style'] + self.lr_decay_style = self._check_and_set(self.lr_decay_style, + lr_decay_style_, + 'learning rate decay style') + + if 'num_iters' in sd: + num_steps = sd['num_iters'] + else: + num_steps = sd['num_steps'] + self.step(increment=num_steps) + + + if 'start_wd' in sd: + self.start_wd = self._check_and_set(self.start_wd, + sd['start_wd'], + "start weight decay") + self.end_wd = self._check_and_set(self.end_wd, + sd['end_wd'], + "end weight decay") + self.wd_incr_steps = self._check_and_set(self.wd_incr_steps, + sd['wd_incr_steps'], + "total number of weight decay iterations") + self.wd_incr_style = 
self._check_and_set(self.wd_incr_style, + sd['wd_incr_style'], + "weight decay incr style") + diff --git a/training/benchmarks/gpt2/pytorch/train/__init__.py b/training/benchmarks/gpt2/pytorch/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/gpt2/pytorch/train/evaluator.py b/training/benchmarks/gpt2/pytorch/train/evaluator.py new file mode 100644 index 000000000..23d4445ff --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/evaluator.py @@ -0,0 +1,46 @@ +import os +import sys +import torch + +from train.utils import process_batch_eval + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) +from driver import dist_pytorch + +class Evaluator: + + def __init__(self, config, dataloader): + self.config = config + self.eval_dataloader = dataloader + + def evaluate(self, trainer): + model = trainer.model + model.eval() + + total_output = 0.0 + num_examples = len(self.eval_dataloader.dataset) + with torch.no_grad(): + # For all the batches in the dataset. + for iteration, batch in enumerate(self.eval_dataloader): + + # Get the batch. + tokens, labels, attention_mask, position_ids, loss_mask = process_batch_eval( + batch) + # Forward pass through the model. + output = model(tokens, position_ids, attention_mask) + # For accuracy, return the number of correctly predicted samples. + outputs = torch.argmax(output, -1) + correct = (outputs == labels).float() + correct[(1 - loss_mask).bool()] = 1 + correct = correct.prod(-1) + output = correct.sum() + + # Reduce across processes. + if dist_pytorch.is_dist_avail_and_initialized(): + torch.distributed.all_reduce(output) + + total_output += output + acc = total_output / num_examples + model.eval() + return acc.item() diff --git a/training/benchmarks/gpt2/pytorch/train/trainer.py b/training/benchmarks/gpt2/pytorch/train/trainer.py new file mode 100644 index 000000000..0eade57d5 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/trainer.py @@ -0,0 +1,190 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") + +import math +import time +import os +import sys + +import torch +from torch.types import Device + +from model import create_model +from schedulers import create_scheduler +from train.evaluator import Evaluator +from train.training_state import TrainingState +from model.losses.cross_entropy import cross_entropy +from train.utils import get_batch + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../"))) +from driver import Driver, Event, dist_pytorch + +def _transpose_first_dim(t, num_splits, num_splits_first, model): + input_shape = t.size() + # We use a self_attention module but the values extracted aren't + # specific to self attention so should work for cross attention as well + while hasattr(model, 'module'): + model = model.module + attention_module = model.language_model.encoder.layers[0].self_attention + hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head + num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition + if num_splits_first: + """[num_splits * np * hn, h] + -->(view) [num_splits, np, hn, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_splits, num_attention_heads_per_partition, + hidden_size_per_attention_head) + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(0, 1).contiguous() + else: + """[np * hn * num_splits, h] + -->(view) [np, hn, num_splits, h] + -->(tranpose) [np, num_splits, hn, h] + -->(view) [np * num_splits * hn, h] """ + + intermediate_shape = \ + (num_attention_heads_per_partition, + hidden_size_per_attention_head, num_splits) +\ + input_shape[1:] + + t = t.view(*intermediate_shape) + t = t.transpose(1, 2).contiguous() + t = t.view(*input_shape) + + return t + + +class Trainer(): + + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, config): + super(Trainer, self).__init__() + self.config = config + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.grad_scaler = None + + self.device = device + self.optimizer = None + self.bert_config = None + self.model = None + self.evaluator = evaluator + self.lr_scheduler = None + self.global_batch_size = None + self.overflow_buf = None + + def init(self): + self.model_config, self.model = create_model(self.config) + self.model = self._init_model(self.model, self.device) + self.model = self.adapter.convert_model(self.config, self.model) + self.model = self.model.to(self.config.device) + + self.optimizer = self.adapter.create_optimizer(self.config, self.model) + self.model, self.optimizer = self.adapter.model_to_fp16( + self.config, self.model, self.optimizer) + self.model = self.adapter.model_to_ddp(self.config, self.model) + self.lr_scheduler = create_scheduler(self.optimizer) + + def _init_model(self, model, device): + model = model.to(device) + return model + + def train_one_epoch(self, dataloader): + state = self.training_state + driver = self.driver + driver.event(Event.EPOCH_BEGIN, state.epoch) + + no_eval_start = time.time() + for _, data in enumerate(dataloader): + data['text'] = data['text'].to(self.device) + tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data) + + pure_compute_start = time.time() + state.global_steps += 1 + state.num_trained_samples = state.global_steps * 
dist_pytorch.global_batch_size( + self.config) + + driver.event(Event.STEP_BEGIN, step=state.global_steps) + self.train_one_step(tokens, position_ids, attention_mask, labels, loss_mask) + + train_end = time.time() + state.pure_compute_time += train_end - pure_compute_start + state.no_eval_time += train_end - no_eval_start + + other_state = dict() + if state.global_steps % self.config.gradient_accumulation_steps == 0: + sequences_per_second = state.num_trained_samples / state.no_eval_time + other_state["seq/s"] = sequences_per_second + + eval_result = None + if self.can_do_eval(state): + eval_start = time.time() + state.eval_lambada_acc = self.evaluator.evaluate( + self) + eval_end = time.time() + eval_result = dict( + global_steps=state.global_steps, + eval_lambada_acc=state.eval_lambada_acc, + time=eval_end - eval_start) + + end_training = self.detect_training_status(state) + + step_info = state.to_dict(**other_state) + driver.event(Event.STEP_END, + message=step_info, + step=state.global_steps, + loss=state.loss) + + if eval_result is not None: + driver.event(Event.EVALUATE, eval_result) + + if end_training: + break + no_eval_start = time.time() + + driver.event(Event.EPOCH_END, state.epoch) + + def train_one_step(self, tokens, position_ids, attention_mask, labels, loss_mask): + + state = self.training_state + self.model.train() + + losses = self.model(tokens, position_ids, attention_mask, labels=labels) + #loss 为标量 + loss = torch.sum(losses.view(-1) * loss_mask.view(-1)) / loss_mask.view(-1).sum() + state.loss = loss + self.adapter.backward(state.global_steps, loss, + self.optimizer, self.lr_scheduler) + self.driver.event(Event.BACKWARD, state.global_steps, state.loss, + self.optimizer) + + + def detect_training_status(self, state: TrainingState): + if state.eval_lambada_acc >= self.config.target_acc: + state.converged_success() + + if state.global_steps >= self.config.max_steps: + state.end_training = True + + return state.end_training + + def can_do_eval(self, state: TrainingState): + do_eval = all([ + self.config.test_data_prefix is not None, + state.num_trained_samples >= self.config.eval_iter_start_samples, + self.config.eval_interval_samples > 0, + state.global_steps > 1, + state.global_steps % + math.ceil(self.config.eval_interval_samples / + dist_pytorch.global_batch_size(self.config)) == 0, + ]) + + return do_eval or state.global_steps >= self.config.max_steps + diff --git a/training/benchmarks/gpt2/pytorch/train/trainer_adapter.py b/training/benchmarks/gpt2/pytorch/train/trainer_adapter.py new file mode 100644 index 000000000..d9462ca05 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/trainer_adapter.py @@ -0,0 +1,57 @@ +# Copyright © 2022 BAAI. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License") + +from typing import Tuple + +import torch +from torch.optim import Optimizer +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as NativeDDP + +import config +from optimizer import get_megatron_optimizer + +GPT_MODEL = torch.nn.Module + +def convert_model(config, model: GPT_MODEL) -> GPT_MODEL: + return model + +def create_optimizer(config, model: GPT_MODEL) -> Optimizer: + return get_megatron_optimizer(model) + + +def model_to_fp16(config, model: GPT_MODEL, + optimizer: Optimizer) -> Tuple[GPT_MODEL, Optimizer]: + if config.fp16: + model.half() + return model, optimizer + + +def model_to_ddp(config, model: GPT_MODEL) -> GPT_MODEL: + use_ddp = dist.is_initialized() + + if use_ddp: + if config.DDP_impl == 'native': + model = NativeDDP( + model, + device_ids=[config.local_rank]) + else: + assert False, "Invalid DDP type" + return model + + +def backward(step: int, + loss: torch.Tensor, + optimizer: Optimizer, + lr_scheduler): + if config.fp16: + optimizer.backward(loss) + else: + loss.backward() + + if step % config.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + increment = config.train_batch_size * config.n_device * config.gradient_accumulation_steps + lr_scheduler.step(increment) diff --git a/training/benchmarks/gpt2/pytorch/train/training_state.py b/training/benchmarks/gpt2/pytorch/train/training_state.py new file mode 100644 index 000000000..3ca60b2b9 --- /dev/null +++ b/training/benchmarks/gpt2/pytorch/train/training_state.py @@ -0,0 +1,79 @@ +# Copyright © 2022 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") + +from dataclasses import dataclass + +import torch +import inspect + +@dataclass +class TrainingState: + _trainer = None + _status = 'aborted' # later set to 'success' if termination criteria met + + global_steps = 0 + skipped_steps = 0 + iter_dataloader_idx = 0 + + loss: float = 0.0 + lambada_acc: float = 0.0 + + epoch: int = 1 + num_trained_samples = 0 + end_training: bool = False + converged: bool = False + + eval_lambada_acc = 0 + + init_time = 0 + raw_train_time = 0 + + no_eval_time = 0.0 + pure_compute_time = 0.0 + + def status(self): + if self.converged: + self._status = "success" + return self._status + + def converged_success(self): + self.end_training = True + self.converged = True + + def _is_property(self, value): + status = [ + not callable(value), not inspect.isclass(value), + not inspect.ismodule(value), not inspect.ismethod(value), + not inspect.isfunction(value), not inspect.isbuiltin(value), + "classmethod object" not in str(value) + ] + return all(status) + + def to_dict(self, **kwargs): + state_dict = dict() + + for var_name, value in self.__dict__.items(): + if not var_name.startswith("_") and self._is_property(value): + state_dict[var_name] = value + + lr = self._trainer.lr_scheduler.get_lr() + if isinstance(lr, (tuple, list)): + lr = lr[0] + state_dict["learning_rate"] = lr + + exclude = [ + "eval_lambada_acc", "skipped_steps", + "converged", "init_time", "raw_train_time" + ] + for exkey in exclude: + if exkey in state_dict: + state_dict.pop(exkey) + + state_dict.update(kwargs) + + for k in state_dict.keys(): + if torch.is_tensor(state_dict[k]): + state_dict[k] = state_dict[k].item() + + return state_dict diff --git a/training/benchmarks/gpt2/pytorch/train/utils.py b/training/benchmarks/gpt2/pytorch/train/utils.py new file mode 100644 index 000000000..25391a403 --- 
/dev/null +++ b/training/benchmarks/gpt2/pytorch/train/utils.py @@ -0,0 +1,101 @@ +import torch + +import config +from dataloaders import get_tokenizer + +def get_ltor_masks_and_position_ids(data, + eod_token, + reset_position_ids, + reset_attention_mask, + eod_mask_loss): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + train_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = train_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril(torch.ones( + (att_mask_batch, seq_length, seq_length), device=data.device)).view( + att_mask_batch, 1, seq_length, seq_length) + + # Loss mask. + loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device) + if eod_mask_loss: + loss_mask[data == eod_token] = 0.0 + + # Position ids. + position_ids = torch.arange(seq_length, dtype=torch.long, + device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(train_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= (i + 1 - prev_index) + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + + return attention_mask, loss_mask, position_ids + +def get_batch(data): + """Generate a batch""" + tokenizer = get_tokenizer() + + # Unpack. + tokens_ = data['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + config.reset_position_ids, + config.reset_attention_mask, + config.eod_mask_loss) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def process_batch_eval(batch): + """Process batch and produce inputs for the model.""" + tokenizer = get_tokenizer() + + loss_mask = batch['pad_mask'].long().cuda().contiguous().byte() + tokens_ = batch['text'].long().cuda().contiguous() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. 
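+    # The loss mask returned by the helper is discarded here; evaluation uses the
+    # 'pad_mask'-based loss_mask computed above instead.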
+ attention_mask, _, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + config.reset_position_ids, + config.reset_attention_mask, + config.eod_mask_loss) + + return tokens, labels, attention_mask, position_ids, loss_mask diff --git a/training/nvidia/gpt2-pytorch/README.md b/training/nvidia/gpt2-pytorch/README.md new file mode 100644 index 000000000..2b981c65b --- /dev/null +++ b/training/nvidia/gpt2-pytorch/README.md @@ -0,0 +1,42 @@ +### 测试数据集下载 +[测试数据集下载](../../benchmarks/gpt2/README.md#测试数据集下载) + +### Nvidia GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-1.12.0a0+bd13bc6 + - 依赖软件版本: + - cuda: 11.6 + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | 自然语言编码 | | +| 模型 | megatron-gpt2-345m | | +| 数据集 | lambada | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练序列数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1),单位为samples/s(seq_length=1024)| +| 训练结果 | lambada_acc,见“性能指标” | lambada任务准确率 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | lambada_acc | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ------- | --------- | +| A100单机单卡(1x1) | fp32 | bs=32,lr=0.00015 | 6362.3 | 9.6 | 14.2 | 14.2 | | 30.2/40.0 | +| A100单机8卡(1x8) | fp32 | bs=32,lr=0.00015 | 10352.7 | 68.6 | 97.5 | 97.6 | 0.60 | 31.3/40.0 | diff --git a/training/nvidia/gpt2-pytorch/config/config_A100x1x1.py b/training/nvidia/gpt2-pytorch/config/config_A100x1x1.py new file mode 100755 index 000000000..1d910c1db --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_A100x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +gradient_accumulation_steps = 8 +max_steps = 15000 \ No newline at end of file diff --git a/training/nvidia/gpt2-pytorch/config/config_A100x1x8.py b/training/nvidia/gpt2-pytorch/config/config_A100x1x8.py new file mode 100755 index 000000000..27e5ca7f5 --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_A100x1x8.py @@ -0,0 +1 @@ +from config_common import * diff --git a/training/nvidia/gpt2-pytorch/config/config_A100x2x8.py b/training/nvidia/gpt2-pytorch/config/config_A100x2x8.py new file mode 100755 index 000000000..27e5ca7f5 --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_A100x2x8.py @@ -0,0 +1 @@ +from config_common import * diff --git a/training/nvidia/gpt2-pytorch/config/config_common.py b/training/nvidia/gpt2-pytorch/config/config_common.py new file mode 100755 index 000000000..47dcfb35d --- /dev/null +++ b/training/nvidia/gpt2-pytorch/config/config_common.py @@ -0,0 +1,6 @@ +vendor = 'nvidia' + +# disable fp16 +fp16 = False + +dist_backend = "nccl" diff --git a/training/nvidia/gpt2-pytorch/extern/.gitkeep b/training/nvidia/gpt2-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 81bf848d5..9d37dd8bc 100644 --- a/training/run_benchmarks/config/test_conf.py +++ 
b/training/run_benchmarks/config/test_conf.py @@ -75,4 +75,5 @@ # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "gpt2:pytorch_1.12:A100:1:8:1": "/raid/dataset/gpt2", } From 14bbdc0cc395bc3b9f21d7a2c3511752aa30e7d1 Mon Sep 17 00:00:00 2001 From: Jianbang Yang Date: Mon, 18 Sep 2023 14:35:53 +0800 Subject: [PATCH 08/18] Add T5-Small training model (#201) * add t5 small * t5_small use huggingface accelerate * fix coding style for t5_small model * update t5_small bs config * add MFU information in t5-small nvidia README * fix t5_small doc typo --- README.md | 2 +- training/benchmarks/t5_small/README.md | 68 +++++++++ .../t5_small/pytorch/config/__init__.py | 2 + .../t5_small/pytorch/config/_base.py | 46 ++++++ .../t5_small/pytorch/config/mutable_params.py | 5 + .../create_train_eval_data.py | 89 +++++++++++ .../t5_small/pytorch/dataloaders/__init__.py | 1 + .../pytorch/dataloaders/dataloader.py | 83 +++++++++++ .../t5_small/pytorch/model/__init__.py | 19 +++ .../t5_small/pytorch/optimizers/__init__.py | 27 ++++ .../t5_small/pytorch/run_pretraining.py | 137 +++++++++++++++++ .../t5_small/pytorch/schedulers/__init__.py | 11 ++ .../t5_small/pytorch/train/__init__.py | 0 .../t5_small/pytorch/train/evaluator.py | 84 +++++++++++ .../t5_small/pytorch/train/trainer.py | 141 ++++++++++++++++++ .../t5_small/pytorch/train/trainer_adapter.py | 9 ++ .../t5_small/pytorch/train/training_state.py | 41 +++++ training/nvidia/t5_small-pytorch/README.md | 57 +++++++ .../config/config_A100x1x1.py | 2 + .../config/config_A100x1x8.py | 2 + .../config/config_A100x2x8.py | 2 + .../t5_small-pytorch/config/requirements.txt | 8 + .../nvidia/t5_small-pytorch/extern/.gitkeep | 0 training/run_benchmarks/config/test_conf.py | 1 + 24 files changed, 836 insertions(+), 1 deletion(-) create mode 100644 training/benchmarks/t5_small/README.md create mode 100755 training/benchmarks/t5_small/pytorch/config/__init__.py create mode 100755 training/benchmarks/t5_small/pytorch/config/_base.py create mode 100755 training/benchmarks/t5_small/pytorch/config/mutable_params.py create mode 100644 training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py create mode 100644 training/benchmarks/t5_small/pytorch/dataloaders/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py create mode 100644 training/benchmarks/t5_small/pytorch/model/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/optimizers/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/run_pretraining.py create mode 100644 training/benchmarks/t5_small/pytorch/schedulers/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/train/__init__.py create mode 100644 training/benchmarks/t5_small/pytorch/train/evaluator.py create mode 100644 training/benchmarks/t5_small/pytorch/train/trainer.py create mode 100644 training/benchmarks/t5_small/pytorch/train/trainer_adapter.py create mode 100644 training/benchmarks/t5_small/pytorch/train/training_state.py create mode 100644 training/nvidia/t5_small-pytorch/README.md create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x1x1.py create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x1x8.py create mode 100644 training/nvidia/t5_small-pytorch/config/config_A100x2x8.py create mode 100644 training/nvidia/t5_small-pytorch/config/requirements.txt create mode 100644 
training/nvidia/t5_small-pytorch/extern/.gitkeep diff --git a/README.md b/README.md index 804d88c05..6aeab0301 100644 --- a/README.md +++ b/README.md @@ -290,7 +290,7 @@ under review表示对应case的支持已开发完毕,在review中;Incoming T5_small PyTorch - under review + Incoming N/A N/A diff --git a/training/benchmarks/t5_small/README.md b/training/benchmarks/t5_small/README.md new file mode 100644 index 000000000..2711dd125 --- /dev/null +++ b/training/benchmarks/t5_small/README.md @@ -0,0 +1,68 @@ + +## Model Introduction +### What is T5-Small(Text-To-Text Transfer Transformer)? +The developers of the Text-To-Text Transfer Transformer (T5) [write](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html): + +> With T5, we propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task. + +T5-Small is the checkpoint with 60 million parameters. + +- **Developed by:** Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. See [associated paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) and [GitHub repo](https://github.com/google-research/text-to-text-transfer-transformer#released-model-checkpoints) +- **Model type:** Language model +- **Language(s) (NLP):** English, French, Romanian, German +- **License:** Apache 2.0 +- **Related Models:** [All T5 Checkpoints](https://huggingface.co/models?search=t5) +- Resources for more information: + - [Research paper](https://jmlr.org/papers/volume21/20-074/20-074.pdf) + - [Google's T5 Blog Post](https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html) + - [GitHub Repo](https://github.com/google-research/text-to-text-transfer-transformer) + - [Hugging Face T5 Docs](https://huggingface.co/docs/transformers/model_doc/t5) + +## Model and Training Scripts source code +Pytorch case: +This repository includes software from https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization_no_trainer.py +licensed under the Apache License 2.0. + +Some of the files in this directory were modified by BAAI in 2023 to support FlagPerf. + +## Dataset and Model Checkpoints + +> Dataset website:https://huggingface.co/datasets/cnn_dailymail and https://github.com/abisee/cnn-dailymail + +> Model checkpoint website: https://huggingface.co/t5-small/tree/main + +We have already preprocessed the dataset and the model checkpoint files(The preprocessing script is `training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py`). +The preprocessed can be downloaded directly from https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/t5_small_train.tar. +No additional preprocessing steps need to be conducted. 
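+
+A minimal sketch of fetching and unpacking the archive (the URL is the one above; the destination directory is only an example and should match the `data_dir` you configure for this case):
+
+```bash
+# Download the preprocessed dataset + checkpoint archive and unpack it.
+wget https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/t5_small_train.tar
+tar -xf t5_small_train.tar -C /path/to/datasets   # hypothetical destination
+```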
+ +After decompressing, the dataset and model checkpoint files are organized as the following: + +``` +t5_small_train +├── dataset # dataset files +│ ├── eval_dataset.npz +│ └── train_dataset.npz +├── metrics # metrics for evaluation +│ └── rouge +│ └── rouge.py +├── model # model checkpoint and config files +│ ├── config.json +│ ├── generation_config.json +│ ├── model.safetensors +│ ├── spiece.model +│ ├── tokenizer.json +│ └── tokenizer_config.json +└── nltk_data # nltk data for evaluation + └── tokenizers + └── punkt +``` + +## Benchmark Task and Target Accuracy +This experiment is to finetune a summarization task on CNN/Daily Mail dataset with t5-small pretrained checkpoints. +After finetuning 3 epoches, the t5-small model is able to achieve a ROUGE-1 score of 41+, which matches the evaluation result on the [paper](https://arxiv.org/abs/1910.10683). + +## AI Frameworks && Accelerators supports + +| | Pytorch | Paddle | TensorFlow2 | +| ---------- | ------- | ------ | ----------- | +| Nvidia GPU | [✅](../../nvidia/t5_small-pytorch/README.md) | N/A | N/A | diff --git a/training/benchmarks/t5_small/pytorch/config/__init__.py b/training/benchmarks/t5_small/pytorch/config/__init__.py new file mode 100755 index 000000000..96e0aae70 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/config/__init__.py @@ -0,0 +1,2 @@ +from ._base import * +from .mutable_params import mutable_params diff --git a/training/benchmarks/t5_small/pytorch/config/_base.py b/training/benchmarks/t5_small/pytorch/config/_base.py new file mode 100755 index 000000000..f105c6d31 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/config/_base.py @@ -0,0 +1,46 @@ +# DO NOT MODIFY THESE REQUIRED PARAMETERS + +# Required parameters +vendor: str = None +data_dir: str = None +name: str = "t5_small" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True + +# Optional parameters + +# ========================================================= +# loss scale +# ========================================================= +lr: float = 5e-5 +weight_decay = 0.0 + +# ========================================================= +# train && evaluate +# ========================================================= +train_batch_size: int = 32 +eval_batch_size: int = 32 + +max_epoch: int = 3 +target_rouge1: float = 40.5 + +do_train = True +distributed: bool = True + +# ========================================================= +# utils +# ========================================================= +seed: int = 0 +dist_backend: str = 'nccl' +device: str = None + +# ========================================================= +# for driver +# ========================================================= +local_rank: int = -1 +use_env: bool = True +log_freq: int = 500 +print_freq: int = 500 +n_device: int = 1 +sync_bn: bool = False +gradient_accumulation_steps: int = 1 diff --git a/training/benchmarks/t5_small/pytorch/config/mutable_params.py b/training/benchmarks/t5_small/pytorch/config/mutable_params.py new file mode 100755 index 000000000..87649996a --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/config/mutable_params.py @@ -0,0 +1,5 @@ +mutable_params = [ + 'vendor', 'data_dir', 'lr', 'weight_decay', 'train_batch_size', + 'eval_batch_size', 'do_train', 'distributed', 'dist_backend', 'device', + 'cudnn_benchmark', 'cudnn_deterministic' +] diff --git a/training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py b/training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py new file mode 
100644 index 000000000..6bc0dff06 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/data_preprocessing/create_train_eval_data.py @@ -0,0 +1,89 @@ +import os + +import numpy as np +import datasets +from transformers import AutoTokenizer + + +def save_dataset(ds, save_path): + np.savez(save_path, + input_ids=ds['input_ids'], + attention_mask=ds['attention_mask'], + labels=ds['labels']) + + +def main(): + data_prefix = 't5_small_train/dataset' + os.makedirs(data_prefix, exist_ok=True) + train_datapath = os.path.join(data_prefix, 'train_dataset.npz') + eval_datapath = os.path.join(data_prefix, 'eval_dataset.npz') + + tokenizer = AutoTokenizer.from_pretrained('t5-small', + use_fast=True, + revision='main') + + raw_datasets = datasets.load_dataset('cnn_dailymail', '3.0.0') + + def preprocess_function(examples): + # remove pairs where at least one record is None + text_column = 'article' + summary_column = 'highlights' + prefix = 'summarize: ' + max_source_length = 1024 + max_target_length = 128 + ignore_pad_token_for_loss = True + padding = "max_length" + + inputs, targets = [], [] + for i in range(len(examples[text_column])): + if examples[text_column][i] and examples[summary_column][i]: + inputs.append(examples[text_column][i]) + targets.append(examples[summary_column][i]) + + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer(inputs, + max_length=max_source_length, + padding=padding, + truncation=True) + + # Tokenize targets with the `text_target` keyword argument + labels = tokenizer(text_target=targets, + max_length=max_target_length, + padding=padding, + truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if padding == "max_length" and ignore_pad_token_for_loss: + labels["input_ids"] = [[ + (l if l != tokenizer.pad_token_id else -100) for l in label + ] for label in labels["input_ids"]] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + train_dataset = raw_datasets["train"] + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + num_proc=32, + remove_columns=raw_datasets["train"].column_names, + load_from_cache_file=True, + desc="Running tokenizer on train dataset", + ).with_format('numpy') + save_dataset(train_dataset, train_datapath) + + eval_dataset = raw_datasets["validation"] + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + num_proc=32, + remove_columns=raw_datasets["validation"].column_names, + load_from_cache_file=True, + desc="Running tokenizer on train dataset", + ).with_format('numpy') + save_dataset(eval_dataset, eval_datapath) + + +if __name__ == "__main__": + main() diff --git a/training/benchmarks/t5_small/pytorch/dataloaders/__init__.py b/training/benchmarks/t5_small/pytorch/dataloaders/__init__.py new file mode 100644 index 000000000..83fa73435 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/dataloaders/__init__.py @@ -0,0 +1 @@ +from .dataloader import build_train_dataloader, build_eval_dataloader diff --git a/training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py b/training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py new file mode 100644 index 000000000..26f2e99bf --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/dataloaders/dataloader.py @@ -0,0 +1,83 @@ +import os +import numpy as np +import torch +from torch.utils.data import Dataset +from torch.utils.data.dataloader import default_collate + + +class T5Dataset(Dataset): + def 
__init__(self, filepath): + origin_data = np.load(filepath) + self.input_ids = origin_data['input_ids'] + self.attention_mask = origin_data['attention_mask'] + self.labels = origin_data['labels'] + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, idx): + sample = { + 'input_ids': self.input_ids[idx], + 'attention_mask': self.attention_mask[idx], + 'labels': self.labels[idx] + } + return sample + + +def _prepare_decoder_input_ids_from_labels(input_ids): + """ + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/t5/modeling_t5.py#L1800 + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/t5/modeling_t5.py#L851 + """ + decoder_start_token_id = 0 + pad_token_id = 0 + + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def my_collate(batch): + """ + https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/data/data_collator.py#L600 + """ + new_batch = default_collate(batch) + new_batch["decoder_input_ids"] = _prepare_decoder_input_ids_from_labels( + new_batch["labels"]) + return new_batch + + +def build_train_dataloader(config): + train_dataset = T5Dataset( + os.path.join(config.data_dir, 'dataset', 'train_dataset.npz')) + + data_loader = torch.utils.data.DataLoader( + train_dataset, + shuffle=True, + batch_size=config.train_batch_size, + collate_fn=my_collate) + return data_loader + + +def build_eval_dataloader(config): + eval_dataset = T5Dataset( + os.path.join(config.data_dir, 'dataset', 'eval_dataset.npz')) + + data_loader = torch.utils.data.DataLoader( + eval_dataset, batch_size=config.eval_batch_size, collate_fn=my_collate) + return data_loader + + +if __name__ == '__main__': + from collections import namedtuple + Config = namedtuple( + 'Config', + ['data_dir', 'distributed', 'train_batch_size', 'eval_batch_size']) + config = Config('t5_small_train/dataset', False, 4, 4) + eval_dataloader = build_eval_dataloader(config) + for i, batch in enumerate(eval_dataloader): + break diff --git a/training/benchmarks/t5_small/pytorch/model/__init__.py b/training/benchmarks/t5_small/pytorch/model/__init__.py new file mode 100644 index 000000000..8f3c84c32 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/model/__init__.py @@ -0,0 +1,19 @@ +import os +from transformers import T5Config, T5ForConditionalGeneration, T5TokenizerFast + + +def create_model(config): + model_path = os.path.join(config.data_dir, 'model') + hfconfig = T5Config.from_pretrained(model_path) + model = T5ForConditionalGeneration.from_pretrained(model_path, + config=hfconfig) + tokenizer = T5TokenizerFast.from_pretrained(model_path) + return model, hfconfig, tokenizer + + +if __name__ == '__main__': + + from collections import namedtuple + Config = namedtuple('Config', ['data_dir']) + config = Config('t5_small_train') + model, tokenizer = create_model(config) diff --git a/training/benchmarks/t5_small/pytorch/optimizers/__init__.py b/training/benchmarks/t5_small/pytorch/optimizers/__init__.py new file mode 100644 index 000000000..b5cdf1e3e --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/optimizers/__init__.py @@ -0,0 +1,27 @@ +import torch + + +def create_optimizer(model, args): + # Optimizer + # Split weights in two groups, one with weight decay and the other not. 
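+    # Parameters whose names contain any entry in no_decay (biases and the common
+    # LayerNorm weight spellings) get weight_decay=0.0; all other parameters use
+    # args.weight_decay.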
+ no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": + args.weight_decay, + }, + { + "params": [ + p for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": + 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr) + return optimizer diff --git a/training/benchmarks/t5_small/pytorch/run_pretraining.py b/training/benchmarks/t5_small/pytorch/run_pretraining.py new file mode 100644 index 000000000..a5d3feca0 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/run_pretraining.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +# 标准库 +import os +import sys +import time +from typing import Any, Tuple + +# 三方库 + +# benchmarks目录 append到sys.path +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, + "../../"))) # benchmarks目录 +# 本地库 +import config +from driver import Event, dist_pytorch +from driver.helper import InitHelper + +# 导入相关的模块、方法、变量。这里保持名称一致,实现可以不同。 +from train import trainer_adapter +from train.evaluator import Evaluator +from train.trainer import Trainer +from train.training_state import TrainingState +from dataloaders.dataloader import build_train_dataloader, build_eval_dataloader + +logger = None + + +def main() -> Tuple[Any, Any]: + global logger + global config + + # init + init_helper = InitHelper(config) + model_driver = init_helper.init_driver(globals(), locals()) + config = model_driver.config + dist_pytorch.init_dist_training_env(config) + dist_pytorch.barrier(config.vendor) + model_driver.event(Event.INIT_START) + + config.distributed = dist_pytorch.get_world_size() > 1 + # logger + logger = model_driver.logger + + train_dataloader = build_train_dataloader(config) + eval_dataloader = build_eval_dataloader(config) + + seed = config.seed + + init_helper.set_seed(seed, model_driver.config.vendor) + + # 创建TrainingState对象 + training_state = TrainingState() + + # 构建 trainer:依赖 evaluator、TrainingState对象 + evaluator = Evaluator(config) + trainer = Trainer(driver=model_driver, + adapter=trainer_adapter, + evaluator=evaluator, + training_state=training_state, + device=config.device, + config=config) + training_state._trainer = trainer + + # 设置分布式环境, trainer init() + dist_pytorch.barrier(config.vendor) + train_dataloader, eval_dataloader = trainer.init(train_dataloader, + eval_dataloader) + dist_pytorch.barrier(config.vendor) + + # evaluation统计 + init_evaluation_start = time.time() # evaluation起始时间,单位为秒 + + trainer.evaluate(trainer.model, eval_dataloader, device=trainer.device) + + init_evaluation_end = time.time() # evaluation结束时间,单位为秒 + + init_evaluation_info = dict(time=init_evaluation_end - + init_evaluation_start) + + model_driver.event(Event.INIT_EVALUATION, init_evaluation_info) + + if not config.do_train: + return config, training_state + + model_driver.event(Event.INIT_END) + + # TRAIN_START + dist_pytorch.barrier(config.vendor) + model_driver.event(Event.TRAIN_START) + train_start_time = time.time() + + # 训练过程 + epoch = 0 + while not training_state.end_training: + training_state.epoch = epoch + trainer.train_one_epoch(train_dataloader, eval_dataloader) + epoch += 1 + + # TRAIN_END事件 + training_state.traintime = time.time() - train_start_time + model_driver.event(Event.TRAIN_END) + + return 
config, training_state + + +if __name__ == "__main__": + start = time.time() + config_update, state = main() + if not dist_pytorch.is_main_process(): + sys.exit(0) + + # 训练信息写日志 + e2e_time = time.time() - start + if config_update.do_train: + + finished_info = { + "e2e_time": e2e_time, + "train_time": state.traintime, + "train_no_eval_time": state.noevaltime, + "pure_training_computing_time": state.purecomputetime, + "throughput(ips)_raw": state.num_trained_samples / state.traintime, + "throughput(ips)_no_eval": + state.num_trained_samples / state.noevaltime, + "throughput(ips)_pure_compute": + state.num_trained_samples / state.purecomputetime, + "converged": state.converged, + "rouge1": state.rouge1, + "rouge2": state.rouge2, + "rougeL": state.rougeL, + "rougeLsum": state.rougeLsum, + } + else: + finished_info = {"e2e_time": e2e_time} + logger.log(Event.FINISHED, message=finished_info, stacklevel=0) diff --git a/training/benchmarks/t5_small/pytorch/schedulers/__init__.py b/training/benchmarks/t5_small/pytorch/schedulers/__init__.py new file mode 100644 index 000000000..06d6fb0d1 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/schedulers/__init__.py @@ -0,0 +1,11 @@ +from transformers import get_scheduler + + +def create_scheduler(optimizer, train_dataloader, args): + lr_scheduler = get_scheduler( + name='linear', + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=len(train_dataloader) * args.max_epoch, + ) + return lr_scheduler diff --git a/training/benchmarks/t5_small/pytorch/train/__init__.py b/training/benchmarks/t5_small/pytorch/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/training/benchmarks/t5_small/pytorch/train/evaluator.py b/training/benchmarks/t5_small/pytorch/train/evaluator.py new file mode 100644 index 000000000..174f54e69 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/evaluator.py @@ -0,0 +1,84 @@ +import os + +import nltk +import numpy as np +import evaluate +import torch +import torch.distributed as dist + + +def postprocess_text(preds, labels): + """ + https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization.py#L621 + """ + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] + + return preds, labels + + +def pad_across_processes(config, preds, labels, tokenizer): + if not config.distributed: + return preds, labels + + max_pred_len = torch.tensor(preds.shape[1], + dtype=torch.int64, + device=config.device) + dist.all_reduce(max_pred_len, dist.ReduceOp.MAX) + max_pred_len = int(max_pred_len) + + if max_pred_len > preds.shape[1]: + pad_index = tokenizer.pad_token_id + new_preds = preds.new_zeros(preds.shape[0], max_pred_len) + pad_index + new_preds[:, :preds.shape[1]] = preds + preds = new_preds + + all_preds = [preds.clone() for _ in range(dist.get_world_size())] + dist.all_gather(all_preds, preds) + + all_labels = [labels.clone() for _ in range(dist.get_world_size())] + dist.all_gather(all_labels, labels) + + return torch.cat(all_preds, dim=0), torch.cat(all_labels, dim=0) + + +class Evaluator: + """Evaluator""" + def __init__(self, config): + self.config = config + nltk.data.path.append(os.path.join(config.data_dir, 'nltk_data')) + self.metric_path = os.path.join(config.data_dir, 'metrics', 'rouge', + 'rouge.py') + self.reset() + + 
def reset(self): + self.metric = evaluate.load(self.metric_path) + + def add_batch(self, tokenizer, preds, labels): + preds, labels = pad_across_processes(self.config, preds, labels, + tokenizer) + + preds = preds.cpu().numpy() + labels = labels.cpu().numpy() + + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = tokenizer.batch_decode(labels, + skip_special_tokens=True) + + decoded_preds, decoded_labels = postprocess_text( + decoded_preds, decoded_labels) + self.metric.add_batch( + predictions=decoded_preds, + references=decoded_labels, + ) + + def compute_acc(self): + result = self.metric.compute(use_stemmer=True) + result = {k: round(v * 100, 4) for k, v in result.items()} + return result diff --git a/training/benchmarks/t5_small/pytorch/train/trainer.py b/training/benchmarks/t5_small/pytorch/train/trainer.py new file mode 100644 index 000000000..ba805aa0c --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/trainer.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +import time +import torch +import torch.utils.data +from torch.types import Device +import os +import sys +import torch.distributed as dist +from accelerate import Accelerator + +from model import create_model +from optimizers import create_optimizer +from schedulers import create_scheduler +from train.evaluator import Evaluator +from train.training_state import TrainingState + +CURR_PATH = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) +from driver import Driver, Event, dist_pytorch + + +class Trainer: + def __init__(self, driver: Driver, adapter, evaluator: Evaluator, + training_state: TrainingState, device: Device, config): + super(Trainer, self).__init__() + self.driver = driver + self.adapter = adapter + self.training_state = training_state + self.device = device + self.config = config + self.evaluator = evaluator + + def init(self, train_dataloader, eval_dataloader): + device = torch.device(self.config.device) + dist_pytorch.main_proc_print("Init progress:") + self.model, self.model_config, self.tokenizer = create_model( + self.config) + self.model.to(self.device) + + self.model = self.adapter.convert_model(self.model) + + self.optimizer = create_optimizer(self.model, self.config) + self.lr_scheduler = create_scheduler(self.optimizer, train_dataloader, + self.config) + + self.accelerator = Accelerator() + self.model, self.optimizer, train_dataloader, eval_dataloader, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, train_dataloader, eval_dataloader, + self.lr_scheduler) + + return train_dataloader, eval_dataloader + + def process_batch(self, batch, device: Device): + """Process batch and produce inputs for the model.""" + for k, v in batch.items(): + batch[k] = v.to(device, non_blocking=True) + return batch + + def train_one_epoch(self, train_dataloader, eval_dataloader): + + model = self.model + optimizer = self.optimizer + data_loader = train_dataloader + device = self.device + epoch = self.training_state.epoch + print("Epoch " + str(epoch + 1)) + + model.train() + noeval_start_time = time.time() + + for step, batch in enumerate(data_loader): + batch = self.process_batch(batch, device) + + pure_start_time = time.time() + + outputs = model(**batch) + loss = outputs.loss + + self.accelerator.backward(loss) + 
optimizer.step() + self.lr_scheduler.step() + optimizer.zero_grad() + + if step % self.config.log_freq == 0: + print("Train Step " + str(step) + "/" + str(len(data_loader)) + + ", Loss : " + str(float(loss))) + + self.training_state.purecomputetime += time.time( + ) - pure_start_time + + self.training_state.noevaltime += time.time() - noeval_start_time + + eval_result = self.evaluate(self.model, + eval_dataloader, + device=self.device) + + state = self.training_state + config = self.config + + state.rouge1, state.rouge2, state.rougeL, state.rougeLsum = eval_result.values( + ) + if state.rouge1 >= config.target_rouge1: + dist_pytorch.main_proc_print( + f"converged_success. eval_rouge1: {state.rouge1}, target_rouge1: {config.target_rouge1}" + ) + state.converged_success() + + if epoch + 1 >= config.max_epoch: + state.end_training = True + state.num_trained_samples += len(data_loader.dataset) + + def evaluate(self, model, data_loader, device): + self.model.eval() + self.evaluator.reset() + for step, batch in enumerate(data_loader): + if step % self.config.log_freq == 0: + print("Eval Step " + str(step) + "/" + str(len(data_loader))) + batch = self.process_batch(batch, device) + input_ids, labels = batch['input_ids'], batch['labels'] + + # https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization.py#L178 + # https://github.com/huggingface/transformers/blob/v4.31.0/examples/pytorch/summarization/run_summarization.py#L651C2-L655 + # according to huggingface run_summarization.py, max_length is 128, num_beams is 1 + def _unwrap_model(model): + if hasattr(model, "module"): + return _unwrap_model(model.module) + else: + return model + + model = _unwrap_model(model) + output = model.generate(input_ids, + max_length=128, + num_beams=self.model_config.num_beams) + self.evaluator.add_batch(self.tokenizer, output, labels) + + result = self.evaluator.compute_acc() + dist_pytorch.main_proc_print(result) + + return result diff --git a/training/benchmarks/t5_small/pytorch/train/trainer_adapter.py b/training/benchmarks/t5_small/pytorch/train/trainer_adapter.py new file mode 100644 index 000000000..ff46be1b8 --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/trainer_adapter.py @@ -0,0 +1,9 @@ +# Copyright (c) 2023 BAAI. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License") +from torch import nn + + +def convert_model(model: nn.Module) -> nn.Module: + """convert_model""" + return model diff --git a/training/benchmarks/t5_small/pytorch/train/training_state.py b/training/benchmarks/t5_small/pytorch/train/training_state.py new file mode 100644 index 000000000..dbecce66f --- /dev/null +++ b/training/benchmarks/t5_small/pytorch/train/training_state.py @@ -0,0 +1,41 @@ +# Copyright (c) 2023 BAAI. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+from dataclasses import dataclass
+
+
+@dataclass
+class TrainingState:
+    """TrainingState dataclass"""
+    _trainer = None
+    _status = 'aborted'  # later set to 'success' if termination criteria met
+
+    global_steps = 0
+
+    loss: float = 0.0
+    rouge1: float = 0.0
+    rouge2: float = 0.0
+    rougeL: float = 0.0
+    rougeLsum: float = 0.0
+
+    epoch: int = 1
+
+    end_training: bool = False
+    converged: bool = False
+
+    traintime = 0.0
+    noevaltime = 0.0
+    purecomputetime = 0.0
+
+    num_trained_samples = 0
+
+    def status(self):
+        """get status"""
+        if self.converged:
+            self._status = "success"
+        return self._status
+
+    def converged_success(self):
+        """converged success"""
+        self.end_training = True
+        self.converged = True
diff --git a/training/nvidia/t5_small-pytorch/README.md b/training/nvidia/t5_small-pytorch/README.md
new file mode 100644
index 000000000..b27ae4b3d
--- /dev/null
+++ b/training/nvidia/t5_small-pytorch/README.md
@@ -0,0 +1,57 @@
+### 1. 下载数据集和模型
+[下载链接](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/t5_small_train.tar)
+
+### 2. 设置test_conf.py
+
+为了使得`training/nvidia/t5_small-pytorch/config/requirements.txt`里的依赖库均能被下载,需要将`training/run_benchmarks/config/test_conf.py`里的`PIP_SOURCE`的值修改为`https://pypi.tuna.tsinghua.edu.cn/simple`
+
+### 3. Nvidia GPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+  - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB
+  - 多机网络类型、带宽: InfiniBand,200Gb/s
+- ##### 软件环境
+  - OS版本:Ubuntu 20.04
+  - OS kernel版本: 5.4.0-113-generic
+  - 加速卡驱动版本:470.129.06
+  - Docker 版本:20.10.16
+  - 训练框架版本:pytorch-1.8.0a0+52ea372
+  - 依赖软件版本:
+    - cuda: 11.4
+
+### 运行情况
+
+* 通用指标
+
+| 指标名称 | 指标值 | 特殊说明 |
+| -------------- | ----------------------- | ------------------------------------- |
+| 任务类别 | Summarization | |
+| 模型 | t5_small | |
+| 数据集 | CNN/Daily Mail | |
+| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 |
+| 硬件设备简称 | nvidia A100 | |
+| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB |
+| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 |
+| 总吞吐量 | p_whole,见“性能指标” | 实际样本数除以总时间(performance_whole) |
+| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 |
+| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) |
+| 训练结果 | rouge1,见“性能指标” | rouge1分数 |
+| 训练结果 | rouge2,见“性能指标” | rouge2分数 |
+| 训练结果 | rougeL,见“性能指标” | rougeL分数 |
+| 训练结果 | rougeLsum,见“性能指标” | rougeLsum分数 |
+| 额外修改项 | 无 | |
+
+* 性能指标
+
+| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | rouge1 | rouge2 | rougeL | rougeLsum | mem |
+| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- |
+| A100单机单卡(1x1) | fp32 | / | | | | | | | | | |
+| A100单机8卡(1x8) | fp32 | / | 996.11 | 338 | 398 | 400 | 41.12 | 18.84 | 29.15 | 38.32 | 35.3/40.0 |
+| A100两机8卡(2x8) | fp32 | / | | | | | | | | | |
+
+注意: T5模型MFU数值较低, 为11.8%
+1x8训练的MFU计算过程如下:
+`MFU = 400.26068691305795 * 1024 * (60 * 10^6) * 6 / (156 * 1000^4) / 8 = 11.8%`
+
+其中, 1024为seq_len, 60 million为参数量, (156 * 1000^4)为A100 tf32算力
+
diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py b/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py
new file mode 100644
index 000000000..c1a1569cc
--- /dev/null
+++ b/training/nvidia/t5_small-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,2 @@
+train_batch_size = 32
+eval_batch_size = 32
diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py b/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py
new file mode 100644
index 000000000..c1a1569cc
--- /dev/null
+++ 
b/training/nvidia/t5_small-pytorch/config/config_A100x1x8.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +eval_batch_size = 32 diff --git a/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py b/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py new file mode 100644 index 000000000..c1a1569cc --- /dev/null +++ b/training/nvidia/t5_small-pytorch/config/config_A100x2x8.py @@ -0,0 +1,2 @@ +train_batch_size = 32 +eval_batch_size = 32 diff --git a/training/nvidia/t5_small-pytorch/config/requirements.txt b/training/nvidia/t5_small-pytorch/config/requirements.txt new file mode 100644 index 000000000..9ee08e5d1 --- /dev/null +++ b/training/nvidia/t5_small-pytorch/config/requirements.txt @@ -0,0 +1,8 @@ +transformers==4.31.0 +evaluate==0.4.0 +datasets==2.14.4 +accelerate==0.21.0 +tokenizers==0.13.3 +nltk==3.8.1 +absl-py==1.4.0 +rouge-score==0.1.2 diff --git a/training/nvidia/t5_small-pytorch/extern/.gitkeep b/training/nvidia/t5_small-pytorch/extern/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 9d37dd8bc..700f0c65a 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -75,5 +75,6 @@ # "transformer:pytorch_1.13:A100:1:8:1": "/raid/home_datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "t5_small:pytorch_1.8:A100:1:8:1": "/home/datasets_ckpt/t5_small_train", # "gpt2:pytorch_1.12:A100:1:8:1": "/raid/dataset/gpt2", } From 5f9e761b3c112283fdd1b335e6745ca43d72d210 Mon Sep 17 00:00:00 2001 From: clveryang <50865584+clveryang@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:42:16 +0800 Subject: [PATCH 09/18] iluvatar_infer_resnet50 (#259) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 杨智超 --- inference/benchmarks/resnet50/README.md | 2 +- inference/inference_engine/iluvatar/ixrt.py | 193 +++++++++++--------- 2 files changed, 103 insertions(+), 92 deletions(-) diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index aaf3c14fa..0eee2b55f 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -120,6 +120,6 @@ find ./val -name "*JPEG" | wc -l | tensorrt | fp16 | 256 |613.4 | 1358.9 | 4469.4 | 1391.4 | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 | | tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 | | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 | -| ixrt | fp16 | 256 | 136.4 | / | / | 1146.6 | 2679.9 | 11.5% | 76.2 | 4.3/32.0 | +| ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 | | kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / | diff --git a/inference/inference_engine/iluvatar/ixrt.py b/inference/inference_engine/iluvatar/ixrt.py index 620cc32f3..44fc85c4b 100644 --- a/inference/inference_engine/iluvatar/ixrt.py +++ b/inference/inference_engine/iluvatar/ixrt.py @@ -1,10 +1,13 @@ -from ixrt import IxRT, RuntimeConfig, RuntimeContext -import torch import os -import subprocess -from loguru import logger +import torch +from torch import autocast +import tensorrt as trt + import numpy as np +import pycuda.driver as cuda +import pycuda.autoinit import time +import subprocess class InferModel: @@ -16,114 +19,122 @@ def __init__(self, 
host_mem, device_mem): self.device = device_mem def __str__(self): - return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + return "Host:\n" + str(self.host) + "\nDevice:\n" + str( + self.device) def __repr__(self): return self.__str__() def __init__(self, config, onnx_path, model): - self.str_to_numpy_dict = { - "int32": np.int32, - "float16": np.float16, - "float32": np.float32, - } - self.engine = self.build_engine(config, onnx_path) - self.outputs = self.allocate_buffers(self.engine) - - def config_init_engine(self, config, onnx_path): - quant_file = None - - runtime_config = RuntimeConfig() + self.config = config - input_shapes = [config.batch_size, 3, config.image_size, config.image_size] - runtime_config.input_shapes = [("input", input_shapes)] - runtime_config.device_idx = 0 + self.logger = trt.Logger(trt.Logger.WARNING) + self.runtime = trt.Runtime(self.logger) - precision = "float16" - if precision == "int8": - assert quant_file, "Quant file must provided for int8 inferencing." - - runtime_config.runtime_context = RuntimeContext( - precision, - "nhwc", - use_gpu=True, - pipeline_sync=True, - input_types=config.input_types, - output_types=config.output_types, - input_device="gpu", - output_device="gpu", - ) + self.engine = self.build_engine(config, onnx_path) - runtime = IxRT.from_onnx(onnx_path, quant_file, runtime_config) - return runtime + self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers( + self.engine) + + self.context = self.engine.create_execution_context() + self.numpy_to_torch_dtype_dict = { + bool: torch.bool, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, + } + self.str_to_torch_dtype_dict = { + "bool": torch.bool, + "uint8": torch.uint8, + "int8": torch.int8, + "int16": torch.int16, + "int32": torch.int32, + "int64": torch.int64, + "float16": torch.float16, + "float32": torch.float32, + "float64": torch.float64, + "complex64": torch.complex64, + "complex128": torch.complex128, + } def build_engine(self, config, onnx_path): if config.exist_compiler_path is None: - output_path = config.log_dir + "/" + config.ixrt_tmp_path + trt_path = config.log_dir + "/" + config.ixrt_tmp_path - dir_output_path = os.path.dirname(output_path) - os.makedirs(dir_output_path, exist_ok=True) + dir_trt_path = os.path.dirname(trt_path) + os.makedirs(dir_trt_path, exist_ok=True) time.sleep(10) - runtime = self.config_init_engine(config, onnx_path) - print(f"Build Engine File: {output_path}") - runtime.BuildEngine() - runtime.SerializeEngine(output_path) - print("Build Engine done!") + trtexec_cmd = "ixrtexec --onnx=" + onnx_path + " --save_engine=" + trt_path + if config.fp16: + trtexec_cmd += " --precision fp16" + if config.has_dynamic_axis: + trtexec_cmd += " --minShapes=" + config.minShapes + trtexec_cmd += " --optShapes=" + config.optShapes + trtexec_cmd += " --maxShapes=" + config.maxShapes + + p = subprocess.Popen(trtexec_cmd, shell=True) + p.wait() else: - output_path = config.exist_compiler_path - print(f"Use existing engine: {output_path}") + trt_path = config.exist_compiler_path - runtime = IxRT() - runtime.LoadEngine(output_path, config.batch_size) - return runtime + with open(trt_path, "rb") as f: + return self.runtime.deserialize_cuda_engine(f.read()) def allocate_buffers(self, engine): - output_map = 
engine.GetOutputShape() - output_io_buffers = [] - output_types = {} - config = engine.GetConfig() - for key, val in config.runtime_context.output_types.items(): - output_types[key] = str(val) - for name, shape in output_map.items(): - # 1. apply memory buffer for output of the shape - buffer = np.zeros( - shape.dims, dtype=self.str_to_numpy_dict[output_types[name]] - ) - buffer = torch.tensor(buffer).cuda() - # 2. put the buffer to a list - output_io_buffers.append([name, buffer, shape]) + inputs = [] + outputs = [] + bindings = [] + stream = cuda.Stream() + + for binding in range(engine.num_bindings): + size = trt.volume(engine.get_binding_shape(binding)) + dtype = trt.nptype(engine.get_binding_dtype(binding)) + + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + bindings.append(int(device_mem)) + + if engine.binding_is_input(binding): + inputs.append(self.HostDeviceMem(host_mem, device_mem)) + else: + outputs.append(self.HostDeviceMem(host_mem, device_mem)) - engine.BindIOBuffers(output_io_buffers) - return output_io_buffers + return inputs, outputs, bindings, stream def __call__(self, model_inputs: list): - batch_size = np.unique(np.array([i.size(dim=0) for i in model_inputs])) - batch_size = batch_size[0] - input_map = self.engine.GetInputShape() - input_io_buffers = [] for i, model_input in enumerate(model_inputs): - model_input = torch.tensor(model_input.numpy(), dtype=torch.float32).cuda() - if not model_input.is_contiguous(): - model_input = model_input.contiguous() - name, shape = list(input_map.items())[0] - _shape, _padding = shape.dims, shape.padding - _shape = [i + j for i, j in zip(_shape, _padding)] - _shape = [_shape[0], *_shape[2:], _shape[1]] - input_io_buffers.append([name, model_input, shape]) - - self.engine.BindIOBuffers(self.outputs) - self.engine.LoadInput(input_io_buffers) - - # torch.cuda.synchronize() - self.engine.Execute() - # torch.cuda.synchronize() - - gpu_io_buffers = [] - for buffer in self.outputs: - # gpu_io_buffers.append([buffer[0], buffer[1], buffer[2]]) - gpu_io_buffers.append(buffer[1]) - - return gpu_io_buffers, 0 + model_input = model_input.cuda() + + cuda.memcpy_dtod_async( + self.inputs[i].device, + model_input.data_ptr(), + model_input.element_size() * model_input.nelement(), + self.stream, + ) + + self.context.execute_async_v2(bindings=self.bindings, + stream_handle=self.stream.handle) + result = [] + for out in self.outputs: + out_tensor = torch.empty(out.host.shape, device="cuda").to( + self.str_to_torch_dtype_dict[str(out.host.dtype)]) + cuda.memcpy_dtod_async( + out_tensor.data_ptr(), + out.device, + out_tensor.element_size() * out_tensor.nelement(), + self.stream, + ) + result.append(out_tensor) + + self.stream.synchronize() + return result, 0 From aa4a3d7f9e20d5faac861f693aa9cdc09729c8a3 Mon Sep 17 00:00:00 2001 From: feldmanshan <145551134+feldmanshan@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:53:21 +0800 Subject: [PATCH 10/18] zixiao:add resnet50 inference configs && results (#256) * zixiao:add resnet50 inference configs && results * zixiao: modify resnet50 config & add log file * zixiao: remote log file * zixiao: fix resnet50 inference result --- inference/benchmarks/resnet50/README.md | 19 ++ .../vendor_config/zixiao_configurations.yaml | 4 + .../docker_images/zixiao/pytorch/Dockerfile | 85 ++++++ .../zixiao/pytorch/packages/README.md | 5 + .../zixiao/pytorch/pytorch_install.sh | 6 + .../zixiao/pytorch/sdk_installers/README.md | 8 + .../docker_images/zixiao/zixiao_analysis.py | 
16 ++ .../docker_images/zixiao/zixiao_monitor.py | 256 ++++++++++++++++++ inference/inference_engine/zixiao/zxrt.py | 114 ++++++++ inference/tools/torch_sync.py | 4 + 10 files changed, 517 insertions(+) create mode 100644 inference/configs/resnet50/vendor_config/zixiao_configurations.yaml create mode 100644 inference/docker_images/zixiao/pytorch/Dockerfile create mode 100644 inference/docker_images/zixiao/pytorch/packages/README.md create mode 100644 inference/docker_images/zixiao/pytorch/pytorch_install.sh create mode 100644 inference/docker_images/zixiao/pytorch/sdk_installers/README.md create mode 100644 inference/docker_images/zixiao/zixiao_analysis.py create mode 100644 inference/docker_images/zixiao/zixiao_monitor.py create mode 100755 inference/inference_engine/zixiao/zxrt.py diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index 0eee2b55f..5bd96adb3 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -96,6 +96,24 @@ find ./val -name "*JPEG" | wc -l - IXRT: ixrt-0.4.0+corex.3.2.0 +#### 2.5 腾讯紫霄 C100 + +- ##### 硬件环境 + - 机器、加速卡型号: C100 + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.15.0-78-generic + - 加速卡驱动版本:2.4.12 + - Docker 版本:24.0.4 + - 依赖软件版本: + - pytorch: 1.13.0+cpu + - onnx: 1.14.0 + +- 推理工具包 + + - zxrt 2.4.12 + ### 3. 运行情况 * 指标列表 @@ -122,4 +140,5 @@ find ./val -name "*JPEG" | wc -l | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 | | ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 | | kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / | +| zixiao | fp16 | 192 | 380 | / | / | 1528.373 | 5853.771 | / | 76.2/76.2 | / | diff --git a/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml new file mode 100644 index 000000000..14071e7ec --- /dev/null +++ b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml @@ -0,0 +1,4 @@ +compiler: zxrt +no_validation: true +batch_size: 192 +exist_onnx_path: onnxs/resnet50_pytorch.onnx \ No newline at end of file diff --git a/inference/docker_images/zixiao/pytorch/Dockerfile b/inference/docker_images/zixiao/pytorch/Dockerfile new file mode 100644 index 000000000..ec332e343 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/Dockerfile @@ -0,0 +1,85 @@ +FROM amd64/ubuntu:20.04 + +LABEL VERSION="1.1.9" + +ARG DEBIAN_FRONTEND=noninteractive +USER root +WORKDIR /tmp +ARG MOFED_VER=5.4-3.1.0.0 +RUN if [ $MOFED_VER ]; then echo "MOFED_VER=$MOFED_VER" & exit 0;else echo "no MOFED_VER, please check" & sleep 1 & exit 1; fi + + +ARG WHICH_MIRROR=mirrors.cloud.tencent.com/repo + +# change source to tencent cloud +RUN sed -i 's#http://archive.ubuntu.com/ubuntu/#http://mirrors.cloud.tencent.com/ubuntu/#g' /etc/apt/sources.list && \ + sed -i 's#http://security.ubuntu.com/ubuntu/#http://mirrors.cloud.tencent.com/ubuntu/#g' /etc/apt/sources.list && \ + apt update && apt -yq install apt-transport-https wget +RUN wget -O /etc/apt/sources.list http://${WHICH_MIRROR}/ubuntu20_sources.list && apt update -yq +RUN mkdir /root/.pip && echo "[global]\nindex- url = https://mirrors.cloud.tencent.com/pypi/simple\ntrusted-host = mirrors.cloud.tencent.com" > /root/.pip/pip.conf + +################################ BASIC LIBRARY ################################# +# install packages +RUN apt-get update && apt-get install -yq apt-utils 
sudo vim curl \ + autoconf automake dialog libtool pkg-config libffi-dev \ + libexpat1-dev libpciaccess-dev libxml2-dev \ + bison flex xutils-dev zlib1g-dev ninja-build git locate \ + zip unzip g++ \ + # install ssh + openssh-server openssh-client \ + # fix hexdump missing issue + bsdmainutils \ + # fix header missing for tensorflow verbs support + libibverbs-dev \ + #install default python3.8 + python3 python3-pip python3-dev python3-tk libjpeg-dev \ + # RMA dependency library + graphviz dpatch swig gfortran chrpath tk tcl libnl-3-200 libnl-route-3-dev lsof \ + libnl-3-dev libgfortran5 libnl-route-3-200 ethtool libnuma1 libnuma-dev udev \ + # ECCL dependency library + libncurses5-dev hwloc libhwloc-dev libhwloc-common libboost-all-dev libevent-dev python2-dev && \ + apt-get clean + + +# Set timezone +RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime +RUN echo 'Asia/Shanghai' >/etc/timezone + +# Install miniconda +# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431 +# RUN curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \ +RUN wget -O ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \ + chmod +x ~/miniconda.sh && \ + bash ~/miniconda.sh -b -p /root/miniconda && \ + rm ~/miniconda.sh && \ + /root/miniconda/bin/conda config --set show_channel_urls yes && \ + /root/miniconda/bin/conda create --name python38 python=3.8 -y && \ + /root/miniconda/bin/conda clean -ya + +# hyperparamer, typing_extensions, numpy requests +RUN /root/miniconda/envs/python38/bin/pip install \ + --no-cache-dir \ + -i https://pypi.tuna.tsinghua.edu.cn/simple \ + hyperparameter \ + typing_extensions \ + numpy \ + requests \ + onnx \ + onnxruntime \ + attrs \ + regex \ + decorator \ + loguru \ + schedule \ + munch \ + pyyaml \ + tqdm \ + scipy + +RUN /root/miniconda/envs/python38/bin/pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu + +ENV PATH /root/miniconda/envs/python38/bin:$PATH +ENV PATH=/usr/local/zx-smi/zx-smi-1.20.0:$PATH + + + diff --git a/inference/docker_images/zixiao/pytorch/packages/README.md b/inference/docker_images/zixiao/pytorch/packages/README.md new file mode 100644 index 000000000..6be446920 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/packages/README.md @@ -0,0 +1,5 @@ +# 以下软件包需联系腾讯蓬莱实验室获取 + +>联系邮箱: feldmanshan@tencent.com + +TopsInference-2.4.12-py3.8-none-any.wh diff --git a/inference/docker_images/zixiao/pytorch/pytorch_install.sh b/inference/docker_images/zixiao/pytorch/pytorch_install.sh new file mode 100644 index 000000000..9f1433527 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/pytorch_install.sh @@ -0,0 +1,6 @@ +#!/bin/bash +pip3 install ./packages/TopsInference-2.4.12-py3.8-none-any.whl + +dpkg -i ./sdk_installers/topsruntime_2.4.12-1_amd64.deb +dpkg -i ./sdk_installers/tops-sdk_2.4.12-1_amd64.deb + diff --git a/inference/docker_images/zixiao/pytorch/sdk_installers/README.md b/inference/docker_images/zixiao/pytorch/sdk_installers/README.md new file mode 100644 index 000000000..644527273 --- /dev/null +++ b/inference/docker_images/zixiao/pytorch/sdk_installers/README.md @@ -0,0 +1,8 @@ +# 以下软件包需联系腾讯蓬莱实验室获取 + +>联系邮箱: feldmanshan@tencent.com + +zixiao-x86_64-gcc-2.4.12.run + +topsruntime_2.4.12-1_amd64.deb +tops-sdk_2.4.12-1_amd64.deb \ No newline at end of file diff --git a/inference/docker_images/zixiao/zixiao_analysis.py 
b/inference/docker_images/zixiao/zixiao_analysis.py new file mode 100644 index 000000000..d23cf3f8b --- /dev/null +++ b/inference/docker_images/zixiao/zixiao_analysis.py @@ -0,0 +1,16 @@ +def analysis_log(logpath): + logfile = open(logpath) + + max_usage = 0.0 ## usage_mem + max_mem = 16.0 + for line in logfile.readlines(): + ''' + zx-smi pwr DTemp MUsed Mem + ''' + if "zx-smi" in line: + line = line[:-1] + usage = line.split(" ")[3] + usage = float(usage)*16/100 + max_usage = max(max_usage, usage) + return round(max_usage, 2), max_mem, eval("30e12"), eval("120e12") + diff --git a/inference/docker_images/zixiao/zixiao_monitor.py b/inference/docker_images/zixiao/zixiao_monitor.py new file mode 100644 index 000000000..af1f857c9 --- /dev/null +++ b/inference/docker_images/zixiao/zixiao_monitor.py @@ -0,0 +1,256 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. + ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + cmd = "zx-smi -dmon -s mp -i 0 -c 1 | grep '16'|awk '{print $3,$4,$9,$10}'" ## pwr DTemp MUsed Mem + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n zx-smi " + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, 
e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/zixiao_monitor.pid') + log_fn = str(log_path + '/zixiao_monitor.log') + err_fn = str(log_path + '/zixiao_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/zixiao_monitor.log') + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' 
% pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/inference/inference_engine/zixiao/zxrt.py b/inference/inference_engine/zixiao/zxrt.py new file mode 100755 index 000000000..a2ceacf1c --- /dev/null +++ b/inference/inference_engine/zixiao/zxrt.py @@ -0,0 +1,114 @@ +import onnx +import onnxruntime +import torch +import os +import subprocess +from loguru import logger +import numpy as np +import time +import TopsInference + +def type2dtype(types): + dtypes = [] + for elem_type in types: + if elem_type == 1: + dtypes.append(TopsInference.DT_FLOAT32) + elif elem_type == 7: + dtypes.append(TopsInference.DT_INT64) + elif elem_type == 6: + dtypes.append(TopsInference.DT_INT32) + elif elem_type == 3: + dtypes.append(TopsInference.DT_INT8) + elif elem_type == 4: + dtypes.append(TopsInference.DT_UINT8) + elif elem_type == 9: + dtypes.append(TopsInference.DT_BOOL) + elif elem_type == 10: + dtypes.append(TopsInference.DT_FLOAT16) + else: + raise Exception("unknown default dtypes:{}, {}".format(elem_type)) + return dtypes + +class InferModel: + + def __init__(self, config, onnx_path, model): + self.input_names = [] + self.engine = self.build_engine(config, onnx_path) + self.test_index = 0 + self.batch_size = config.batch_size + self.zixiao_VG_num = 6 + + def build_engine(self, config, onnx_path): + self.handler = TopsInference.set_device(0, -1) + onnx_model = onnx.load(onnx_path) + self.input_shapes = [] + self.input_dtype = [] + for input in onnx_model.graph.input: + input_shape = input.type.tensor_type.shape.dim + input_shape = [a.dim_value for a in input_shape] + input_shape[0] = config.batch_size // 6 + input_name = input.name + self.input_names.append(input_name) + self.input_shapes.append(input_shape) + self.input_dtype.append(input.type.tensor_type.elem_type) + self.input_dtype = type2dtype(self.input_dtype) + if config.fp16 == True: + set_input_dtype = [] + for tops_dtype in self.input_dtype: + if tops_dtype == TopsInference.DT_FLOAT32: + set_input_dtype.append(TopsInference.DT_FLOAT16) + else: + set_input_dtype.append(tops_dtype) + self.input_dtype = set_input_dtype + + onnx_parser = TopsInference.create_parser(TopsInference.ONNX_MODEL) + onnx_parser.set_input_names(self.input_names) + onnx_parser.set_input_dtypes(self.input_dtype) + onnx_parser.set_input_shapes(input_shape) + + network = onnx_parser.read(onnx_path) + optimizer = TopsInference.create_optimizer() + if config.fp16 == True: + optimizer.set_build_flag(TopsInference.KFP16_MIX) + engine = optimizer.build(network) + engine.save_executable(onnx_path+".bin") + engine = TopsInference.load(onnx_path+".bin") + self.streams = [] + for i in range(12): + self.streams.append(TopsInference.create_stream()) + return engine + + def __call__(self, model_inputs: list): + inputs = [] + outputs = [] + foo_time_start = time.time() + for input in model_inputs: + inputs.append(input.numpy()) + input_batch = inputs[0].shape[0] + # zixiao acceleration card has 6 compute cells + assert input_batch % self.zixiao_VG_num == 0 + vg_batch = input_batch // self.zixiao_VG_num + foo_time = time.time() - foo_time_start + for i in range(self.zixiao_VG_num): + vg_input = [] + foo_time_start_data_slice = time.time() + for input in inputs: + vg_input.append(input[vg_batch * i: vg_batch * (i + 1)]) + foo_time += time.time() - foo_time_start_data_slice + outputs.append(self.engine.runV2(vg_input, + py_stream=self.streams[self.test_index % 12])) + 
self.test_index += 1 + zx_outputs = [] + for i in range(self.zixiao_VG_num): + zx_outputs.append([output for output in outputs[i].get()]) + # concat vg_batch result + foo_time_start2 = time.time() + host_output = [] + for i in range(len(zx_outputs[0])): + tmp_output = [] + for j in range(self.zixiao_VG_num): + tmp_output.append(zx_outputs[j][i]) + host_output.append(np.concatenate(tmp_output)) + infer_output = [torch.from_numpy(output) for output in host_output] + foo_time += time.time() - foo_time_start2 + return infer_output, foo_time diff --git a/inference/tools/torch_sync.py b/inference/tools/torch_sync.py index 4b2c631e5..85475d789 100644 --- a/inference/tools/torch_sync.py +++ b/inference/tools/torch_sync.py @@ -10,3 +10,7 @@ def torch_sync(config): # kunlunxin case # xpu sync already finsh after InferModel.__call__ pass + if config.vendor == "zixiao": + # zixiao case + # zixiao sync already finsh after InferModel.__call__ + pass \ No newline at end of file From 858c7228c3320d37c0a4d17fc8f9842aeb1d9164 Mon Sep 17 00:00:00 2001 From: feldmanshan <145551134+feldmanshan@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:00:31 +0800 Subject: [PATCH 11/18] zixiao: update zxrt.py & resnet50 result (#262) * zixiao: update zxrt.py & resnet50 result * zixiao: update resnet50 test batch_size --- inference/benchmarks/resnet50/README.md | 2 +- .../vendor_config/zixiao_configurations.yaml | 6 ++- inference/inference_engine/zixiao/zxrt.py | 37 ++++++++++--------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index 5bd96adb3..4e802f022 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -140,5 +140,5 @@ find ./val -name "*JPEG" | wc -l | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 | | ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 | | kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / | -| zixiao | fp16 | 192 | 380 | / | / | 1528.373 | 5853.771 | / | 76.2/76.2 | / | +| zixiao | fp16 | 32*6 | 261.103 | / | / | 193.151 | 6342.191 | / | 76.2/76.2 | / | diff --git a/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml index 14071e7ec..00586e225 100644 --- a/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml +++ b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml @@ -1,4 +1,6 @@ compiler: zxrt no_validation: true -batch_size: 192 -exist_onnx_path: onnxs/resnet50_pytorch.onnx \ No newline at end of file +batch_size: 50000 +exist_onnx_path: onnxs/resnet50_pytorch.onnx +repeat: 1 +zixiao_test_batch_size: 32 \ No newline at end of file diff --git a/inference/inference_engine/zixiao/zxrt.py b/inference/inference_engine/zixiao/zxrt.py index a2ceacf1c..f9f6b2b22 100755 --- a/inference/inference_engine/zixiao/zxrt.py +++ b/inference/inference_engine/zixiao/zxrt.py @@ -34,19 +34,18 @@ class InferModel: def __init__(self, config, onnx_path, model): self.input_names = [] self.engine = self.build_engine(config, onnx_path) - self.test_index = 0 - self.batch_size = config.batch_size + self.batch_size = config.zixiao_test_batch_size self.zixiao_VG_num = 6 def build_engine(self, config, onnx_path): - self.handler = TopsInference.set_device(0, -1) + self.handler = TopsInference.set_device(4, -1) onnx_model = 
onnx.load(onnx_path) self.input_shapes = [] self.input_dtype = [] for input in onnx_model.graph.input: input_shape = input.type.tensor_type.shape.dim input_shape = [a.dim_value for a in input_shape] - input_shape[0] = config.batch_size // 6 + input_shape[0] = config.zixiao_test_batch_size input_name = input.name self.input_names.append(input_name) self.input_shapes.append(input_shape) @@ -84,31 +83,33 @@ def __call__(self, model_inputs: list): foo_time_start = time.time() for input in model_inputs: inputs.append(input.numpy()) - input_batch = inputs[0].shape[0] + total_input_num = inputs[0].shape[0] + total_test_batch = (total_input_num + self.batch_size - 1) // self.batch_size # zixiao acceleration card has 6 compute cells - assert input_batch % self.zixiao_VG_num == 0 - vg_batch = input_batch // self.zixiao_VG_num foo_time = time.time() - foo_time_start - for i in range(self.zixiao_VG_num): - vg_input = [] + for i in range(total_test_batch): foo_time_start_data_slice = time.time() + vg_input = [] for input in inputs: - vg_input.append(input[vg_batch * i: vg_batch * (i + 1)]) + vg_input.append(input[self.batch_size * i: self.batch_size * (i + 1)]) foo_time += time.time() - foo_time_start_data_slice - outputs.append(self.engine.runV2(vg_input, - py_stream=self.streams[self.test_index % 12])) - self.test_index += 1 + outputs.append(self.engine.runV2(vg_input, py_stream=self.streams[i % 12])) + # zixiao sync + for i in range(12): + outputs[i-12].get() + # zixiao sync done + + # concat batch result + foo_time_start_d2h = time.time() zx_outputs = [] - for i in range(self.zixiao_VG_num): + for i in range(total_test_batch): zx_outputs.append([output for output in outputs[i].get()]) - # concat vg_batch result - foo_time_start2 = time.time() host_output = [] for i in range(len(zx_outputs[0])): tmp_output = [] - for j in range(self.zixiao_VG_num): + for j in range(total_test_batch): tmp_output.append(zx_outputs[j][i]) host_output.append(np.concatenate(tmp_output)) infer_output = [torch.from_numpy(output) for output in host_output] - foo_time += time.time() - foo_time_start2 + foo_time += time.time() - foo_time_start_d2h return infer_output, foo_time From a8599e7f5f164e66833070370e4910d984dece90 Mon Sep 17 00:00:00 2001 From: KungYork <30741085+KungYork@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:20:46 +0800 Subject: [PATCH 12/18] kunlunxin: add BERT readme (#260) * Add BERT readme * Update 1x8 result in README.md * Update header in README.md --- training/kunlunxin/bert-pytorch/README.md | 48 +++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 training/kunlunxin/bert-pytorch/README.md diff --git a/training/kunlunxin/bert-pytorch/README.md b/training/kunlunxin/bert-pytorch/README.md new file mode 100644 index 000000000..43d955af3 --- /dev/null +++ b/training/kunlunxin/bert-pytorch/README.md @@ -0,0 +1,48 @@ +### 模型Checkpoint下载 +[模型Checkpoint下载](../../benchmarks/bert/README.md#模型Checkpoint下载) +### 测试数据集下载 +[测试数据集下载](../../benchmarks/bert/README.md#测试数据集下载) + +### 昆仑芯XPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: 昆仑芯AI加速器组R480-X8 + - 加速卡型号: 昆仑芯AI加速卡R300 + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-26-generic + - 加速卡驱动版本:4.0.25 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本:xmlir + - 训练编译器版本:xacc + - 依赖软件版本:pytorch-1.12.1+cpu + +#### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ------------------------------ | ------------------------------------------- | +| 任务类别 | 通用语言模型 | | +| 模型 | bert | 
| +| 数据集 | Wikipedia for bert | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | acc,见“性能指标” | 分类准确率(mlm_accuracy) | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | mlm_accuracy | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| R300单机单卡(1x1) | fp32 | bs=8,lr=3.5e-04 | | | | | | | +| R300单机8卡(1x8) | fp32 | bs=8,lr=3.5e-04 | | | | | 0.36 | 26.5/32.0 | +| R300两机8卡(2x8) | fp32 | bs=8,lr=3.5e-04 | | | | | | | From 3d4c4376c3c71e612a97235ffc3a3d24547662a1 Mon Sep 17 00:00:00 2001 From: clveryang <50865584+clveryang@users.noreply.github.com> Date: Fri, 22 Sep 2023 17:55:38 +0800 Subject: [PATCH 13/18] Iluvatar Ixrt environment (#265) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Ixrt environment * add touch config --------- Co-authored-by: 杨智超 --- .../resnet50/vendor_config/iluvatar_configurations.yaml | 4 ---- inference/docker_images/iluvatar/pytorch/Dockerfile | 6 +++--- .../docker_images/iluvatar/pytorch/packages/README.md | 8 +++++--- .../docker_images/iluvatar/pytorch/pytorch_install.sh | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml b/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml index c721ede09..86ee38560 100644 --- a/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml +++ b/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml @@ -2,7 +2,3 @@ ixrt_tmp_path: iluvatar_tmp/resnet50-fp16.engine has_dynamic_axis: false repeat: 1 image_size: 224 -exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx -# exist_compiler_path: resnet50-fp16.engine -output_types: {"output":"float32"} -input_types: {"input": "float32"} \ No newline at end of file diff --git a/inference/docker_images/iluvatar/pytorch/Dockerfile b/inference/docker_images/iluvatar/pytorch/Dockerfile index 3e72721cf..2b502c689 100644 --- a/inference/docker_images/iluvatar/pytorch/Dockerfile +++ b/inference/docker_images/iluvatar/pytorch/Dockerfile @@ -38,9 +38,9 @@ RUN apt-get install -y --fix-missing \ # Configure anaconda -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \ - bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ - /root/miniconda/bin/conda clean -tipsy && \ +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh && \ + bash ./Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -b -p /root/miniconda && \ + /root/miniconda/bin/conda clean -tip && \ ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ echo ". 
/root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ echo "conda activate base" >> ~/.bashrc && \ diff --git a/inference/docker_images/iluvatar/pytorch/packages/README.md b/inference/docker_images/iluvatar/pytorch/packages/README.md index 88a18b3dc..d528a13fb 100644 --- a/inference/docker_images/iluvatar/pytorch/packages/README.md +++ b/inference/docker_images/iluvatar/pytorch/packages/README.md @@ -2,8 +2,10 @@ >联系邮箱: contact-us@iluvatar.com -ixrt-0.4.0+corex.3.2.0-cp38-cp38-linux_x86_64.whl +ixrt-0.7.0+corex.latest.version-cp310-cp310-linux_x86_64.whl -torch-1.13.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl +torchvision-0.14.1+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl -torchvision-0.14.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl \ No newline at end of file +pycuda-2022.2.2+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl + +torch-1.13.1+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl \ No newline at end of file diff --git a/inference/docker_images/iluvatar/pytorch/pytorch_install.sh b/inference/docker_images/iluvatar/pytorch/pytorch_install.sh index 859591930..63cd26993 100644 --- a/inference/docker_images/iluvatar/pytorch/pytorch_install.sh +++ b/inference/docker_images/iluvatar/pytorch/pytorch_install.sh @@ -14,7 +14,7 @@ done search_sdk_results=`find ${SDK_DIR} -name "corex*.run"` for installer in $search_sdk_results; do echo "Install ${installer}" - sh "${installer}" -- --silent --driver --toolkit + sh "${installer}" -- --silent --toolkit done search_packages_results=`find ${PKG_DIR} -name "*.whl"` From bf17dfbeb1e98e8851fbecb9d56ac56201e0e2ca Mon Sep 17 00:00:00 2001 From: gganduu_zz Date: Mon, 25 Sep 2023 16:13:01 +0800 Subject: [PATCH 14/18] Add ViT model for FlagPerf (#200) * Add ViT model * update the script based on zhiyuan's model * Update script based on PR review * Update ViT performance in README.md --- inference/benchmarks/vit_l_16/README.md | 1 + inference/configs/host.yaml | 2 +- .../kunlunxin_configurations.yaml | 5 + .../kunlunxin/kunlunxin_analysis.py | 44 +- .../kunlunxin/kunlunxin_monitor.py | 513 +++++++++--------- .../kunlunxin/pytorch_1.13/Dockerfile | 1 + inference/inference_engine/kunlunxin/xtcl.py | 1 + inference/run.py | 4 +- 8 files changed, 289 insertions(+), 282 deletions(-) create mode 100644 inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md index 5998c0cf9..cd77ab738 100644 --- a/inference/benchmarks/vit_l_16/README.md +++ b/inference/benchmarks/vit_l_16/README.md @@ -83,4 +83,5 @@ find ./val -name "*JPEG" | wc -l | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | tensorrt | fp16 | 64 |1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 | | tensorrt | fp32 | 32 | 1275.9 | 482.4 | 491.1 | 555.5 | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 | +| kunlunxin_xtcl | W32A16 | 32 | 2118.307 | / | / | 130.006 | 144.914 | 27.9% | 79.3/79.3 | / | diff --git a/inference/configs/host.yaml b/inference/configs/host.yaml index f5ec9d0ac..25c7f796b 100644 --- a/inference/configs/host.yaml +++ b/inference/configs/host.yaml @@ -13,4 +13,4 @@ PIP_SOURCE: "https://mirror.baidu.com/pypi/simple" CLEAR_CACHES: True ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES" CASES: - "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val" \ No newline at end of file + "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val" diff --git 
a/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml b/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml new file mode 100644 index 000000000..bf71dd82c --- /dev/null +++ b/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml @@ -0,0 +1,5 @@ +compiler: xtcl +# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null +no_validation: true +# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt) +exist_onnx_path: /home/FlagPerf/inference/onnxs/vit_l_16_bs32_pytorch_fp16False.onnx diff --git a/inference/docker_images/kunlunxin/kunlunxin_analysis.py b/inference/docker_images/kunlunxin/kunlunxin_analysis.py index 388f89cee..be1a60b1b 100644 --- a/inference/docker_images/kunlunxin/kunlunxin_analysis.py +++ b/inference/docker_images/kunlunxin/kunlunxin_analysis.py @@ -1,23 +1,21 @@ -def analysis_log(logpath): - logfile = open(logpath) - - max_usage = 0.0 ## usage_mem - max_mem = 0.0 - for line in logfile.readlines(): - ''' - xpu_smi temp power mem w_mem use_rate - ''' - if "xpu_smi" in line: - line = line[:-1] - usage = line.split(" ")[4] - usage = float(usage) - max_usage = max(max_usage, usage) - max_mem = line.split(" ")[5] - max_mem = float(max_mem) - - return round(max_usage / 1024.0, - 2), round(max_mem / 1024.0, 2), eval("32e12"), eval("128e12") - - -if __name__ == "__main__": - max1, max2, max2,max4 = analysis_log("/home/zhoujiamin01/workspace/zjm_flag/FlagPerf/inference/result/run20230809192313/resnet50:pytorch_1.13/127.0.0.1_noderank0/kunlunxin_monitor.log") +def analysis_log(logpath): + logfile = open(logpath) + + max_usage = 0.0 ## usage_mem + max_mem = 0.0 + for line in logfile.readlines(): + ''' + xpu_smi temp power mem w_mem use_rate + ''' + if "xpu_smi" in line: + line = line[:-1] + usage = line.split(" ")[4] + usage = float(usage) + max_usage = max(max_usage, usage) + max_mem = line.split(" ")[5] + max_mem = float(max_mem) + + return round(max_usage / 1024.0, + 2), round(max_mem / 1024.0, 2), eval("32e12"), eval("128e12") + + diff --git a/inference/docker_images/kunlunxin/kunlunxin_monitor.py b/inference/docker_images/kunlunxin/kunlunxin_monitor.py index ba5a877a1..7d31179ae 100644 --- a/inference/docker_images/kunlunxin/kunlunxin_monitor.py +++ b/inference/docker_images/kunlunxin/kunlunxin_monitor.py @@ -1,256 +1,257 @@ -# !/usr/bin/env python3 -# encoding: utf-8 -''' -Usage: python3 sys-monitor.py -o operation -l [log_path] - -o, --operation start|stop|restart|status - -l, --log log path , ./logs/ default -''' - -import os -import sys -import time -import signal -import atexit -import argparse -import datetime -from multiprocessing import Process -import subprocess -import schedule - - -class Daemon: - ''' - daemon subprocess class. - usage: subclass this daemon and override the run() method. - sys-monitor.pid: in the /tmp/, auto del when unexpected exit. - verbose: debug mode, disabled default. 
- ''' - - def __init__(self, - pid_file, - log_file, - err_file, - gpu_log, - log_path, - rate=5, - stdin=os.devnull, - stdout=os.devnull, - stderr=os.devnull, - home_dir='.', - umask=0o22, - verbose=0): - self.stdin = stdin - self.stdout = stdout - self.stderr = stderr - self.home_dir = home_dir - self.verbose = verbose - self.pidfile = pid_file - self.logfile = log_file - self.errfile = err_file - self.gpufile = gpu_log - self.logpath = log_path - self.rate = rate - self.umask = umask - self.verbose = verbose - self.daemon_alive = True - - def get_pid(self): - try: - with open(self.pidfile, 'r') as pf: - pid = int(pf.read().strip()) - except IOError: - pid = None - except SystemExit: - pid = None - return pid - - def del_pid(self): - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - - def run(self): - ''' - NOTE: override the method in subclass - ''' - - def gpu_mon(file): - TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') - cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'" ## temp power mem w_mem use_rate - process = subprocess.Popen(cmd, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - encoding='utf-8') - try: - out = process.communicate(timeout=10) - except subprocess.TimeoutExpired: - process.kill() - out = process.communicate() - - if process.returncode != 0: - result = "error" - result = TIMESTAMP + "\n xpu_smi " + out[0] + "\n" - with open(file, 'a') as f: - f.write(result) - - def timer_gpu_mon(): - gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) - gpu_process.start() - - schedule.every(self.rate).seconds.do(timer_gpu_mon) - while True: - schedule.run_pending() - time.sleep(5) - - def daemonize(self): - if self.verbose >= 1: - print('daemon process starting ...') - try: - pid = os.fork() - if pid > 0: - sys.exit(0) - except OSError as e: - sys.stderr.write('fork #1 failed: %d (%s)\n' % - (e.errno, e.strerror)) - sys.exit(1) - os.chdir(self.home_dir) - os.setsid() - os.umask(self.umask) - try: - pid = os.fork() - if pid > 0: - sys.exit(0) - except OSError as e: - sys.stderr.write('fork #2 failed: %d (%s)\n' % - (e.errno, e.strerror)) - sys.exit(1) - sys.stdout.flush() - sys.stderr.flush() - si = open(self.stdin, 'r') - so = open(self.stdout, 'a+') - if self.stderr: - se = open(self.stderr, 'a+') - else: - se = so - os.dup2(si.fileno(), sys.stdin.fileno()) - os.dup2(so.fileno(), sys.stdout.fileno()) - os.dup2(se.fileno(), sys.stderr.fileno()) - atexit.register(self.del_pid) - pid = str(os.getpid()) - with open(self.pidfile, 'w+') as f: - f.write('%s\n' % pid) - - def start(self): - if not os.path.exists(self.logpath): - os.makedirs(self.logpath) - elif os.path.exists(self.gpufile): - os.remove(self.gpufile) - if self.verbose >= 1: - print('ready to start ......') - # check for a pid file to see if the daemon already runs - pid = self.get_pid() - if pid: - msg = 'pid file %s already exists, is it already running?\n' - sys.stderr.write(msg % self.pidfile) - sys.exit(1) - # start the daemon - self.daemonize() - self.run() - - def stop(self): - if self.verbose >= 1: - print('stopping ...') - pid = self.get_pid() - if not pid: - msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile - sys.stderr.write(msg) - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - return - # try to kill the daemon process - try: - i = 0 - while 1: - os.kill(pid, signal.SIGTERM) - time.sleep(1) - i = i + 1 - if i % 10 == 0: - os.kill(pid, signal.SIGHUP) - except OSError as err: - err = str(err) - if err.find('No such process') > 0: - if os.path.exists(self.pidfile): - os.remove(self.pidfile) - else: - print(str(err)) - sys.exit(1) - if self.verbose >= 1: - print('Stopped!') - - def restart(self): - self.stop() - self.start() - - def status(self): - pid = self.get_pid() - if pid: - if os.path.exists('/proc/%d' % pid): - return pid - return False - - -def parse_args(): - ''' Check script input parameter. ''' - parse = argparse.ArgumentParser(description='Sys monitor script') - parse.add_argument('-o', - type=str, - metavar='[operation]', - required=True, - help='start|stop|restart|status') - parse.add_argument('-l', - type=str, - metavar='[log_path]', - required=False, - default='./logs/', - help='log path') - args = parse.parse_args() - return args - - -def main(): - sample_rate1 = 5 - args = parse_args() - operation = args.o - log_path = args.l - pid_fn = str('/tmp/xpu_monitor.pid') - log_fn = str(log_path + '/kunlunxin_monitor.log') - err_fn = str(log_path + '/kunlunxin_monitor.err') - # result for gpu - gpu_fn = str(log_path + '/kunlunxin_monitor.log') - - subdaemon = Daemon(pid_fn, - log_fn, - err_fn, - gpu_fn, - log_path, - verbose=1, - rate=sample_rate1) - if operation == 'start': - subdaemon.start() - elif operation == 'stop': - subdaemon.stop() - elif operation == 'restart': - subdaemon.restart() - elif operation == 'status': - pid = subdaemon.status() - if pid: - print('process [%s] is running ......' % pid) - else: - print('daemon process [%s] stopped' % pid) - else: - print("invalid argument!") - sys.exit(1) - - -if __name__ == '__main__': - main() +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. 
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 gpu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.gpufile = gpu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.verbose = verbose
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override the method in subclass
+        '''
+
+        def gpu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'"    ## temp power mem w_mem use_rate
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            if process.returncode != 0:
+                result = "error"
+            result = TIMESTAMP + "\n xpu_smi " + out[0] + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_gpu_mon():
+            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
+            gpu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_gpu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
+
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.gpufile):
+            os.remove(self.gpufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check for a pid file to see if the daemon already runs
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            err = str(err)
+            if err.find('No such process') > 0:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+        if self.verbose >= 1:
+            print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Check script input parameter. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/xpu_monitor.pid')
+    log_fn = str(log_path + '/kunlunxin_monitor.log')
+    err_fn = str(log_path + '/kunlunxin_monitor.err')
+    # result for gpu
+    gpu_fn = str(log_path + '/kunlunxin_monitor.log')
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       gpu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
index 7227b9743..fa778e7e8 100644
--- a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
+++ b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
@@ -72,6 +72,7 @@ ENV TVM_DIR=/root/XTCL-ubuntu_x86_64
+
 ENV PATH /root/xre-ubuntu_2004_x86_64/bin:$PATH
 ENV PATH /root/miniconda/envs/python38/bin:$PATH
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
index 2643f51d5..3afbe9634 100755
--- a/inference/inference_engine/kunlunxin/xtcl.py
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -82,3 +82,4 @@ def __call__(self, model_inputs: list):
         return output_list, foo_time
+
diff --git a/inference/run.py b/inference/run.py
index a11fa4824..36cf49222 100644
--- a/inference/run.py
+++ b/inference/run.py
@@ -272,7 +272,7 @@ def start_monitors_in_cluster(dp_path, case_log_dir, nnodes):
     ven_mon_path = os.path.join(dp_path, "docker_images", config.VENDOR,
                                 config.VENDOR + "_monitor.py")
-    start_mon_cmd = "cd " + dp_path + " && " + sys.executable \
+    start_mon_cmd = "cd " + dp_path + " && sudo " + sys.executable \
                     + " " + ven_mon_path + " -o restart -l "
     logger.debug("Run cmd in the cluster to start vendor's monitors: " +
                  start_mon_cmd)
@@ -299,7 +299,7 @@ def stop_monitors_in_cluster(dp_path, nnodes):
     ven_mon_path = os.path.join(dp_path, "docker_images", config.VENDOR,
                                 config.VENDOR + "_monitor.py")
-    stop_mon_cmd = "cd " + dp_path + " && " + sys.executable \
+    stop_mon_cmd = "cd " + dp_path + " && sudo " + sys.executable \
                    + " " + ven_mon_path + " -o stop"
     logger.debug("Run cmd in the cluster to stop vendor's monitors: " +
                  stop_mon_cmd)

From dfc9a80041e43a3e4d2603563cdfb086d6220bef Mon Sep 17 00:00:00 2001
From: TWANG07 <91315832+TWANG07@users.noreply.github.com>
Date: Mon, 25 Sep 2023 16:34:45 +0800
Subject: [PATCH 15/18] support swin_transformer on XPU (#255)

* support swin_transformer on XPU

* support swin_transformer on XPU

---------

Co-authored-by: wangdongyu04
---
 .../swin_transformer-pytorch/README.md        | 25 +++++++++++++++++++
 .../config/config_R300x1x1.py                 |  3 +++
 .../config/config_R300x1x8.py                 |  3 +++
 .../config/config_R300x2x8.py                 |  3 +++
 .../config/config_common.py                   |  3 +++
 .../config/environment_variables.sh           | 25 +++++++++++++++++++
 .../config/requirements.txt                   |  8 ++++++
 .../swin_transformer-pytorch/extern/.gitkeep  |  0
 8 files changed, 70 insertions(+)
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/README.md
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/config_common.py
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/config/requirements.txt
 create mode 100644 training/kunlunxin/swin_transformer-pytorch/extern/.gitkeep

diff --git a/training/kunlunxin/swin_transformer-pytorch/README.md b/training/kunlunxin/swin_transformer-pytorch/README.md
new file mode 100644
index 000000000..87adae7fd
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/README.md
@@ -0,0 +1,25 @@
+### 测试数据集下载
+参见[测试数据集下载](../../benchmarks/swin_transformer/README.md#数据集)
+
+### 昆仑芯 XPU 配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+    - 机器型号: 昆仑芯AI加速器组R480-X8
+    - 加速卡型号: 昆仑芯AI加速卡R300
+    - 多机网络类型、带宽: InfiniBand,200Gb/s
+
+- ##### 软件环境
+   - OS版本:Ubuntu 20.04
+   - OS kernel版本: 5.4.0-26-generic
+   - 加速卡驱动版本:4.0.25
+   - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01
+   - 训练框架版本:xmlir+9bb59e9e [xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/9bb59e9e/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)
+   - 训练编译器版本:xacc+9bb59e9e [xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/9bb59e9e/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)
+   - 依赖软件版本:pytorch-1.12.1+cpu
+
+### 运行情况
+| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s) |
+| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- |
+| R300单机单卡(1x1) | config_R300x1x1 |  |  |  |  |  |
+| R300单机8卡(1x8)  | config_R300x1x8 | 788401.61 | 81.00 | 80.598 | 1501200 | 555.28 |
+| R300两机8卡(2x8)  | config_R300x2x8 |  |  |  |  |  |
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py
new file mode 100644
index 000000000..52be8aa2f
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x1.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 32
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py
new file mode 100644
index 000000000..52be8aa2f
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x1x8.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 32
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py
new file mode 100644
index 000000000..52be8aa2f
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_R300x2x8.py
@@ -0,0 +1,3 @@
+from config_common import *
+
+train_batch_size = 32
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/config_common.py b/training/kunlunxin/swin_transformer-pytorch/config/config_common.py
new file mode 100644
index 000000000..60ac8485a
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/config_common.py
@@ -0,0 +1,3 @@
+vendor = "kunlunxin"
+dist_backend = "xccl"
+amp_enable = False
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh b/training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh
new file mode 100644
index 000000000..8c1fdbe0c
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/environment_variables.sh
@@ -0,0 +1,25 @@
+# =================================================
+# Export variables
+# =================================================
+
+export XMLIR_F_XPU_ENABLED_BOOL=true
+export XMLIR_TORCH_XCCL_ENABLED=true
+
+
+# =================================================
+# R480 config
+# =================================================
+
+export OMP_NUM_THREADS=1
+export XACC_ARGS="-L amp"
+export XACC=1
+export BKCL_PCIE_RING=1
+
+KLX_WEB_SERVER_URL=http://127.0.0.1:8000
+
+pip uninstall -y xacc || true
+pip install ${KLX_WEB_SERVER_URL}/flagperf/archives/9bb59e9e/xacc-0.1.0-cp38-cp38-linux_x86_64.whl
+pip uninstall -y xmlir || true
+pip install ${KLX_WEB_SERVER_URL}/flagperf/archives/9bb59e9e/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl
+
+python -m xacc.install
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/config/requirements.txt b/training/kunlunxin/swin_transformer-pytorch/config/requirements.txt
new file mode 100644
index 000000000..3151c03d6
--- /dev/null
+++ b/training/kunlunxin/swin_transformer-pytorch/config/requirements.txt
@@ -0,0 +1,8 @@
+psutil==5.9.5
+numpy>=1.15.4
+timm==0.4.12
+accelerate==0.20.3
+
+--find-links https://download.pytorch.org/whl/torch_stable.html
+torch==1.12.1+cpu
+torchvision==0.13.1+cpu
\ No newline at end of file
diff --git a/training/kunlunxin/swin_transformer-pytorch/extern/.gitkeep b/training/kunlunxin/swin_transformer-pytorch/extern/.gitkeep
new file mode 100644
index 000000000..e69de29bb

From eb867959ce56b65ec6034d15567d4b1fa5920313 Mon Sep 17 00:00:00 2001
From: zjm <815496138@qq.com>
Date: Mon, 25 Sep 2023 16:36:37 +0800
Subject: [PATCH 16/18] Kunlunxin add stable diffusion v 1_4 case (#227)

* kunlunxin inference

* xtcl support fp16 onnx

* Add stable diffusion fp32 case

* kunlunxin add yolov5 case

* update resnet50 fp16 performance

* add stable_diffusion_v1_4 kunlunxin mem_usage

---------

Co-authored-by: zhaoyixuan02
Co-authored-by: zhoujiamin01
---
 inference/benchmarks/resnet50/README.md            |  5 +++--
 .../stable_diffusion_v1_4/README.md                |  1 +
 inference/benchmarks/yolov5/README.md              | 20 +++++++++++++++++++
 .../yolov5/pytorch/kunlunxin_requirements.txt      |  2 ++
 .../kunlunxin_configurations.yaml                  |  1 +
 .../kunlunxin_configurations.yaml                  |  3 +++
 .../kunlunxin/pytorch_1.13/Dockerfile              |  5 +++--
 inference/inference_engine/kunlunxin/xtcl.py       | 15 +++++++++-----
 8 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt
 create mode 100644 inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml

diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md
index 4e802f022..a1650ef8e 100644
--- a/inference/benchmarks/resnet50/README.md
+++ b/inference/benchmarks/resnet50/README.md
@@ -76,7 +76,7 @@ find ./val -name "*JPEG" | wc -l
 
 - 推理工具包
 
-   - XTCL 2.1
+   - XTCL daily 2023.09.23
 
 #### 2.3 天数智芯 MR-V100
@@ -139,6 +139,7 @@ find ./val -name "*JPEG" | wc -l
 | tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 |
 | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |
 | ixrt | fp16 (W16A32) | 256 | 261.467 | / | / | 1389.332 | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 |
-| kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / |
+| kunlunxin_xtcl | fp32 | 128 | / | / | / | / | / | 12.1% | 76.2/76.2 | 4.52/32.0 |
+| kunlunxin_xtcl | fp16 | 256 | 164.675 | / | / | 1566.407 | 3317.012 | 12.1% | 76.2/76.2 | 4.52/32.0 |
 | zixiao | fp16 | 32*6 | 261.103 | / | / | 193.151 | 6342.191 | / | 76.2/76.2 | / |
 
diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md
index 07aade914..0aa3ebb31 100644
--- a/inference/benchmarks/stable_diffusion_v1_4/README.md
+++ b/inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -56,5 +56,6 @@
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16 | 2 |1674.9 | 11.4 | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 |
 | tensorrt | fp32 | 2 | 1807.4 | 8.2 | 20.6 | 7.2 | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 |
+| kunlunxin_xtcl | fp32 | 2 | 213.822 | / | / | 4.755 | 9.471 | 20.1% | 26.524/25.3 | 0.07/32.0 |
 | null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 |
 | null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 |
diff --git a/inference/benchmarks/yolov5/README.md b/inference/benchmarks/yolov5/README.md
index 91354d40b..7e0ffa4df 100644
--- a/inference/benchmarks/yolov5/README.md
+++ b/inference/benchmarks/yolov5/README.md
@@ -53,6 +53,25 @@ find ./val -name "*JPEG" | wc -l
   - TensorRT 8.5.1.7
   - torch_tensorrt 1.3.0
 
+#### 2.2 昆仑芯R200
+
+- ##### 硬件环境
+    - 机器、加速卡型号: R200
+
+- ##### 软件环境
+   - OS版本:Ubuntu 20.04
+   - OS kernel版本: 5.15.0-56-generic
+   - 加速卡驱动版本:4.0
+   - Docker 版本:20.10.21
+   - 依赖软件版本:
+     - pytorch: 1.13.0+cpu
+     - onnx: 1.14.0
+     - pycocotools: 2.0.7
+
+- 推理工具包
+
+   - XTCL 2.1
+
 ### 3. 运行情况
 
 * 指标列表
@@ -75,3 +94,4 @@ find ./val -name "*JPEG" | wc -l
 | ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ------------ |----------- | ---------- |
 | tensorrt | fp32 | 96 | 733.8 | / | / | 53.8 | 361.4 |12.6%| 0.45 | 35.44/40.0 |
 | tensorrt | fp16 | 96 | 1665.8 | / | / | 58.6 | 859 |15.0%| 0.45 | 26.15/40.0 |
+| kunlunxin_xtcl | fp32 | 96 | / | / | / | / | / |18.9%| 0.451 | 26.42/32.0 |
diff --git a/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt b/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt
new file mode 100644
index 000000000..973355e88
--- /dev/null
+++ b/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt
@@ -0,0 +1,2 @@
+pycocotools
+opencv-python-headless
diff --git a/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml b/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
index 4b2b5ffcb..0cad8cab4 100644
--- a/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
+++ b/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
@@ -2,3 +2,4 @@ fp16: false
 compiler: xtcl
 no_validation: true
 exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
+resnet50_fuse: true
diff --git a/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml b/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..ed982f5f1
--- /dev/null
+++ b/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,3 @@
+fp16: false
+compiler: xtcl
+no_validation: true
diff --git a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
index fa778e7e8..a61287b38 100644
--- a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
+++ b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
@@ -46,11 +46,12 @@ RUN /root/miniconda/envs/python38/bin/pip install \
     munch \
     pyyaml \
     tqdm \
-    scipy
+    scipy \
+    opencv-python-headless
 
 RUN /root/miniconda/envs/python38/bin/pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu
 
-RUN cd /root && wget https://baidu-kunlun-public.su.bcebos.com/XTCL/XTCL-2.1/XTCL-ubuntu_x86_64.tar.gz && tar -xzf XTCL-ubuntu_x86_64.tar.gz
+RUN cd /root && wget https://baidu-kunlun-public.su.bcebos.com/XTCL/kunlunxin_xtcl_output_ubuntu1604_daily_0923.tar.gz && tar -xzf kunlunxin_xtcl_output_ubuntu1604_daily_0923.tar.gz && mv output/XTCL XTCL-ubuntu_x86_64
 
 RUN cd /root && wget https://klx-sdk-release-public.su.bcebos.com/xre/release/4.0.18.1/xre-ubuntu_2004_x86_64.tar.gz && tar -xzf xre-ubuntu_2004_x86_64.tar.gz
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
index 3afbe9634..5e38a7e41 100755
--- a/inference/inference_engine/kunlunxin/xtcl.py
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -26,7 +26,7 @@ def build_engine(self, config, onnx_path):
         for input in onnx_model.graph.input:
             input_shape = input.type.tensor_type.shape.dim
             input_shape = [a.dim_value for a in input_shape]
-            input_shape[0] = config.batch_size
+            #input_shape[0] = config.batch_size
             input_name = input.name  #'inputs:0'
             self.input_names.append(input_name)
             shape_dict[input_name] = input_shape
@@ -35,7 +35,11 @@ def build_engine(self, config, onnx_path):
 
         target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
         ctx = tvm.device("xpu", 0)
-        build_config = {}
+        build_config = {
+        }
+        #os.environ["XTCL_BUILD_DEBUG"] = '1'
+        if config.resnet50_fuse:
+            os.environ["XTCL_FUSE_RES50V15"] = '1'
         if config.fp16 == True:
             os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
             input_fp16 = { name:"float16" for name in self.input_names}
@@ -47,6 +51,7 @@ def build_engine(self, config, onnx_path):
                 config_var_dtype_map=input_fp16,
             ).value()
         else: ## fp32
+            os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
             os.environ['XTCL_USE_FP16'] = '1'
             os.environ['XTCL_QUANTIZE_WEIGHT'] = '1'
 
@@ -70,12 +75,12 @@ def build_engine(self, config, onnx_path):
     def __call__(self, model_inputs: list):
         for index, input_name in enumerate(self.input_names):
             if USE_VM_COMPILE:
-                self.engine.set_one_input("main",input_name, tvm.nd.array(model_inputs[index]))
+                self.engine.set_one_input("main",input_name, model_inputs[index].numpy())
             else:
-                self.engine.set_input(input_name, tvm.nd.array(model_inputs[index]))
+                self.engine.set_input(input_name, model_inputs[index].numpy())
         self.engine.run()
-        output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         foo_time_start = time.time()
+        output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         # d2h
         output_list = [torch.from_numpy(output.asnumpy()) for output in output_list]
         foo_time = time.time() - foo_time_start

From 6b0ae6ce69e6df7d9a4eb93a08024fef416df856 Mon Sep 17 00:00:00 2001
From: liuyumoye <452803476@qq.com>
Date: Mon, 25 Sep 2023 16:37:58 +0800
Subject: [PATCH 17/18] kunlunxin swinTransformer inference configs && results (#243)

* kunlunxin swinTransformer inference configs && results

* kunlunxin swinTransformer inference configs && results
{'vendor': 'kunlunxin', 'compiler': 'xtcl', 'precision': 'fp32', 'batchsize': 256, 'flops': 723982880000.0, 'e2e_time(second)': 543.745, 'p_validation_whole(qps)': None, 'p_validation_core(qps)': None, 'p_inference_whole(qps)': 166.937, '*p_inference_core(qps)': 175.724, 'val_average_acc': None, 'infer_average_acc': 0.832}

---------

Co-authored-by: SHIHONGHAO <13820618441@163.com>
---
 inference/benchmarks/swinTransformer/README.md     |  2 +-
 .../configs/swinTransformer/configurations.yaml    |  2 +-
 .../vendor_config/kunlunxin_configurations.yaml    | 16 ++++++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml

diff --git a/inference/benchmarks/swinTransformer/README.md b/inference/benchmarks/swinTransformer/README.md
index 04a97a3a6..14304fed9 100644
--- a/inference/benchmarks/swinTransformer/README.md
+++ b/inference/benchmarks/swinTransformer/README.md
@@ -84,4 +84,4 @@ find ./val -name "*JPEG" | wc -l
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16 | 512 |1011.7 | 1347.5 | 1511.3 | 1231.7 | 1359.1 | 6.8% | 81.7/83.2 | 19.9/40.0 |
 | tensorrt | fp32 | 256 | 856.9 | 761.5 | 794.3 | 789.2 | 826.4 | 8.2% | 83.2/83.2 | 20.0/40.0 |
-
+| kunlunxin_xtcl| W32A16 | 256 | 543.745 | / | / | / | / | / | 0.832 | / |
diff --git a/inference/configs/swinTransformer/configurations.yaml b/inference/configs/swinTransformer/configurations.yaml
index 1b7dd0607..66a36a9ff 100644
--- a/inference/configs/swinTransformer/configurations.yaml
+++ b/inference/configs/swinTransformer/configurations.yaml
@@ -13,4 +13,4 @@ no_validation: false
 # set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt)
 exist_onnx_path: null
 # set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine
-exist_compiler_path: null
\ No newline at end of file
+exist_compiler_path: null
diff --git a/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml b/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..209ee7821
--- /dev/null
+++ b/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,16 @@
+batch_size: 256
+# 1 item(like 1 sequence, 1 image) flops
+# Attention! For transformer decoder like bert, 1 token cause 2*param flops, so we need 2*length*params like 2*512*0.33B here
+# format: a_1*a*2*...*a_nea_0,like 2*512*0.33e9(bert) or 4.12e9(resnet50)
+flops: 1.55e10
+fp16: false
+compiler: xtcl
+num_workers: 8
+log_freq: 30
+repeat: 5
+# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null
+no_validation: true
+# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt)
+exist_onnx_path: /home/liuyu/flagperf/FlagPerf/inference/onnxs/kunlunxin_flagperf_swinTransformer/swinTransformer_bs256_pytorch_fp16False.onnx
+# set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null

From 98d85df9eb477fa444c4a3f405096bec66c9289e Mon Sep 17 00:00:00 2001
From: Quanfeng Li
Date: Mon, 25 Sep 2023 16:51:39 +0800
Subject: [PATCH 18/18] kunlunxin sam_h (#244)

---
 inference/benchmarks/sam_h/README.md          | 18 +++++
 inference/benchmarks/sam_h/pytorch/forward.py |  3 +-
 .../kunlunxin_configurations.yaml             |  1 +
 .../kunlunxin_configurations.yaml             | 10 +++
 inference/inference_engine/kunlunxin/xtcl.py  | 70 ++++++++-----------
 5 files changed, 60 insertions(+), 42 deletions(-)
 create mode 100644 inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml

diff --git a/inference/benchmarks/sam_h/README.md b/inference/benchmarks/sam_h/README.md
index 02d6cf352..3ab043f48 100644
--- a/inference/benchmarks/sam_h/README.md
+++ b/inference/benchmarks/sam_h/README.md
@@ -36,6 +36,24 @@
 
   - TensorRT 8.6.1
 
+#### 2.2 昆仑芯R200
+
+- ##### 硬件环境
+    - 机器、加速卡型号: R200
+
+- ##### 软件环境
+   - OS版本:Ubuntu 20.04
+   - OS kernel版本: 5.15.0-56-generic
+   - 加速卡驱动版本:4.0
+   - Docker 版本:20.10.21
+   - 依赖软件版本:
+     - pytorch: 1.13.0+cpu
+     - onnx: 1.14.0
+
+- 推理工具包
+
+   - XTCL 2.0.0.67
+
 ### 3. 运行情况
 
 * 指标列表
diff --git a/inference/benchmarks/sam_h/pytorch/forward.py b/inference/benchmarks/sam_h/pytorch/forward.py
index df61177fa..9ff355c68 100644
--- a/inference/benchmarks/sam_h/pytorch/forward.py
+++ b/inference/benchmarks/sam_h/pytorch/forward.py
@@ -84,7 +84,6 @@ def engine_forward(model, dataloader, evaluator, config):
     for step, (x, y, osize, dsize) in enumerate(dataloader):
         if config.fp16:
             x = x.to(torch.float16)
-            y = y.to(torch.float16)
 
         torch_sync(config)
         core_time_start = time.time()
@@ -101,7 +100,7 @@ def engine_forward(model, dataloader, evaluator, config):
         torch_sync(config)
         core_time += time.time() - core_time_start
 
-        pred = pred[0]
+        pred = pred[1]
         pred = pred.reshape(config.batch_size, 1, 3, 256, 256).float()
         pred = pred.cpu()
 
diff --git a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
index c29b9c46b..7cb3e921a 100644
--- a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
+++ b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
@@ -1,3 +1,4 @@
 compiler: xtcl
 no_validation: true
+vm_enable: false
 exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx
diff --git a/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml b/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..81b04fceb
--- /dev/null
+++ b/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,10 @@
+compiler: xtcl
+no_validation: true
+build_config:
+  FuseWithoutPattern:
+    - FuseConv2dTransposeBiasAdd
+  pattern_match:
+    - fuse_attention_sam
+disabled_pass:
+  - xgraph_layout_opt
+exist_onnx_path: onnxs/sam_h_bs4_pytorch_fp16True.onnx
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
index 5e38a7e41..eb31dfe06 100755
--- a/inference/inference_engine/kunlunxin/xtcl.py
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -1,33 +1,28 @@
+import os
+import time
+
 import onnx
+import torch
 import tvm
 import tvm.relay as relay
-from tvm.contrib.download import download_testdata
-from tvm.relay import param_dict
 from tvm.contrib import graph_executor, xpu_config
+from tvm.relay.xpu.patterns import custom_fuse_patterns
 from tvm.runtime.vm import VirtualMachine
-import torch
-import os
-import subprocess
-from loguru import logger
-import numpy as np
-import time
-
-USE_VM_COMPILE = False
 
 
 class InferModel:
-    def __init__(self, config , onnx_path, model):
+    def __init__(self, config, onnx_path, model):
         self.input_names = []
         self.engine = self.build_engine(config, onnx_path)
+        self.vm_enable = True
 
     def build_engine(self, config, onnx_path):
         onnx_model = onnx.load(onnx_path)
         shape_dict = {}
-        for input in onnx_model.graph.input:
-            input_shape = input.type.tensor_type.shape.dim
-            input_shape = [a.dim_value for a in input_shape]
-            #input_shape[0] = config.batch_size
-            input_name = input.name  #'inputs:0'
+        for inp in onnx_model.graph.input:
+            input_name, input_shape, _, _ = relay.frontend.onnx.get_info(inp)
+            input_shape[0] = config.batch_size
             self.input_names.append(input_name)
             shape_dict[input_name] = input_shape
@@ -35,56 +30,51 @@ def build_engine(self, config, onnx_path):
 
         target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
         ctx = tvm.device("xpu", 0)
-        build_config = {
-        }
+        build_config = config.build_config if 'build_config' in config._fields else {}
+        disabled_pass = config.disabled_pass if 'disabled_pass' in config._fields else []
+        self.vm_enable = config.vm_enable if 'vm_enable' in config._fields else True
+        if "pattern_match" in build_config:
+            build_config["XPUFuzzyMatch"] = xpu_config.XPUGraphMatchConfig(
+                pattern_match=build_config["pattern_match"]).value()
+            del build_config["pattern_match"]
         #os.environ["XTCL_BUILD_DEBUG"] = '1'
         if config.resnet50_fuse:
            os.environ["XTCL_FUSE_RES50V15"] = '1'
         if config.fp16 == True:
             os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
-            input_fp16 = { name:"float16" for name in self.input_names}
             build_config["XPUOutDtypeConfig"] = xpu_config.XPUOutDtypeConfig(
-                default_precision="float16",
-                config_last_node=True,
-                config_map={
-                },
-                config_var_dtype_map=input_fp16,
-            ).value()
+                default_precision="float16",
+                config_last_node=True,
+                config_map={},
+            ).value()
         else: ## fp32
             os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
 
             os.environ['XTCL_USE_FP16'] = '1'
             os.environ['XTCL_QUANTIZE_WEIGHT'] = '1'
 
-        with tvm.transform.PassContext(opt_level=3, config=build_config):
-            if USE_VM_COMPILE:
-                vm_exec = relay.backend.vm.compile(mod,
-                                                   target=target_host,
-                                                   target_host=target_host,
-                                                   params=params)
-
+        with tvm.transform.PassContext(opt_level=3, config=build_config, disabled_pass=disabled_pass):
+            if self.vm_enable:
+                vm_exec = relay.backend.vm.compile(mod, target=target_host, target_host=target_host, params=params)
                 vm = VirtualMachine(vm_exec, ctx)
                 return vm
             else:
                 graph, lib, params = relay.build(mod,
-                    target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2",
-                    params=params)
+                                                 target="xpu -libs=xdnn -split-device-funcs -device-type=xpu2",
+                                                 params=params)
                 m = graph_executor.create(graph, lib, ctx)
                 m.set_input(**params)
                 return m
 
     def __call__(self, model_inputs: list):
         for index, input_name in enumerate(self.input_names):
-            if USE_VM_COMPILE:
-                self.engine.set_one_input("main",input_name, model_inputs[index].numpy())
+            if self.vm_enable:
+                self.engine.set_one_input("main", input_name, model_inputs[index].numpy())
             else:
-                self.engine.set_input(input_name, model_inputs[index].numpy())
+                self.engine.set_input(input_name, tvm.nd.array(model_inputs[index]))
         self.engine.run()
         foo_time_start = time.time()
         output_list = [self.engine.get_output(i) for i in range(self.engine.get_num_outputs())]
         # d2h
-        output_list = [torch.from_numpy(output.asnumpy()) for output in output_list]
+        output_list = [torch.from_numpy(output.numpy()) for output in output_list]
        foo_time = time.time() - foo_time_start
 
         return output_list, foo_time
-
-
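Note on the final xtcl.py rewrite: the adapter above reads all Kunlunxin-specific XTCL tuning from the per-model vendor YAML — `vm_enable` selects the VM or graph_executor path, `build_config`/`pattern_match` feed `XPUGraphMatchConfig`, and `disabled_pass` is forwarded to the TVM `PassContext`. The snippet below is a minimal illustrative sketch of the calling convention the class expects (it checks optional fields via `config._fields`); the `KunlunxinConfig` namedtuple and the concrete field values are stand-ins for FlagPerf's parsed configuration and are not part of the patch.

# sketch.py — illustrative only; requires the XTCL-enabled TVM stack to actually run InferModel
from collections import namedtuple

# Stand-in for FlagPerf's parsed config object; InferModel probes optional
# fields with `'name' in config._fields`, so a namedtuple satisfies the interface.
KunlunxinConfig = namedtuple(
    "KunlunxinConfig",
    ["batch_size", "fp16", "resnet50_fuse", "vm_enable", "build_config", "disabled_pass"])

cfg = KunlunxinConfig(
    batch_size=4,
    fp16=True,
    resnet50_fuse=False,                      # only set true for the resnet50 case
    vm_enable=True,                           # False would pick the graph_executor path
    build_config={"FuseWithoutPattern": ["FuseConv2dTransposeBiasAdd"],
                  "pattern_match": ["fuse_attention_sam"]},   # mirrors the sam_h YAML
    disabled_pass=["xgraph_layout_opt"])

# Hypothetical usage (commented out because it needs an XPU device and the onnx file):
# engine = InferModel(cfg, "onnxs/sam_h_bs4_pytorch_fp16True.onnx", model=None)
# outputs, d2h_time = engine([some_input_tensor])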