refine cpm (#179)
Co-authored-by: zhouyu <zhouyu@baai.ac.cn>
yuzhou03 and zhouyu authored Aug 4, 2023
1 parent 5c13121 commit d999763
Showing 11 changed files with 75 additions and 88 deletions.
6 changes: 3 additions & 3 deletions training/benchmarks/cpm/README.md
@@ -24,9 +24,9 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
> No preprocessing required
#### Model checkpoint
[Download page](https://model.baai.ac.cn/model-detail/100017)
Under the Files and Versions tab: pytorch_model.bin
Number of parameters: 2.6B
[Download page](https://model.baai.ac.cn/model-detail/100105)
Under the Files and Versions tab: cpm_model_states_medium.pt
Number of parameters: 0.33B

### Framework and chip support
| | PyTorch | Paddle | TensorFlow2 |
7 changes: 5 additions & 2 deletions training/benchmarks/cpm/pytorch/config/_base.py
@@ -1,6 +1,11 @@
# required parameters
vendor: str = None
# model name
name: str = "CPM"
# Training data dir
data_dir: str = None
cudnn_benchmark: bool = False
cudnn_deterministic: bool = True

# random seed
seed: int = 1234
@@ -91,8 +96,6 @@
no_save_rng: bool = False

## data args
# Training data dir
data_dir: str = "/mnt/data/cpm/train/"

# path used to save/load sentencepiece tokenization models
tokenizer_path: str = "bpe_3w_new/"
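With this change the training data directory no longer has a hard-coded default: `data_dir` is declared at the top of `_base.py` as a required parameter (`None`) and must be filled in by the launcher or a vendor config. A minimal sketch of that pattern, assuming a hypothetical `apply_overrides` helper in place of the real config machinery:

```python
# Minimal sketch (hypothetical helper): base defaults with a required
# data_dir, overridden at launch the way a vendor config or CLI flag would be.
from dataclasses import dataclass


@dataclass
class BaseConfig:
    vendor: str = None               # required parameter
    name: str = "CPM"                # model name
    data_dir: str = None             # training data dir, must be provided
    cudnn_benchmark: bool = False
    cudnn_deterministic: bool = True
    seed: int = 1234


def apply_overrides(cfg: BaseConfig, **overrides) -> BaseConfig:
    """Apply launcher/vendor overrides, then validate required fields."""
    for key, value in overrides.items():
        setattr(cfg, key, value)
    if cfg.data_dir is None:
        raise ValueError("data_dir must be set by the launcher or a vendor config")
    return cfg


cfg = apply_overrides(BaseConfig(), vendor="nvidia", data_dir="/mnt/data/cpm/train/")
```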
66 changes: 24 additions & 42 deletions training/benchmarks/cpm/pytorch/run_pretraining.py
@@ -1,18 +1,9 @@
"""CPM Pretraining"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os
import random
import sys
import time

import numpy as np
import torch

import config
from dataloaders.tokenization_gpt2 import GPT2Tokenizer
from dataloaders.dataloader import load_data
from train.evaluator import Evaluator
@@ -22,38 +13,27 @@

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
from driver import Driver, Event, dist_pytorch, check
from driver import Event, dist_pytorch
from driver.helper import InitHelper

logger = None


def main():
import config
from config import mutable_params
global logger
global config

if config.use_env and 'LOCAL_RANK' in os.environ:
config.local_rank = int(os.environ['LOCAL_RANK'])

cpm_driver = Driver(config, config.mutable_params)
cpm_driver.setup_config(argparse.ArgumentParser("CPM"))
cpm_driver.setup_modules(globals(), locals())

init_helper = InitHelper(config)
cpm_driver = init_helper.init_driver(globals(), locals())
logger = cpm_driver.logger
dist_pytorch.init_dist_training_env(config)

check.check_config(config)

dist_pytorch.barrier(config.vendor)
cpm_driver.event(Event.INIT_START)
init_start_time = logger.previous_log_time

random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(config.seed)

init_helper.set_seed(config.seed, config.vendor)
# get the tokenizer
base_path = os.path.abspath(os.path.dirname(__file__))
tokenizer = GPT2Tokenizer(
@@ -70,7 +50,6 @@ def main():

evaluator = Evaluator(config, eval_dataloader)
training_state = TrainingState()
# trainer = Trainer(config, training_event, evaluator, training_state, device=device)
trainer = Trainer(driver=cpm_driver,
adapter=trainer_adapter,
evaluator=evaluator,
@@ -92,53 +71,56 @@ def main():
eval_loss=training_state.eval_avg_loss,
eval_embedding_average=training_state.eval_embedding_average,
time=init_evaluation_end - init_evaluation_start)
# training_event.on_init_evaluate(init_evaluation_info)
cpm_driver.event(Event.INIT_EVALUATION, init_evaluation_info)

if not config.do_train:
return config, training_state

# training_event.on_init_end()
cpm_driver.event(Event.INIT_END)
init_end_time = logger.previous_log_time
training_state.init_time = (init_end_time - init_start_time) / 1e+3

dist_pytorch.barrier(config.vendor)
epoch = -1
# training_event.on_train_begin()

cpm_driver.event(Event.TRAIN_START)
raw_train_start_time = logger.previous_log_time
train_start_time = time.time()
epoch = 0
while training_state.global_steps < config.max_steps and not training_state.end_training:
epoch += 1
training_state.epoch = epoch
trainer.train_one_epoch(train_dataloader)
# training_event.on_train_end()
epoch += 1
cpm_driver.event(Event.TRAIN_END)
raw_train_end_time = logger.previous_log_time
training_state.raw_train_time = (raw_train_end_time -
raw_train_start_time) / 1e+3
training_state.raw_train_time = time.time() - train_start_time
return config, training_state


if __name__ == "__main__":
now = time.time()
config, state = main()
config_updated, state = main()

if not dist_pytorch.is_main_process():
exit()

e2e_time = time.time() - now
training_perf = (dist_pytorch.global_batch_size(config) *
training_perf = (dist_pytorch.global_batch_size(config_updated) *
state.global_steps) / state.raw_train_time
if config.do_train:
if config_updated.do_train:
finished_info = {
"e2e_time": e2e_time,
"training_sequences_per_second": training_perf,
"converged": state.converged,
"final_loss": state.eval_avg_loss,
"final_mlm_accuracy": state.eval_embedding_average,
"raw_train_time": state.raw_train_time,
"init_time": state.init_time,
"raw_train_time": state.raw_train_time,
"train_no_eval_time": state.no_eval_time,
"pure_training_computing_time": state.pure_compute_time,
"throughput(ips)_raw":
state.num_trained_samples / state.raw_train_time,
"throughput(ips)_no_eval":
state.num_trained_samples / state.no_eval_time,
"throughput(ips)_pure_compute":
state.num_trained_samples / state.pure_compute_time,
}
else:
finished_info = {"e2e_time": e2e_time}
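The inline seeding removed above is now delegated to `InitHelper.set_seed(config.seed, config.vendor)`. A sketch of what that call presumably covers, reconstructed from the deleted lines rather than from the actual `driver.helper` implementation:

```python
# Sketch only: reconstructed from the seeding code removed from
# run_pretraining.py; not the actual driver.helper implementation.
import random

import numpy as np
import torch


def set_seed(seed: int, vendor: str = "nvidia") -> None:
    """Seed the Python, NumPy and PyTorch RNGs; cover CUDA when available."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # How the vendor argument is used is an assumption; the removed code
    # only checked torch.cuda.is_available().
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
```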
@@ -14,7 +14,6 @@
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""

import torch
from torch.optim.lr_scheduler import _LRScheduler
import math

7 changes: 0 additions & 7 deletions training/benchmarks/cpm/pytorch/train/evaluator.py
@@ -1,16 +1,9 @@
import os
import sys
import imp
import torch

import numpy as np
from train.metrics import average_corpus_level
from model.losses.cross_entropy import cross_entropy
from torch.nn import CrossEntropyLoss
from model.fp16 import FP16_Module

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import dist_pytorch


6 changes: 0 additions & 6 deletions training/benchmarks/cpm/pytorch/train/metrics.py
@@ -1,12 +1,6 @@
from __future__ import division
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import print_function

import numpy as np
import collections

import torch

__all__ = [
"CorpusLevelScore",
29 changes: 12 additions & 17 deletions training/benchmarks/cpm/pytorch/train/trainer.py
@@ -1,7 +1,5 @@
import math
import time
import os
import sys

import torch
from torch.types import Device
@@ -13,9 +11,6 @@
from train.training_state import TrainingState
from model.losses.cross_entropy import cross_entropy
from model.fp16 import FP16_Module

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import Driver, Event, dist_pytorch


@@ -66,22 +61,22 @@ def _init_model(self, model, device):
def train_one_epoch(self, dataloader):
state = self.training_state
driver = self.driver
#training_event = self.training_event
driver.event(Event.EPOCH_BEGIN, state.epoch)
#training_event.on_epoch_begin(state.epoch)

step_start_time = time.time()

for _, data in enumerate(dataloader):
no_eval_start_time = time.time()
batch, no_model_batch = data[0], data[1]

state.global_steps += 1
state.num_trained_samples = state.global_steps * dist_pytorch.global_batch_size(
self.config)

#self.training_event.on_step_begin(state.global_steps)
driver.event(Event.STEP_BEGIN, step=state.global_steps)
self.train_one_step(batch, no_model_batch)

self.training_state.no_eval_time += time.time(
) - no_eval_start_time
other_state = dict()
if state.global_steps % self.config.gradient_accumulation_steps == 0:
step_end_time = time.time()
@@ -107,20 +102,17 @@ def train_one_epoch(self, dataloader):
end_training = self.detect_training_status(state)

step_info = state.to_dict(**other_state)
#self.training_event.on_step_end(state.global_steps, result=step_info)
driver.event(Event.STEP_END,
message=step_info,
step=state.global_steps,
loss=state.loss)

if eval_result is not None:
#self.training_event.on_evaluate(eval_result)
driver.event(Event.EVALUATE, eval_result)

if end_training:
break

#training_event.on_epoch_end(state.epoch)
driver.event(Event.EPOCH_END, state.epoch)

def train_one_step(self, batch, no_model_batch):
@@ -129,6 +121,7 @@ def train_one_step(self, batch, no_model_batch):
for k in no_model_batch:
no_model_batch[k] = no_model_batch[k].to(self.device)

pure_compute_start_time = time.time()
state = self.training_state
self.model.train()

@@ -142,6 +135,12 @@
loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
state.loss = loss

self.adapter.backward(self.config, state.global_steps, state.loss,
self.optimizer)
self.training_state.pure_compute_time += time.time(
) - pure_compute_start_time

# calculate output
preds = torch.argmax(output, -1)
if isinstance(self.model.module, FP16_Module):
embeddings = self.model.module.module.word_embeddings.weight
@@ -155,11 +154,7 @@
embeddings.cpu().detach(),
no_model_batch["loss_mask"].cpu().detach())
state.embedding_average = float(embedding_average.mean)
#loss.backward()
#self.optimizer.step()
self.adapter.backward(self.config, state.global_steps, state.loss,
self.optimizer)
#self.training_event.on_backward(state.global_steps, state.loss, self.optimizer)

self.driver.event(Event.BACKWARD, state.global_steps, state.loss,
self.optimizer)
self.lr_scheduler.step()
2 changes: 2 additions & 0 deletions training/benchmarks/cpm/pytorch/train/training_state.py
@@ -26,6 +26,8 @@ class TrainingState:

init_time = 0
raw_train_time = 0
no_eval_time = 0
pure_compute_time = 0

def status(self):
if self.converged:
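Together with `raw_train_time`, the two new counters back the extra throughput figures that `run_pretraining.py` now reports. A worked sketch with made-up timer values shows how the three figures relate:

```python
# Worked sketch with made-up numbers: pure_compute_time <= no_eval_time
# <= raw_train_time, so the throughputs are ordered the other way around.
num_trained_samples = 100_000
raw_train_time = 1_000.0     # whole training loop, including evaluation
no_eval_time = 800.0         # training loop minus per-step evaluation
pure_compute_time = 600.0    # forward/backward only, excluding host-to-device copies

throughput_raw = num_trained_samples / raw_train_time               # 100 ips
throughput_no_eval = num_trained_samples / no_eval_time             # 125 ips
throughput_pure_compute = num_trained_samples / pure_compute_time   # ~167 ips

assert throughput_raw <= throughput_no_eval <= throughput_pure_compute
```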
33 changes: 26 additions & 7 deletions training/nvidia/cpm-pytorch/README.md
@@ -22,10 +22,29 @@

#### Results

| Training resources | Config file | Run time (s) | Target accuracy | Converged accuracy | Steps | Performance (samples/s) |
| ------------------ | --------------- | ------------ | --------------- | ------------------ | ----- | ----------------------- |
| 1 node, 1 GPU | config_A100x1x1 | 2016.20 | 0.8 | 0.8041 | 4375 | 77.65 |
| 1 node, 2 GPUs | config_A100x1x2 | 1767.69 | 0.8 | 0.8010 | 3756 | 151.41 |
| 1 node, 4 GPUs | config_A100x1x4 | 1651.28 | 0.8 | 0.8017 | 3454 | 298.22 |
| 1 node, 8 GPUs | config_A100x1x8 | 1648.99 | 0.92 | 0.9201 | 3397 | 586.92 |
| 2 nodes, 8 GPUs | config_A100x2x8 | 1453.51 | 0.92 | 0.9208 | 2760 | 1092.23 |
* Common metrics

| Metric | Value | Notes |
| ------------------------ | ---------------------------------------------- | ---------------------------------------------------------------------------------- |
| Task category | text classification, text generation | |
| Model | cpm | |
| Dataset | CPM-Finetune-data | |
| Data precision | precision, see "Performance metrics" | one of fp32/amp/fp16 |
| Hyperparameter overrides | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the hardware when measuring throughput |
| Hardware | nvidia A100 | |
| Device memory usage | mem (actual/total), see "Performance metrics" | commonly called "GPU memory", in GiB |
| End-to-end time | e2e_time, see "Performance metrics" | total time, including Perf initialization etc. |
| Overall throughput | p_whole, see "Performance metrics" | actual number of training samples divided by total time (performance_whole) |
| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
| **Compute throughput** | **p_core, see "Performance metrics"** | also excludes time spent on data I/O (p3 > p2 > p1) |
| Training result | acc, see "Performance metrics" | classification accuracy (mlm_accuracy) |
| Additional modifications | | |

* Performance metrics

| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem |
| --------------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- |
| A100 1 node x 8 GPUs (1x8) | fp16 | / | 1641 | 587 | 835 | 1059 | 0.92 | 12.9/40.0 |
| A100 1 node x 8 GPUs (1x8) | fp16 | bs=128,lr=0.002 | 5469 | 771 | 1090 | 1292 | 0.918 | 23.1/40.0 |
| A100 1 node x 1 GPU (1x1) | fp16 | bs=192,lr=0.0005 | | 78.4 | 111.9 | 127.2 | | 34.8/40.0 |
| A100 2 nodes x 8 GPUs (2x8) | fp16 | bs=192,lr=0.0005 | | 1583 | 2221 | 2583.8 | | 29.9/40.0 |
2 changes: 1 addition & 1 deletion training/nvidia/cpm-pytorch/config/config_A100x1x1.py
@@ -9,7 +9,7 @@

train_batch_size = 32
eval_batch_size = train_batch_size
max_steps = 4000000
max_steps = 60000
max_samples_termination = 439126000

warmup = 0.2
4 changes: 2 additions & 2 deletions training/nvidia/cpm-pytorch/config/config_A100x2x8.py
@@ -8,9 +8,9 @@

gradient_accumulation_steps = 1

train_batch_size = 32
train_batch_size = 192
eval_batch_size = train_batch_size
max_steps = 10000
max_steps = 2000

warmup = 0.2
learning_rate = 0.0005
