import transformers
import torch
import os
import json
import random
import numpy as np
import argparse
from datetime import datetime
from tqdm import tqdm
from torch.nn import DataParallel


def build_files(data_path, tokenized_data_path, num_pieces, full_tokenizer, min_length):
    if not os.path.exists(tokenized_data_path):
        os.mkdir(tokenized_data_path)
    with open(data_path, 'r', encoding='utf8') as f:
        print('reading lines')
        lines = json.load(f)
        lines = [line.replace('\n', ' [SEP] ') for line in lines]  # [SEP] marks line breaks; paragraphs end with [SEP]
    all_len = len(lines)
    for i in tqdm(range(num_pieces)):
        sublines = lines[all_len // num_pieces * i: all_len // num_pieces * (i + 1)]
        if i == num_pieces - 1:
            sublines.extend(lines[all_len // num_pieces * (i + 1):])  # append the leftover lines to the last piece
        sublines = [full_tokenizer.tokenize(line) for line in sublines if len(line) > min_length]  # only keep articles longer than min_length
        sublines = [full_tokenizer.convert_tokens_to_ids(line) for line in sublines]
        full_line = []
        for subline in sublines:
            full_line.append(full_tokenizer.convert_tokens_to_ids('[MASK]'))  # prepend [MASK] to mark the start of an article
            full_line.extend(subline)
            full_line.append(full_tokenizer.convert_tokens_to_ids('[CLS]'))  # append [CLS] to mark the end of an article
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            for token_id in full_line:
                f.write(str(token_id) + ' ')
    print('finish')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False, help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json', type=str, required=False,
                        help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt', type=str, required=False,
                        help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/eval.json', type=str, required=False, help='raw corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized_eval/', type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='whether to tokenize the raw corpus first')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='batch size')
    parser.add_argument('--log_step', default=1, type=int, required=False, help='report every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False, help='stride of the sliding window over the data')
    parser.add_argument('--num_pieces', default=100, type=int, required=False, help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False, help='minimum article length to keep')
    parser.add_argument('--pretrained_model', default='', type=str, required=False, help='path of the trained model')
    parser.add_argument('--output_dir', default='eval_result/', type=str, required=False, help='where to write the result')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    # if args.no_wordpiece:
    #     from tokenizations import tokenization_bert_without_wordpiece as tokenization_bert
    # else:
    from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())

    n_ctx = model_config.n_ctx
    full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the tokenized dataset from scratch
    batch_size = args.batch_size
    log_step = args.log_step
    stride = args.stride
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path, tokenized_data_path=tokenized_data_path, num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer, min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        print('you need to specify a trained model.')
        exit(1)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.eval()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting evaluation')
    overall_step = 0
    total_loss = 0
    total_steps = 0

    now = datetime.now()
    print('time: {}'.format(now))
    piece_num = 0
    for i in range(num_pieces):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            line = f.read().strip()
        tokens = line.split()
        tokens = [int(token) for token in tokens]
        start_point = 0
        samples = []
        while start_point < len(tokens) - n_ctx:
            samples.append(tokens[start_point: start_point + n_ctx])
            start_point += stride
        start_point -= stride
        last = tokens[start_point + n_ctx:]
        last.extend([full_tokenizer.convert_tokens_to_ids('[PAD]')] * (n_ctx - len(last)))  # pad the tail window to n_ctx; note it is not added to samples
        random.shuffle(samples)
        for step in range(len(samples) // batch_size):  # drop last
            # prepare data
            batch = samples[step * batch_size: (step + 1) * batch_size]
            batch_labels = []
            batch_inputs = []
            for ids in batch:
                int_ids_for_labels = [int(x) for x in ids]
                int_ids_for_inputs = [int(x) for x in ids]
                batch_labels.append(int_ids_for_labels)
                batch_inputs.append(int_ids_for_inputs)
            batch_labels = torch.tensor(batch_labels).long().to(device)
            batch_inputs = torch.tensor(batch_inputs).long().to(device)

            # forward pass (no gradients needed for evaluation)
            with torch.no_grad():
                outputs = model(input_ids=batch_inputs, labels=batch_labels)
            loss, logits = outputs[:2]

            # accumulate loss
            if multi_gpu:
                loss = loss.mean()
            total_loss += loss.item()
            total_steps += 1

            if (overall_step + 1) % log_step == 0:
                print('now time: {}:{}. Step {} of piece {}, ppl {}'.format(
                    datetime.now().hour,
                    datetime.now().minute,
                    step + 1,
                    piece_num,
                    torch.exp(loss)))
            overall_step += 1
        piece_num += 1

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    with open(args.output_dir + 'result.txt', 'w') as f:
        f.write(str(np.exp(total_loss / total_steps)))  # corpus-level perplexity


if __name__ == '__main__':
    main()
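# Example invocation, assuming this file is saved as eval.py; the model and data
# paths below are illustrative placeholders, not files shipped with this script:
#
#   python eval.py --device 0 --raw \
#       --raw_data_path data/eval.json \
#       --tokenized_data_path data/tokenized_eval/ \
#       --pretrained_model model/final_model \
#       --output_dir eval_result/
#
# The script writes the resulting perplexity to <output_dir>/result.txt.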