From 2b1e289dec0838c275e003f3d75abfe2014f1a15 Mon Sep 17 00:00:00 2001
From: SeasonMay <1447833641@qq.com>
Date: Thu, 24 Apr 2025 09:15:06 +0800
Subject: [PATCH] [Open Source Internship] BARTpho model fine-tuning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 llm/finetune/bartpho/bartpho_finetune.md      |  27 ++
 .../bartpho/bartpho_finetune_mindnlp.py       | 178 ++++++++++
 .../bartpho/bartpho_finetune_pytorch.py       | 307 ++++++++++++++++++
 3 files changed, 512 insertions(+)
 create mode 100644 llm/finetune/bartpho/bartpho_finetune.md
 create mode 100644 llm/finetune/bartpho/bartpho_finetune_mindnlp.py
 create mode 100644 llm/finetune/bartpho/bartpho_finetune_pytorch.py

diff --git a/llm/finetune/bartpho/bartpho_finetune.md b/llm/finetune/bartpho/bartpho_finetune.md
new file mode 100644
index 000000000..9c81c1fb9
--- /dev/null
+++ b/llm/finetune/bartpho/bartpho_finetune.md
@@ -0,0 +1,27 @@
+# Fine-tuning BARTpho
+
+## Dataset resources
+
+HuyPham235711/BARTpho_Corrector
+
+Vietnamese text error-correction task.
+
+## MindNLP + Ascend
+
+| Epoch | train_loss | eval_loss | BLEU    |
+| :---: | ---------- | --------- | ------- |
+|   1   | 0.4209     | 0.3085    | 86.5538 |
+|   2   | 0.3052     | 0.2757    | 87.7926 |
+|   3   | 0.2307     | 0.2531    | 88.8163 |
+|   4   | 0.1864     | 0.2303    | 89.1350 |
+|   5   | 0.1535     | 0.2048    | 90.4295 |
+
+## PyTorch + CUDA
+
+| Epoch | train_loss | eval_loss | BLEU    |
+| :---: | ---------- | --------- | ------- |
+|   1   | 0.9188     | 0.3580    | 86.3481 |
+|   2   | 0.4843     | 0.3022    | 86.9550 |
+|   3   | 0.3602     | 0.2728    | 87.5816 |
+|   4   | 0.2866     | 0.2623    | 88.8584 |
+|   5   | 0.2312     | 0.2659    | 89.8917 |
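+
+## Inference example
+
+A minimal sketch (not part of the training scripts) of how the fine-tuned weights could be used for correction with MindNLP. It assumes the MindNLP run above, which writes the model to `./saved_model_weights` but does not save the tokenizer, so the tokenizer is reloaded from the original `vinai/bartpho-syllable` checkpoint; the input sentence and generation settings are illustrative only.
+
+```python
+from mindnlp.transformers import AutoModelForSeq2SeqLM, BartphoTokenizer
+
+tokenizer = BartphoTokenizer.from_pretrained("vinai/bartpho-syllable")
+model = AutoModelForSeq2SeqLM.from_pretrained("./saved_model_weights")
+
+# A noisy Vietnamese sentence to correct (illustrative input)
+text = "toi dang hoc tieng viet"
+inputs = tokenizer(text, return_tensors="ms")
+
+# Generate and decode the corrected sentence
+output_ids = model.generate(**inputs, max_length=32)
+print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
+```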
diff --git a/llm/finetune/bartpho/bartpho_finetune_mindnlp.py b/llm/finetune/bartpho/bartpho_finetune_mindnlp.py
new file mode 100644
index 000000000..e0fee4544
--- /dev/null
+++ b/llm/finetune/bartpho/bartpho_finetune_mindnlp.py
@@ -0,0 +1,178 @@
+import os
+import numpy as np
+import random
+import mindspore as ms
+from mindspore import nn, ops, Tensor, set_seed
+from mindspore.dataset import GeneratorDataset
+from mindnlp.transformers import AutoModelForSeq2SeqLM, BartphoTokenizer
+from mindnlp.engine import Trainer, TrainingArguments, TrainerCallback
+from datasets import load_dataset
+
+import evaluate
+
+# Load the evaluation metric
+sacrebleu_metric = evaluate.load("sacrebleu")
+
+# Model name and output path
+MODEL_NAME = "vinai/bartpho-syllable"
+MAX_LENGTH = 32  # maximum sequence length
+output_dir = './saved_model_weights'  # where the fine-tuned model is saved
+os.makedirs(output_dir, exist_ok=True)
+
+
+# Custom callback that prints the loss at the end of every epoch
+class LossLoggerCallback(TrainerCallback):
+    def on_epoch_end(self, args, state, control, **kwargs):
+        """Called at the end of every epoch."""
+        # Current training state
+        epoch = state.epoch
+        loss = state.log_history[-1].get('loss', 0.0) if state.log_history else 0.0
+
+        # Print the training loss for this epoch
+        print(f"Epoch {epoch}: train_loss = {loss:.6f}")
+
+        # Print the evaluation results as well, if available
+        if state.log_history and 'eval_loss' in state.log_history[-1]:
+            eval_loss = state.log_history[-1].get('eval_loss', 0.0)
+            eval_metric = state.log_history[-1].get('eval_sacrebleu', 0.0)
+            print(f"Epoch {epoch}: eval_loss = {eval_loss:.6f}, eval_sacrebleu = {eval_metric:.4f}")
+
+
+# Data preprocessing
+def preprocess_function(examples):
+    # Tokenize the noisy inputs and the corrected targets
+    return tokenizer(
+        examples["error"],
+        text_target=examples["original"],
+        max_length=MAX_LENGTH,
+        truncation=True,
+        padding="max_length"
+    )
+
+
+# Compute the evaluation metric
+def compute_metrics(eval_preds):
+    preds, labels = eval_preds
+
+    # If the model output is a tuple, take the first element (the predictions)
+    if isinstance(preds, tuple):
+        preds = preds[0]
+
+    # Decode the predictions and labels
+    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+
+    # Replace the padding value in the labels before decoding
+    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+    # Light post-processing
+    decoded_preds = [pred.strip() for pred in decoded_preds]
+    decoded_labels = [[label.strip()] for label in decoded_labels]  # sacrebleu expects a list of reference lists
+
+    # Compute the BLEU score
+    result = sacrebleu_metric.compute(
+        predictions=decoded_preds,
+        references=decoded_labels
+    )
+
+    return {
+        "sacrebleu": round(result["score"], 4)
+    }
+
+
+# Build a MindSpore dataset from the tokenized examples
+def create_mindspore_dataset(data, batch_size=8):
+    data_list = list(data)
+
+    def generator():
+        for item in data_list:
+            yield (
+                Tensor(item["input_ids"], dtype=ms.int32),
+                Tensor(item["attention_mask"], dtype=ms.int32),
+                Tensor(item["labels"], dtype=ms.int32)
+            )
+
+    return GeneratorDataset(
+        generator,
+        column_names=["input_ids", "attention_mask", "labels"]
+    ).batch(batch_size)
+
+
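+# Note: during evaluation the Trainer accumulates every batch's predictions before
+# calling compute_metrics. Keeping full [batch, seq_len, vocab_size] logits in that
+# buffer can exhaust device memory, so the helper below reduces them to predicted
+# token ids first.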
+# Reduce the logits to token ids before metrics are computed
+def preprocess_logits_for_metrics(logits, labels):
+    """Prevent out-of-memory during evaluation."""
+    pred_ids = ms.mint.argmax(logits[0], dim=-1)
+    return pred_ids, labels
+
+
+# Main entry point
+def main():
+    global tokenizer  # make the tokenizer visible to the module-level helpers
+
+    # Load the model and tokenizer
+    print("Loading model and tokenizer...")
+    tokenizer = BartphoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+
+    # Load the dataset
+    print("Loading dataset...")
+    train_path = './train.csv'
+    test_path = './test.csv'
+    dataset = load_dataset("csv", data_files={"train": train_path, "test": test_path})
+
+    print(f"Number of training samples: {len(dataset['train'])}")
+    print(f"Number of test samples: {len(dataset['test'])}")
+
+    # Tokenize the dataset
+    print("Preprocessing data...")
+    tokenized_datasets = dataset.map(
+        preprocess_function,
+        batched=True,
+        remove_columns=dataset["train"].column_names,
+    )
+
+    # Build the MindSpore datasets
+    print("Creating MindSpore datasets...")
+    train_dataset = create_mindspore_dataset(tokenized_datasets["train"], batch_size=8)
+    eval_dataset = create_mindspore_dataset(tokenized_datasets["test"], batch_size=8)
+
+    # Training arguments
+    training_args = TrainingArguments(
+        output_dir="./results",
+        evaluation_strategy="epoch",
+        learning_rate=1e-5,
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        num_train_epochs=5,
+        weight_decay=0.01,
+        save_strategy="epoch",
+        save_total_limit=2,
+    )
+
+    # Initialize the trainer
+    print("Initializing the trainer...")
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
+        callbacks=[LossLoggerCallback()]
+    )
+
+    # Train
+    print("Starting training...")
+    trainer.train()
+    # Save the model
+    print(f"Training finished, saving the model to {output_dir}...")
+    model.save_pretrained(output_dir)
+    # Final evaluation
+    print("Running the final evaluation...")
+    eval_results = trainer.evaluate()
+    print(f"Final evaluation results: {eval_results}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/llm/finetune/bartpho/bartpho_finetune_pytorch.py b/llm/finetune/bartpho/bartpho_finetune_pytorch.py
new file mode 100644
index 000000000..76d159777
--- /dev/null
+++ b/llm/finetune/bartpho/bartpho_finetune_pytorch.py
@@ -0,0 +1,307 @@
+import os
+import torch
+import numpy as np
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import AdamW
+from torch.nn import CrossEntropyLoss
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_linear_schedule_with_warmup
+from datasets import load_dataset
+import evaluate
+from tqdm import tqdm
+import pandas as pd
+import logging
+import random
+import warnings
+import itertools
+
+warnings.filterwarnings("ignore")
+
+# Logging setup
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+# Configuration
+MODEL_NAME = "vinai/bartpho-syllable"
+MAX_LENGTH = 32  # keep the maximum sequence length small for better throughput
+BATCH_SIZE = 8
+LEARNING_RATE = 1e-5
+WEIGHT_DECAY = 0.01
+EPOCHS = 5
+WARMUP_STEPS = 500
+OUTPUT_DIR = './custom_model_weights'
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+# HuggingFace mirror endpoint (if needed)
+os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
+
+# Load the dataset
+logger.info("Loading dataset...")
+train_path = './train.csv'
+test_path = './test.csv'
+dataset = load_dataset("csv", data_files={"train": train_path, "test": test_path})
+
+# Load the tokenizer
+logger.info(f"Loading tokenizer: {MODEL_NAME}")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+logger.info(f"Tokenizer loaded: {MODEL_NAME}")
+
+# Load the evaluation metric (a local copy of sacrebleu)
+metric = evaluate.load("/root/autodl-tmp/evaluate/metrics/sacrebleu")
+logger.info("Evaluation metric loaded")
+
+
+# Custom dataset
+class VietnameseTextCorrectionDataset(Dataset):
+    def __init__(self, data, tokenizer, max_length):
+        self.data = data
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        item = self.data[idx]
+        error_text = item['error']
+        original_text = item['original']
+
+        # Encode the noisy input text
+        inputs = self.tokenizer(
+            error_text,
+            max_length=self.max_length,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt'
+        )
+
+        # Encode the target (corrected) text
+        with self.tokenizer.as_target_tokenizer():
+            targets = self.tokenizer(
+                original_text,
+                max_length=self.max_length,
+                padding='max_length',
+                truncation=True,
+                return_tensors='pt'
+            )
+
+        input_ids = inputs['input_ids'].squeeze()
+        attention_mask = inputs['attention_mask'].squeeze()
+        labels = targets['input_ids'].squeeze()
+
+        # Replace padding tokens with -100 so they are ignored by the loss
+        labels[labels == self.tokenizer.pad_token_id] = -100
+
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'labels': labels
+        }
+
+
+# Build the dataset instances
+logger.info("Creating dataset instances...")
+train_dataset = VietnameseTextCorrectionDataset(dataset['train'], tokenizer, MAX_LENGTH)
+val_dataset = VietnameseTextCorrectionDataset(dataset['test'], tokenizer, MAX_LENGTH)
+
+logger.info(f"Training set size: {len(train_dataset)}")
+logger.info(f"Validation set size: {len(val_dataset)}")
+
+# Data loaders
+train_dataloader = DataLoader(
+    train_dataset,
+    batch_size=BATCH_SIZE,
+    shuffle=True
+)
+
+val_dataloader = DataLoader(
+    val_dataset,
+    batch_size=BATCH_SIZE,
+    shuffle=False
+)
+
+# Load the model
+logger.info(f"Loading model: {MODEL_NAME}")
+model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+
+# Move the model to the GPU if one is available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+logger.info(f"Using device: {device}")
+model.to(device)
+
+# Optimizer
+optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+
+# Total number of training steps and the learning-rate scheduler (no gradient accumulation)
+total_steps = len(train_dataloader) * EPOCHS
+scheduler = get_linear_schedule_with_warmup(
+    optimizer,
+    num_warmup_steps=WARMUP_STEPS,
+    num_training_steps=total_steps
+)
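+# Note: this schedule raises the learning rate linearly from 0 over the first
+# WARMUP_STEPS optimizer updates and then decays it linearly to 0 by total_steps,
+# so WARMUP_STEPS should stay well below len(train_dataloader) * EPOCHS.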
+
+
+# BLEU score computation using the evaluate library
+def compute_bleu(preds, labels):
+    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+
+    # Replace -100 in the labels before decoding
+    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+    # Light post-processing
+    decoded_preds = [pred.strip() for pred in decoded_preds]
+    decoded_labels = [[label.strip()] for label in decoded_labels]
+
+    # Compute the metric with the sacrebleu implementation loaded through evaluate
+    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+
+    # Print a few examples for debugging
+    # for i in range(min(3, len(decoded_preds))):
+    #     logger.debug(f"Prediction: {decoded_preds[i]}")
+    #     logger.debug(f"Reference: {decoded_labels[i][0]}")
+    #     logger.debug("---")
+
+    return result["score"]
+
+
+# Evaluation loop (quick pass over a subset, or a full pass)
+def run_evaluation(full_validation=False):
+    model.eval()
+    val_loss = 0
+    all_preds = []
+    all_labels = []
+
+    # During training, run a quick validation on a subset of the validation data
+    eval_dataloader = val_dataloader if full_validation else itertools.islice(val_dataloader, 10)
+
+    with torch.no_grad():
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            labels = batch['labels'].to(device)
+
+            # Validation loss
+            outputs = model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                labels=labels
+            )
+
+            loss = outputs.loss
+            val_loss += loss.item()
+
+            # Generate predictions
+            generation_params = {
+                'input_ids': input_ids,
+                'attention_mask': attention_mask,
+                'max_length': MAX_LENGTH,
+            }
+
+            if full_validation:
+                generation_params['num_beams'] = 4  # use beam search for the full validation
+
+            generated_ids = model.generate(**generation_params)
+
+            all_preds.extend(generated_ids.detach().cpu().numpy())
+            all_labels.extend(labels.detach().cpu().numpy())
+
+    # Average validation loss
+    divisor = len(val_dataloader) if full_validation else 10
+    avg_val_loss = val_loss / divisor
+
+    # BLEU score
+    bleu_score = compute_bleu(all_preds, np.array(all_labels))
+
+    return bleu_score, avg_val_loss
+
+
+# Training loop (no gradient accumulation)
+def train():
+    logger.info("Starting training...")
+
+    # Track the best BLEU score seen so far
+    best_bleu = 0.0
+
+    for epoch in range(EPOCHS):
+        model.train()
+        total_loss = 0
+
+        progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
+        for step, batch in progress_bar:
+            # Move the batch to the device
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            labels = batch['labels'].to(device)
+
+            # Reset the gradients
+            optimizer.zero_grad()
+
+            # Forward pass
+            outputs = model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                labels=labels
+            )
+
+            loss = outputs.loss
+
+            # Backward pass
+            loss.backward()
+
+            # Update the parameters immediately
+            optimizer.step()
+
+            # Update the learning rate
+            scheduler.step()
+
+            # Accumulate the loss
+            total_loss += loss.item()
+
+            # Update the progress bar
+            progress_bar.set_description(f"Epoch {epoch + 1}/{EPOCHS} | loss: {total_loss / (step + 1):.4f}")
+
+            # Periodically log the running loss
+            if step > 0 and step % 1000 == 0:
+                logger.info(f"Epoch {epoch + 1}, step {step}: loss = {total_loss / (step + 1):.4f}")
+
+        # Average training loss
+        avg_train_loss = total_loss / len(train_dataloader)
+        logger.info(f"Epoch {epoch + 1}/{EPOCHS} - average training loss: {avg_train_loss:.4f}")
+
+        # Quick validation during training
+        bleu_score, val_loss = run_evaluation(full_validation=False)
+        logger.info(f"Epoch {epoch + 1}/{EPOCHS} - quick validation - loss: {val_loss:.4f}, BLEU: {bleu_score:.4f}")
+
+        # Run the full validation only at certain intervals
+        if (epoch + 1) % 2 == 0 or epoch == EPOCHS - 1:
+            logger.info("Running full validation...")
+            full_bleu_score, full_val_loss = run_evaluation(full_validation=True)
+            logger.info(
+                f"Epoch {epoch + 1}/{EPOCHS} - full validation - loss: {full_val_loss:.4f}, BLEU: {full_bleu_score:.4f}")
+            bleu_score = full_bleu_score
+
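+        # Note: bleu_score holds the full-set BLEU on epochs where full validation ran,
+        # and the quick 10-batch estimate otherwise, so the best-model check below
+        # compares a mix of the two.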
+        # Save the best model
+        if bleu_score > best_bleu:
+            best_bleu = bleu_score
+            save_path = os.path.join(OUTPUT_DIR, f"best_model_epoch_{epoch + 1}")
+            model.save_pretrained(save_path)
+            tokenizer.save_pretrained(save_path)
+            logger.info(f"Saved a new best model at epoch {epoch + 1} with BLEU score: {best_bleu:.4f}")
+
+    # Save the final model after training
+    final_path = os.path.join(OUTPUT_DIR, "final_model")
+    model.save_pretrained(final_path)
+    tokenizer.save_pretrained(final_path)
+    logger.info(f"Training finished! Final model saved to {final_path}")
+
+    return best_bleu
+
+
+if __name__ == "__main__":
+    # Start training
+    best_bleu = train()
+    logger.info(f"Training finished, best BLEU score: {best_bleu:.4f}")