From 246d03251ceb36f2186589556ed8b3540ee81b85 Mon Sep 17 00:00:00 2001
From: 4ever
Date: Thu, 6 Mar 2025 19:50:41 +0800
Subject: [PATCH 1/6] BEiT finetune

---
 llm/finetune/BEiT/BEiT finetune.md            |  30 ++
 .../BEiT/BEiT finetune.md:Zone.Identifier     |   0
 llm/finetune/BEiT/BEiT_mind.py                | 149 ++++++++++++++++++
 .../BEiT/BEiT_mind.py:Zone.Identifier         |   0
 4 files changed, 179 insertions(+)
 create mode 100644 llm/finetune/BEiT/BEiT finetune.md
 create mode 100644 llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier
 create mode 100644 llm/finetune/BEiT/BEiT_mind.py
 create mode 100644 llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier

diff --git a/llm/finetune/BEiT/BEiT finetune.md b/llm/finetune/BEiT/BEiT finetune.md
new file mode 100644
index 000000000..93da8f116
--- /dev/null
+++ b/llm/finetune/BEiT/BEiT finetune.md
@@ -0,0 +1,30 @@
+# Finetune the microsoft/beit-base-patch16-224 model
+- base model: [microsoft/beit-base-patch16-224](https://huggingface.co/microsoft/beit-base-patch16-224)
+- dataset: [cifar10](https://huggingface.co/datasets/uoft-cs/cifar10)
+- PyTorch version of this finetune: [GitHub](https://github.com/4everImmortality/microsoft-beit-cifar10-finetune)
+# Requirements
+## PyTorch
+- GPU: RTX 4070 Ti 12G
+- CUDA: 11.8
+- Python version: 3.10
+- torch version: 2.5.0
+- transformers version: 4.47.0
+## MindSpore
+- Ascend: 910B
+- python: 3.9
+- mindspore: 2.3.1
+- mindnlp: 0.4.0
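+## Inference sketch
+A minimal way to try the finetuned weights (a sketch, not part of the training
+run above: the checkpoint path and image file are placeholders; point them at
+the best checkpoint the Trainer saved under `checkpoints/`):
+```python
+import numpy as np
+from PIL import Image
+from mindnlp.transformers import AutoImageProcessor, BeitForImageClassification
+
+processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
+model = BeitForImageClassification.from_pretrained("checkpoints/best")  # placeholder path
+model.set_train(False)  # switch to eval mode
+
+image = Image.open("example.png").convert("RGB")  # placeholder image
+inputs = processor(images=image, return_tensors="ms")
+logits = model(**inputs).logits
+pred = int(np.argmax(logits.asnumpy(), axis=-1)[0])
+print(model.config.id2label[pred])
+```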
+# Finetuning results
+Training for 3 epochs.
+## torch
+| Epoch | eval_loss | eval_accuracy |
+|-------|-----------|---------------|
+| 1     | 0.193     | 94.4%         |
+| 2     | 0.157     | 95.4%         |
+| 3     | 0.117     | 96.2%         |
+## mindspore
+| Epoch | eval_loss | eval_accuracy |
+|-------|-----------|---------------|
+| 1     | 0.416     | 96.4%         |
+| 2     | 0.193     | 96.8%         |
+| 3     | 0.158     | 97.2%         |
\ No newline at end of file
diff --git a/llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier b/llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier
new file mode 100644
index 000000000..e69de29bb
diff --git a/llm/finetune/BEiT/BEiT_mind.py b/llm/finetune/BEiT/BEiT_mind.py
new file mode 100644
index 000000000..61416ac03
--- /dev/null
+++ b/llm/finetune/BEiT/BEiT_mind.py
@@ -0,0 +1,149 @@
+from mindspore.dataset import GeneratorDataset as ds_GeneratorDataset
+import numpy as np
+from sklearn.metrics import accuracy_score
+from mindnlp.engine import TrainingArguments, Trainer
+from mindnlp.transformers import AutoImageProcessor, BeitForImageClassification
+import mindspore
+from mindspore.dataset.transforms.py_transforms import Compose
+from mindspore.dataset.vision.py_transforms import (
+    RandomResizedCrop, RandomHorizontalFlip, Resize, CenterCrop, ToTensor, Normalize
+)
+from datasets import load_dataset
+
+# Load the dataset
+train_ds, test_ds = load_dataset(
+    'uoft-cs/cifar10', split=['train[:5000]', 'test[:2000]'])
+splits = train_ds.train_test_split(test_size=0.1)
+train_ds_hf = splits['train']
+val_ds_hf = splits['test']
+test_ds_hf = test_ds
+
+# Build the label mappings
+id2label = {id: label for id, label in enumerate(
+    train_ds_hf.features['label'].names)}
+label2id = {label: id for id, label in id2label.items()}
+
+# Initialize the image processor
+processor = AutoImageProcessor.from_pretrained(
+    'microsoft/beit-base-patch16-224')
+image_mean, image_std = processor.image_mean, processor.image_std
+size = processor.size["height"]
+
+# Define the preprocessing pipelines
+normalize = Normalize(mean=image_mean, std=image_std)
+transform_train = Compose([
+    RandomResizedCrop(size),
+    RandomHorizontalFlip(),
+    ToTensor(),
+    normalize,
+])
+transform_val = Compose([
+    Resize(size),
+    CenterCrop(size),
+    ToTensor(),
+    normalize,
+])
+
+# Define the Hugging Face data transforms
+
+
+def train_transforms(examples):
+    examples['pixel_values'] = [transform_train(
+        image.convert("RGB")) for image in examples['img']]
+    return examples
+
+
+def val_transforms(examples):
+    examples['pixel_values'] = [transform_val(
+        image.convert("RGB")) for image in examples['img']]
+    return examples
+
+
+# Apply the transforms to the raw datasets
+train_ds_hf.set_transform(train_transforms)
+val_ds_hf.set_transform(val_transforms)
+test_ds_hf.set_transform(val_transforms)
+
+# ???MindSpore Dataset(??????)
+def create_mindspore_dataset(hf_dataset):
+    def generator():
+        for example in hf_dataset:
+            # ??????????????numpy??
+            pixel_data = np.array(example['pixel_values'], dtype=np.float32)
+            
+            # ??:?????????
+            # print("Raw pixel_data shape:", pixel_data.shape)  # ?? (C, H, W) ? (3, 224, 224)
+            
+            # ?????,?????????
+            if pixel_data.ndim == 4 and pixel_data.shape[0] == 1:
+                pixel_data = pixel_data.squeeze(0)  # ? (1, C, H, W) ?? (C, H, W)
+            
+            yield pixel_data, np.int32(example['label'])
+    
+    return ds_GeneratorDataset(
+        generator, 
+        column_names=['pixel_values', 'labels'],
+        column_types=[mindspore.float32, mindspore.int32]
+    )
+
+# ????????????
+train_ds = create_mindspore_dataset(train_ds_hf).batch(10, drop_remainder=True)  # ????????
+val_ds = create_mindspore_dataset(val_ds_hf).batch(4, drop_remainder=True)
+test_ds = create_mindspore_dataset(test_ds_hf).batch(4, drop_remainder=True)
+
+# ??????
+#for batch in train_ds.create_tuple_iterator():
+#    pixel_batch, label_batch = batch
+#    print("Batch shape:", pixel_batch.shape)  # ??? (10, 3, 224, 224)
+#    break
+
+# Load the model
+# ??????,??????(????)
+args = TrainingArguments(
+    output_dir="checkpoints",
+    save_strategy="epoch",
+    evaluation_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=10,
+    per_device_eval_batch_size=4,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",
+    logging_dir='logs',
+    remove_unused_columns=False,
+    max_grad_norm=0.0, # Disable gradient clipping; otherwise "Infer type failed."
+)
+
+# ?????????????
+model = BeitForImageClassification.from_pretrained(
+    'microsoft/beit-base-patch16-224',
+    num_labels=10,
+    id2label=id2label,
+    label2id=label2id,
+    ignore_mismatched_sizes=True,
+    
+)
+
+# Define evaluation metrics
+
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return {"accuracy": accuracy_score(predictions, labels)}
+
+
+# Initialize the Trainer
+trainer = Trainer(
+    model=model,
+    args=args,
+    train_dataset=train_ds,
+    eval_dataset=val_ds,
+    compute_metrics=compute_metrics,
+    tokenizer=processor,
+)
+
+# Start training
+trainer.train()
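+
+# Sketch (not part of the original run): test_ds is built above but never
+# consumed; a final check on the held-out test split could reuse the Trainer:
+# test_metrics = trainer.evaluate(test_ds)
+# print("Test metrics:", test_metrics)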
diff --git a/llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier b/llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier
new file mode 100644
index 000000000..e69de29bb

From b4ebba0a06b9040b0ea946cc5d93d31c423f3b1d Mon Sep 17 00:00:00 2001
From: "Mr.Lawrence" <69960602+4everImmortality@users.noreply.github.com>
Date: Thu, 6 Mar 2025 20:24:03 +0800
Subject: [PATCH 2/6] Delete llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier

---
 llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier

diff --git a/llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier b/llm/finetune/BEiT/BEiT finetune.md:Zone.Identifier
deleted file mode 100644
index e69de29bb..000000000

From b5b2904d0ca25b5d9134c162b6c198ff313f5383 Mon Sep 17 00:00:00 2001
From: "Mr.Lawrence" <69960602+4everImmortality@users.noreply.github.com>
Date: Thu, 6 Mar 2025 20:24:15 +0800
Subject: [PATCH 3/6] Delete llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier

---
 llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier

diff --git a/llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier b/llm/finetune/BEiT/BEiT_mind.py:Zone.Identifier
deleted file mode 100644
index e69de29bb..000000000

From df81e3767c6b8d7ec1279ab4880e72f18e607749 Mon Sep 17 00:00:00 2001
From: "Mr.Lawrence" <69960602+4everImmortality@users.noreply.github.com>
Date: Thu, 6 Mar 2025 20:35:10 +0800
Subject: [PATCH 4/6] Update BEiT_mind.py

---
 llm/finetune/BEiT/BEiT_mind.py | 45 +++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/llm/finetune/BEiT/BEiT_mind.py b/llm/finetune/BEiT/BEiT_mind.py
index 61416ac03..6f143d29b 100644
--- a/llm/finetune/BEiT/BEiT_mind.py
+++ b/llm/finetune/BEiT/BEiT_mind.py
@@ -65,41 +65,46 @@ def val_transforms(examples):
 val_ds_hf.set_transform(val_transforms)
 test_ds_hf.set_transform(val_transforms)
 
-# ???MindSpore Dataset(??????)
+# Create the MindSpore dataset
+
+
 def create_mindspore_dataset(hf_dataset):
     def generator():
         for example in hf_dataset:
-            # ??????????????numpy??
+            # Get the image data
            pixel_data = np.array(example['pixel_values'], dtype=np.float32)
-            
-            # ??:?????????
-            # print("Raw pixel_data shape:", pixel_data.shape)  # ?? (C, H, W) ? (3, 224, 224)
-            
-            # ?????,?????????
+
+            # Debug print
+            # print("Raw pixel_data shape:", pixel_data.shape)  # (C, H, W), e.g. (3, 224, 224)
+
+            # Handle the image dimensions
             if pixel_data.ndim == 4 and pixel_data.shape[0] == 1:
-                pixel_data = pixel_data.squeeze(0)  # ? (1, C, H, W) ?? (C, H, W)
-            
+                # (1, C, H, W) -> (C, H, W)
+                pixel_data = pixel_data.squeeze(0)
+
             yield pixel_data, np.int32(example['label'])
-    
+
     return ds_GeneratorDataset(
-        generator, 
+        generator,
         column_names=['pixel_values', 'labels'],
         column_types=[mindspore.float32, mindspore.int32]
     )
 
-# ????????????
-train_ds = create_mindspore_dataset(train_ds_hf).batch(10, drop_remainder=True)  # ????????
+
+# Create the datasets
+train_ds = create_mindspore_dataset(train_ds_hf).batch(
+    10, drop_remainder=True)  # batches of 10 samples
 val_ds = create_mindspore_dataset(val_ds_hf).batch(4, drop_remainder=True)
 test_ds = create_mindspore_dataset(test_ds_hf).batch(4, drop_remainder=True)
 
-# ??????
-#for batch in train_ds.create_tuple_iterator():
+# Debug print
+# for batch in train_ds.create_tuple_iterator():
 #    pixel_batch, label_batch = batch
-#    print("Batch shape:", pixel_batch.shape)  # ??? (10, 3, 224, 224)
+#    print("Batch shape:", pixel_batch.shape)  # shape (10, 3, 224, 224)
 #    break
 
 # Load the model
-# ??????,??????(????)
+# Initialize the training arguments
 args = TrainingArguments(
     output_dir="checkpoints",
     save_strategy="epoch",
@@ -113,17 +118,17 @@ def generator():
     metric_for_best_model="accuracy",
     logging_dir='logs',
     remove_unused_columns=False,
-    max_grad_norm=0.0, # Disable gradient clipping; otherwise "Infer type failed."
+    max_grad_norm=0.0,  # Disable gradient clipping; otherwise "Infer type failed."
 )
 
-# ?????????????
+# Initialize the model
 model = BeitForImageClassification.from_pretrained(
     'microsoft/beit-base-patch16-224',
     num_labels=10,
     id2label=id2label,
     label2id=label2id,
     ignore_mismatched_sizes=True,
-    
+
 )
 
 # Define evaluation metrics
From 5b908a41da21d15505e5392ae8dc5dbcbbfee693 Mon Sep 17 00:00:00 2001
From: 4ever
Date: Sun, 23 Mar 2025 17:43:23 +0800
Subject: [PATCH 5/6] albert StanfordIMDB finetune

---
 llm/finetune/albert/Albert_mind.py            | 130 ++++++++++++++++++
 .../albert/albert_StanfordIMDB_mindnlp.md     |  58 ++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 llm/finetune/albert/Albert_mind.py
 create mode 100644 llm/finetune/albert/albert_StanfordIMDB_mindnlp.md

diff --git a/llm/finetune/albert/Albert_mind.py b/llm/finetune/albert/Albert_mind.py
new file mode 100644
index 000000000..ca9a90174
--- /dev/null
+++ b/llm/finetune/albert/Albert_mind.py
@@ -0,0 +1,130 @@
+import random
+import mindspore as ms
+from mindspore import Tensor
+from mindspore.dataset import GeneratorDataset
+from mindnlp.transformers import AlbertTokenizer, AlbertForSequenceClassification
+from mindnlp.engine import Trainer, TrainingArguments
+from datasets import load_dataset
+import numpy as np
+import evaluate
+
+# 1. Load the pretrained model and tokenizer
+model_name = "albert-base-v1"
+tokenizer = AlbertTokenizer.from_pretrained(model_name)
+model = AlbertForSequenceClassification.from_pretrained(
+    model_name, num_labels=2)
+
+# 2. Load the IMDb dataset
+dataset = load_dataset("stanfordnlp/imdb", trust_remote_code=True)
+print("dataset:", dataset)
+
+# 3. Preprocessing function
+
+
+def tokenize_function(examples):
+    tokenized = tokenizer(
+        examples["text"],
+        padding="max_length",
+        truncation=True,
+        max_length=512
+    )
+    # Add the labels to the returned dict
+    tokenized["labels"] = examples["label"]
+    return tokenized
+
+
+# Apply the preprocessing
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+# Check the label distribution
+print("\n==== Label distribution check ====")
+
+# Training split
+train_labels = np.array(tokenized_datasets["train"]["labels"])
+print("Train label stats:")
+print("- unique values:", np.unique(train_labels))
+print("- distribution:", np.bincount(train_labels))
+
+# Test split
+test_labels = np.array(tokenized_datasets["test"]["labels"])
+print("\nTest label stats:")
+print("- unique values:", np.unique(test_labels))
+print("- distribution:", np.bincount(test_labels))
+
+# 4. Convert the dataset format
+
+
+def create_dataset(data, batch_size=8):
+    # Convert to a list so the order can be shuffled
+    data_list = list(data)
+    random.shuffle(data_list)  # Shuffle the samples
+
+    def generator():
+        for item in data_list:  # Iterate over the shuffled data
+            yield item["input_ids"], item["attention_mask"], Tensor(item["labels"], dtype=ms.int32)
+
+    return GeneratorDataset(generator(), ["input_ids", "attention_mask", "labels"]).batch(batch_size)
+
+
+train_dataset = create_dataset(tokenized_datasets["train"])
+eval_dataset = create_dataset(tokenized_datasets["test"])
+
+# 4. Load evaluation metrics
+accuracy = evaluate.load("accuracy")
+f1 = evaluate.load("f1")
+precision = evaluate.load("precision")
+recall = evaluate.load("recall")
+
+sample = next(iter(train_dataset))
+print("Input IDs:", sample[0])
+print("Attention Mask:", sample[1])
+print("Labels:", sample[2])
+
+# Custom metric computation function
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred  # Unpack directly into logits and labels
+    predictions = np.argmax(logits, axis=-1)
+
+    return {
+        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
+        "f1": f1.compute(predictions=predictions, references=labels, average="binary")["f1"],
+        "precision": precision.compute(predictions=predictions, references=labels, average="binary")["precision"],
+        "recall": recall.compute(predictions=predictions, references=labels, average="binary")["recall"]
+    }
+
+
+# 5. Configure the training arguments
+training_args = TrainingArguments(
+    num_train_epochs=3,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    learning_rate=1e-5,
+    weight_decay=0.01,
+    output_dir="./results",
+    logging_dir="./logs",
+    logging_steps=10,
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",  # Select the best model by accuracy
+    greater_is_better=True,  # Higher accuracy is better
+)
+
+# 6. Initialize and run the Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=compute_metrics,  # Attach the metric function
+)
+
+trainer.train()
+
+# 7. Evaluate the model
+eval_results = trainer.evaluate(eval_dataset)
+print(f"Evaluation results: {eval_results}")
+print("\nFinal evaluation results:")
+print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
+print(f"F1 Score: {eval_results['eval_f1']:.4f}")
+print(f"Precision: {eval_results['eval_precision']:.4f}")
+print(f"Recall: {eval_results['eval_recall']:.4f}")
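+
+# Sketch (not part of the original run): create_dataset above hands
+# GeneratorDataset a generator object via generator(), and a Python generator
+# can be consumed only once. If the pipeline ever needs to re-read the source
+# across epochs, passing the callable itself lets MindSpore recreate the
+# iterator each time:
+# GeneratorDataset(generator, ["input_ids", "attention_mask", "labels"])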
diff --git a/llm/finetune/albert/albert_StanfordIMDB_mindnlp.md b/llm/finetune/albert/albert_StanfordIMDB_mindnlp.md
new file mode 100644
index 000000000..b7012f6ba
--- /dev/null
+++ b/llm/finetune/albert/albert_StanfordIMDB_mindnlp.md
@@ -0,0 +1,58 @@
+# ALBERT MindNLP Stanford IMDB Review Finetune
+
+- ALBERT finetuning task (open-source internship): [Issue #IAUONP · MindSpore/community - Gitee.com](https://gitee.com/mindspore/community/issues/IAUONP)
+- Finetunes the albert-base-v1 base weights on the [Sentiment analysis of IMDb reviews - Stanford University] dataset
+
+- base model: [albert/albert-base-v1 · Hugging Face](https://huggingface.co/albert/albert-base-v1)
+- dataset: [stanfordnlp/imdb · Datasets at Hugging Face](https://huggingface.co/datasets/stanfordnlp/imdb)
+
+# Requirements
+## PyTorch
+
+- GPU: RTX 4070 Ti 12G
+- CUDA: 11.8
+- Python version: 3.10
+- torch version: 2.5.0
+- transformers version: 4.47.0
+
+## MindSpore (OpenI community, Ascend 910B compute)
+- Ascend: 910B
+- python: 3.11
+- mindspore: 2.5.0
+- mindnlp: 0.4.1
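+
+## Inference sketch
+A minimal way to try the finetuned weights (a sketch; the checkpoint path and
+review text below are placeholders, not from the training run):
+```python
+import numpy as np
+from mindnlp.transformers import AlbertTokenizer, AlbertForSequenceClassification
+
+tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
+model = AlbertForSequenceClassification.from_pretrained("results/best")  # placeholder path
+model.set_train(False)  # switch to eval mode
+
+text = "A moving film with a terrific cast."  # placeholder review
+inputs = tokenizer(text, return_tensors="ms", truncation=True, max_length=512)
+logits = model(**inputs).logits
+# IMDb labels: 0 = negative, 1 = positive
+print("positive" if int(np.argmax(logits.asnumpy(), axis=-1)[0]) == 1 else "negative")
+```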
+
+# Finetuning results
+
+Training for 3 epochs.
+
+## torch
+
+| Epoch              | eval_loss |
+| ------------------ | --------- |
+| 1                  | 0.3868    |
+| 2                  | 0.2978    |
+| 3                  | 0.3293    |
+| Evaluation results | 0.2978    |
+
+**Evaluation results**
+
+| Accuracy | Precision | Recall | F1_score |
+| -------- | --------- | ------ | -------- |
+| 0.9212   | 0.9218    | 0.9284 | 0.9218   |
+
+## mindspore
+
+| Epoch              | eval_loss |
+| ------------------ | --------- |
+| 1                  | 0.2677    |
+| 2                  | 0.2314    |
+| 3                  | 0.2332    |
+| Evaluation results | 0.2314    |
+
+**Evaluation results**
+
+| Accuracy | Precision | Recall | F1_score |
+| -------- | --------- | ------ | -------- |
+| 0.9219   | 0.9238    | 0.9218 | 0.9228   |

From df56c03c3ee7cdb454b8a1e2479439c5a6074ba3 Mon Sep 17 00:00:00 2001
From: 4ever
Date: Sun, 23 Mar 2025 17:53:58 +0800
Subject: [PATCH 6/6] albert StanfordIMDB finetune

---
 llm/finetune/albert/Albert_mind.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llm/finetune/albert/Albert_mind.py b/llm/finetune/albert/Albert_mind.py
index ca9a90174..68c41bc9a 100644
--- a/llm/finetune/albert/Albert_mind.py
+++ b/llm/finetune/albert/Albert_mind.py
@@ -67,7 +67,7 @@ def generator():
 train_dataset = create_dataset(tokenized_datasets["train"])
 eval_dataset = create_dataset(tokenized_datasets["test"])
 
-# 4. Load evaluation metrics
+# 5. Load evaluation metrics
 accuracy = evaluate.load("accuracy")
 f1 = evaluate.load("f1")
 precision = evaluate.load("precision")
@@ -91,7 +91,7 @@ def compute_metrics(eval_pred):
     }
 
 
-# 5. Configure the training arguments
+# 6. Configure the training arguments
 training_args = TrainingArguments(
     num_train_epochs=3,
     per_device_train_batch_size=8,
@@ -108,7 +108,7 @@ def compute_metrics(eval_pred):
     greater_is_better=True,  # Higher accuracy is better
 )
 
-# 6. Initialize and run the Trainer
+# 7. Initialize and run the Trainer
 trainer = Trainer(
     model=model,
     args=training_args,
@@ -119,7 +119,7 @@ def compute_metrics(eval_pred):
 trainer.train()
 
-# 7. Evaluate the model
+# 8. Evaluate the model
 eval_results = trainer.evaluate(eval_dataset)
 print(f"Evaluation results: {eval_results}")
 print("\nFinal evaluation results:")