【开源实习】align模型微调 IAUOS5 (#1997)

huadaox · web-flow · commit b228c1da73fd · 2025-03-22T20:46:21.000+08:00
diff --git a/llm/finetune/Align/eval_model.py b/llm/finetune/Align/eval_model.py
@@ -0,0 +1,160 @@
+import collections
+import collections.abc
+
+collections.Iterable = collections.abc.Iterable
+
+import mindspore as ms
+from mindnlp.transformers import AlignModel, AlignProcessor
+from mindspore import Tensor, nn, ops, Parameter
+from pycocotools.coco import COCO
+import os
+from tqdm import tqdm
+import pickle
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+import gc
+
+HYPERPARAMS = {
+    "model_name": "E:/Code/align_ft_torch/cache/model/kakaobrain/align-base",
+    "batch_size": 4,
+    "val_samples": 50,
+    "max_length": 128,
+    "num_workers": 8,
+    "data_dir": "MSCOCO",
+    "data_type": "val2017",
+    "val_cache_file": "mscoco_preprocessed_val_50.pkl",
+    "save_dir": "cache/model",
+    "model_save_path": "cache/model/finetuned_align_model_epoch_{epoch}.ckpt",
+    "processor_save_path": "cache/model/finetuned_align_processor"
+}
+
+ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend")
+ms.context.reset_auto_parallel_context()
+
+
+def setup_coco():
+    dataDir = HYPERPARAMS["data_dir"]
+    dataType = HYPERPARAMS["data_type"]
+    os.makedirs(dataDir, exist_ok=True)
+    os.makedirs(f"{dataDir}/annotations", exist_ok=True)
+    os.makedirs(f"{dataDir}/{dataType}", exist_ok=True)
+    ann_file = f"{dataDir}/annotations/captions_{dataType}.json"
+    if not os.path.exists(ann_file):
+        ann_zip = f"{dataDir}/annotations_trainval2017.zip"
+        if not os.path.exists(ann_zip):
+            raise FileNotFoundError(f"{ann_zip} not found. Please download it manually.")
+        print("Extracting annotations...")
+        os.system(f"unzip -o {ann_zip} -d {dataDir}")
+    return dataDir, dataType
+
+
+dataDir, dataType = setup_coco()
+annFile = f'{dataDir}/annotations/captions_{dataType}.json'
+coco = COCO(annFile)
+
+
+def get_image_and_caption(coco, img_id, cache_dir=f"{HYPERPARAMS['data_dir']}/{HYPERPARAMS['data_type']}"):
+    ann_ids = coco.getAnnIds(imgIds=img_id)
+    anns = coco.loadAnns(ann_ids)
+    caption = anns[0]['caption']
+    img_info = coco.loadImgs(img_id)[0]
+    img_path = f"{cache_dir}/{img_info['file_name']}"
+    image = Image.open(img_path)
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    return image, caption
+
+
+def process_sample(img_id, coco):
+    image, caption = get_image_and_caption(coco, img_id)
+    processor = AlignProcessor.from_pretrained(HYPERPARAMS["processor_save_path"])
+    inputs = processor(
+        text=caption,
+        images=image,
+        return_tensors="ms",
+        padding="max_length",
+        max_length=HYPERPARAMS["max_length"]
+    )
+    return (inputs["input_ids"][0], inputs["attention_mask"][0], inputs["pixel_values"][0])
+
+
+def preprocess_and_save(coco, num_samples, cache_file):
+    if os.path.exists(cache_file):
+        print(f"Loading preprocessed data from {cache_file}")
+        with open(cache_file, "rb") as f:
+            dataset = pickle.load(f)
+            print(f"Loaded dataset size: {len(dataset)} samples")
+            return dataset
+    img_ids = coco.getImgIds()[:num_samples]
+    dataset = []
+    with ThreadPoolExecutor(max_workers=HYPERPARAMS["num_workers"]) as executor:
+        dataset = list(tqdm(executor.map(lambda x: process_sample(x, coco), img_ids),
+                            total=num_samples, desc=f"Preprocessing dataset ({num_samples} samples)"))
+    with open(cache_file, "wb") as f:
+        pickle.dump(dataset, f)
+    return dataset
+
+
+def create_val_dataloader(coco, batch_size=HYPERPARAMS["batch_size"]):
+    val_dataset = preprocess_and_save(coco, HYPERPARAMS["val_samples"], HYPERPARAMS["val_cache_file"])
+    val_dataloader = ms.dataset.GeneratorDataset(
+        val_dataset,
+        column_names=["input_ids", "attention_mask", "pixel_values"]
+    ).batch(batch_size)
+    return val_dataloader
+
+
+class TrainingNet(nn.Cell):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.global_pool = nn.AdaptiveAvgPool2d(1)
+        self.text_projection = nn.Dense(768, 640)
+        self.logit_scale = Parameter(Tensor(np.log(1 / 0.07), dtype=ms.float32), requires_grad=True)
+        self.image_embeds = None
+        self.text_embeds = None
+
+    def construct(self, input_ids, attention_mask, pixel_values):
+        embedding_output = self.model.vision_model.embeddings(pixel_values)
+        encoder_outputs = self.model.vision_model.encoder(embedding_output)
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = self.global_pool(last_hidden_state)
+        self.image_embeds = pooled_output.reshape(pooled_output.shape[:2])
+        text_outputs = self.model.text_model(input_ids=input_ids, attention_mask=attention_mask)
+        text_embeds = text_outputs[0][:, 0, :]
+        self.text_embeds = self.text_projection(text_embeds)
+        logits = ops.matmul(self.image_embeds, self.text_embeds.T) * ops.exp(self.logit_scale)
+        labels = ops.arange(len(logits), dtype=ms.int32)
+        loss_i2t = nn.CrossEntropyLoss()(logits, labels)
+        loss_t2i = nn.CrossEntropyLoss()(logits.T, labels)
+        return (loss_i2t + loss_t2i) / 2
+
+
+def evaluate_model(coco, epoch_to_eval):
+    processor = AlignProcessor.from_pretrained(HYPERPARAMS["processor_save_path"])
+    model = AlignModel.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True)
+    net = TrainingNet(model)  # 使用 TrainingNet 包装 AlignModel
+    param_dict = ms.load_checkpoint(HYPERPARAMS["model_save_path"].format(epoch=epoch_to_eval))
+    ms.load_param_into_net(net, param_dict)  # 加载到 TrainingNet
+    net.set_train(False)
+
+    val_dataloader = create_val_dataloader(coco)
+    print(f"Val dataloader created with batch_size={HYPERPARAMS['batch_size']}, samples={HYPERPARAMS['val_samples']}")
+
+    total_val_loss = 0
+    val_steps = 0
+    for batch in tqdm(val_dataloader.create_dict_iterator(), desc=f"Evaluating Epoch {epoch_to_eval}"):
+        loss = net(batch["input_ids"], batch["attention_mask"], batch["pixel_values"])
+        total_val_loss += loss.asnumpy()
+        val_steps += 1
+    avg_val_loss = total_val_loss / val_steps
+    print(f"Epoch {epoch_to_eval}, Eval Loss: {avg_val_loss:.4f}")
+
+    gc.collect()
+    return avg_val_loss
+
+
+if __name__ == "__main__":
+    print("Starting model evaluation...")
+    for epoch in range(1, 11):
+        evaluate_model(coco, epoch)
diff --git a/llm/finetune/Align/finetune.py b/llm/finetune/Align/finetune.py
@@ -0,0 +1,203 @@
+import collections
+import collections.abc
+
+collections.Iterable = collections.abc.Iterable
+
+import mindspore as ms
+from mindnlp.transformers import AlignModel, AlignProcessor
+from mindspore import Tensor, nn, ops, Parameter
+from PIL import Image
+from pycocotools.coco import COCO
+import os
+from tqdm import tqdm
+import pickle
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+
+HYPERPARAMS = {
+    "model_name": "E:/Code/align_ft_torch/cache/model/kakaobrain/align-base",
+    "epochs": 10,
+    "batch_size": 4,
+    "learning_rate": 1e-4,
+    "train_samples": 200,
+    "max_length": 128,
+    "num_workers": 8,
+    "data_dir": "MSCOCO",
+    "data_type": "val2017",
+    "train_cache_file": "mscoco_preprocessed_train_200.pkl",
+    "save_dir": "cache/model",
+    "model_save_path": "cache/model/finetuned_align_model_epoch_{epoch}.ckpt",
+    "processor_save_path": "cache/model/finetuned_align_processor"
+}
+
+ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend")
+ms.context.reset_auto_parallel_context()
+
+processor = AlignProcessor.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True)
+model = AlignModel.from_pretrained(HYPERPARAMS["model_name"], local_files_only=True)
+model.set_train(True)
+
+print("Model config:", model.config)
+params = model.trainable_params()
+print("Number of trainable params:", len(params))
+
+
+def setup_coco():
+    dataDir = HYPERPARAMS["data_dir"]
+    dataType = HYPERPARAMS["data_type"]
+    os.makedirs(dataDir, exist_ok=True)
+    os.makedirs(f"{dataDir}/annotations", exist_ok=True)
+    os.makedirs(f"{dataDir}/{dataType}", exist_ok=True)
+    ann_file = f"{dataDir}/annotations/captions_{dataType}.json"
+    if not os.path.exists(ann_file):
+        ann_zip = f"{dataDir}/annotations_trainval2017.zip"
+        if not os.path.exists(ann_zip):
+            raise FileNotFoundError(f"{ann_zip} not found. Please download it manually.")
+        print("Extracting annotations...")
+        os.system(f"unzip -o {ann_zip} -d {dataDir}")
+    return dataDir, dataType
+
+
+dataDir, dataType = setup_coco()
+annFile = f'{dataDir}/annotations/captions_{dataType}.json'
+coco = COCO(annFile)
+
+
+def get_image_and_caption(coco, img_id, cache_dir=f"{HYPERPARAMS['data_dir']}/{HYPERPARAMS['data_type']}"):
+    ann_ids = coco.getAnnIds(imgIds=img_id)
+    anns = coco.loadAnns(ann_ids)
+    caption = anns[0]['caption']
+    img_info = coco.loadImgs(img_id)[0]
+    img_path = f"{cache_dir}/{img_info['file_name']}"
+    image = Image.open(img_path)
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    return image, caption
+
+
+def process_sample(img_id, coco):
+    image, caption = get_image_and_caption(coco, img_id)
+    inputs = processor(
+        text=caption,
+        images=image,
+        return_tensors="ms",
+        padding="max_length",
+        max_length=HYPERPARAMS["max_length"]
+    )
+    return (inputs["input_ids"][0], inputs["attention_mask"][0], inputs["pixel_values"][0])
+
+
+def preprocess_and_save(coco, num_samples, cache_file):
+    if os.path.exists(cache_file):
+        print(f"Loading preprocessed data from {cache_file}")
+        with open(cache_file, "rb") as f:
+            dataset = pickle.load(f)
+            print(f"Loaded dataset size: {len(dataset)} samples")
+            return dataset
+    img_ids = coco.getImgIds()[:num_samples]
+    dataset = []
+    with ThreadPoolExecutor(max_workers=HYPERPARAMS["num_workers"]) as executor:
+        dataset = list(tqdm(executor.map(lambda x: process_sample(x, coco), img_ids),
+                            total=num_samples, desc=f"Preprocessing dataset ({num_samples} samples)"))
+    with open(cache_file, "wb") as f:
+        pickle.dump(dataset, f)
+    return dataset
+
+
+def create_train_dataloader(coco, batch_size=HYPERPARAMS["batch_size"]):
+    train_dataset = preprocess_and_save(coco, HYPERPARAMS["train_samples"], HYPERPARAMS["train_cache_file"])
+    train_dataloader = ms.dataset.GeneratorDataset(
+        train_dataset,
+        column_names=["input_ids", "attention_mask", "pixel_values"]
+    ).batch(batch_size)
+    return train_dataloader
+
+
+class TrainingNet(nn.Cell):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.global_pool = nn.AdaptiveAvgPool2d(1)
+        self.text_projection = nn.Dense(768, 640)
+        self.logit_scale = Parameter(Tensor(np.log(1 / 0.07), dtype=ms.float32), requires_grad=True)
+        self.image_embeds = None
+        self.text_embeds = None
+
+    def construct(self, input_ids, attention_mask, pixel_values):
+        embedding_output = self.model.vision_model.embeddings(pixel_values)
+        encoder_outputs = self.model.vision_model.encoder(embedding_output)
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = self.global_pool(last_hidden_state)
+        self.image_embeds = pooled_output.reshape(pooled_output.shape[:2])
+        text_outputs = self.model.text_model(input_ids=input_ids, attention_mask=attention_mask)
+        text_embeds = text_outputs[0][:, 0, :]
+        self.text_embeds = self.text_projection(text_embeds)
+        logits = ops.matmul(self.image_embeds, self.text_embeds.T) * ops.exp(self.logit_scale)
+        labels = ops.arange(len(logits), dtype=ms.int32)
+        loss_i2t = nn.CrossEntropyLoss()(logits, labels)
+        loss_t2i = nn.CrossEntropyLoss()(logits.T, labels)
+        return (loss_i2t + loss_t2i) / 2
+
+
+def convert_to_parameter(params):
+    converted = []
+    for i, param in enumerate(params):
+        if not isinstance(param, Parameter):
+            name = getattr(param, 'name', f"param_{i}") if hasattr(param, 'name') else f"param_{i}"
+            converted.append(Parameter(param.data, name=name, requires_grad=True))
+        else:
+            converted.append(param)
+    return converted
+
+
+def finetune_model(coco, model, processor,
+                   epochs=HYPERPARAMS["epochs"],
+                   batch_size=HYPERPARAMS["batch_size"],
+                   learning_rate=HYPERPARAMS["learning_rate"]):
+    train_dataloader = create_train_dataloader(coco, batch_size)
+    print(f"Train dataloader created with batch_size={batch_size}, samples={HYPERPARAMS['train_samples']}")
+
+    params = model.trainable_params()
+    if not params:
+        print("No trainable params found, enabling all parameters.")
+        for param in model.parameters_and_names():
+            param[1].requires_grad = True
+        params = model.trainable_params()
+
+    params = convert_to_parameter(params)
+    print(f"Optimizer initialized with {len(params)} parameters")
+    net = TrainingNet(model)
+    optimizer = nn.Adam(params + [net.text_projection.weight, net.text_projection.bias, net.logit_scale],
+                        learning_rate=learning_rate)
+    train_net = nn.TrainOneStepCell(net, optimizer)
+
+    for epoch in range(epochs):
+        iterator = train_dataloader.create_dict_iterator()
+        total_train_loss = 0
+        steps = 0
+        for batch in tqdm(iterator, desc=f"Epoch {epoch + 1}/{epochs} (Train)"):
+            loss = train_net(batch["input_ids"], batch["attention_mask"], batch["pixel_values"])
+            total_train_loss += loss.asnumpy()
+            steps += 1
+            if steps == 1:
+                print(f"Epoch {epoch + 1}, Step 1 - Train Loss: {loss.asnumpy():.4f}")
+                logits = ops.matmul(net.image_embeds, net.text_embeds.T) * ops.exp(net.logit_scale)
+                print(f"Logits sample: {logits[:2, :2]}")
+        avg_train_loss = total_train_loss / steps
+        print(f"Epoch {epoch + 1}/{epochs}, Average Train Loss: {avg_train_loss:.4f}")
+
+        param_after = net.text_projection.weight.asnumpy()
+        if epoch == 0:
+            param_before = param_after.copy()
+        print("Params updated:", not np.array_equal(param_before, param_after))
+
+        save_dir = HYPERPARAMS["save_dir"]
+        os.makedirs(save_dir, exist_ok=True)
+        ms.save_checkpoint(net, HYPERPARAMS["model_save_path"].format(epoch=epoch + 1))
+
+    processor.save_pretrained(HYPERPARAMS["processor_save_path"])
+    return model
+
+
+print("Starting model finetuning...")
+finetuned_model = finetune_model(coco, model, processor)