pre-trained Caduceus model to the Huggingface hub, gradient #56

github2657529567 commented Dec 12, 2024

We used the ps and ph models from the Huggingface hub, but ran into a problem with the gradients: they are very small. Looking at the code, we found a problem with the initialisation of the model and followed https://github.com/kuleshov-group/caduceus/issues/37, but the gradients are still very small.
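For reference, one quick way to look at the classification head's initial weights (the randomly initialised part that issue #37 discusses) is to inspect them right after loading the model as in the scripts below. This is only a rough sketch: the name fragments "score" and "classifier" are guesses at the head's parameter names, so adjust them to whatever model.named_parameters() actually reports.

import torch

# Rough check of the classification head's initial weights; "score" /
# "classifier" are guessed name fragments, not confirmed Caduceus attribute names.
for name, param in model.named_parameters():
    if "score" in name or "classifier" in name:
        print(name, tuple(param.shape), f"std={param.std().item():.4e}")
        # If the head weights came out (near) zero, re-initialising them is one
        # thing to try; std=0.02 is a common default, not a confirmed Caduceus value:
        # torch.nn.init.normal_(param, mean=0.0, std=0.02)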

dataset

import os
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7'
tokenizer = AutoTokenizer.from_pretrained(
    "/pf9550-bdp-A800/yehuicheng/h100/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", trust_remote_code=True
)
config = AutoConfig.from_pretrained(
    "/pf9550-bdp-A800/yehuicheng/h100/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16",
    trust_remote_code=True,
    num_labels=12,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "/pf9550-bdp-A800/yehuicheng/h100/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16",
    trust_remote_code=True,
    config=config,
    ignore_mismatched_sizes=True,
)

from datasets import Dataset, load_dataset
# Build a tokenization closure for datasets.map; the padding arguments are
# forwarded to the tokenizer so that every example comes out with the same length.
def pack(_tokenizer, max_length, padding="max_length", pad_to_multiple_of=None):
    def padseq(line):
        inputs = _tokenizer(
            line["seq"],
            max_length=max_length,
            truncation=True,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
        )
        return inputs
    return padseq


func = pack(tokenizer, 26000, padding="max_length")
dataset_path="/home/zhengyulong/models/project/datasets/sars"
train_ds = load_dataset("csv",data_files=f"{dataset_path}/substrain_hcov_train_cls12_appendix.csv")
test_ds  = load_dataset("csv",data_files=f"{dataset_path}/substrain_hcov_test_cls12_appendix.csv" )
eval_ds  = load_dataset("csv",data_files=f"{dataset_path}/substrain_hcov_eval_cls12_appendix.csv" )
train_ds=train_ds.map(func,batched=True,num_proc=128)["train"].remove_columns("seq")
test_ds=test_ds.map(func,batched=True,num_proc=128)["train"].remove_columns("seq")
eval_ds=eval_ds.map(func,batched=True,num_proc=128)["train"].remove_columns("seq")
output_path="/pf9550-bdp-A800/yehuicheng/caduceus_env_ps/dataset/"
prefix="substrain_hcov_cls12_appendix"
train_ds.save_to_disk(f"{output_path}/{prefix}/trainset", num_proc=128)
eval_ds.save_to_disk(f"{output_path}/{prefix}/evalset", num_proc=128)
test_ds.save_to_disk(f"{output_path}/{prefix}/testset", num_proc=128)
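As an aside, padding every sequence out to 26,000 tokens makes the saved datasets fairly large. A lighter alternative sketch (assuming the Caduceus tokenizer supports tokenizer.pad, which DataCollatorWithPadding relies on) is to tokenize with truncation only and let the collator pad each batch dynamically in the training script:

from transformers import DataCollatorWithPadding

# Pads each batch to the longest sequence in that batch instead of a fixed
# 26k length; this would replace DefaultDataCollator in the launch script below.
datacollator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)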

my launch.py

import os
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import torch
from sklearn.metrics import precision_recall_fscore_support
from transformers import DefaultDataCollator, Trainer, TrainingArguments

#os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3,4,5,6,7'
tokenizer = AutoTokenizer.from_pretrained(
    "/pf9550-bdp-A800/yehuicheng/h100/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16", trust_remote_code=True
)
config = AutoConfig.from_pretrained(
    "/pf9550-bdp-A800/yehuicheng/h100/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16",
    trust_remote_code=True,
    num_labels=12,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "/pf9550-bdp-A800/yehuicheng/h100/kuleshov-group/caduceus-ps_seqlen-131k_d_model-256_n_layer-16",
    trust_remote_code=True,
    config=config,
    ignore_mismatched_sizes=True,
)

def print_grad(grad):
    print(grad)


# for name, param in model.named_parameters():
#     param.register_hook(lambda grad, name=name: print(f'{name} grad: {grad.norm()}'))

dataset_path="/pf9550-bdp-A800/yehuicheng/caduceus_env_ps/dataset/substrain_hcov_cls12_appendix"
from datasets import load_from_disk
trainset=load_from_disk(f"{dataset_path}/trainset")
evalset=load_from_disk( f"{dataset_path}/evalset")
testset=load_from_disk( f"{dataset_path}/testset")
def save_model(model, output_dir):
    model_save_path = os.path.join(output_dir, "trained_model.pth")
    torch.save(model.state_dict(), model_save_path)
    print(f"model saved to: {model_save_path}")

def evaluate_model(trainer, test_dataset):
    print("begin evaluating test dataset")
    results = trainer.evaluate(test_dataset)
    print(f"{results}")
    return results
def compute_metrics(p):
    logits, labels = p
    pred = np.argmax(logits, axis=-1)
    precision, recall, fscore, support = precision_recall_fscore_support(labels, pred, average="weighted")
    return {"precision": precision, "recall": recall, "fscore": fscore}
datacollator = DefaultDataCollator()
     
training_args=TrainingArguments(
    output_dir="/pf9550-bdp-A800/yehuicheng/Caduceus_ps/loadingsample",
    evaluation_strategy="steps",
    #gradient_checkpointing=True,
    eval_steps=10,
    save_steps=10,
    save_total_limit=10,
    learning_rate=5e-3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.1,
    num_train_epochs=5,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    neftune_noise_alpha=0,
    max_grad_norm=1,
    bf16=True,
    logging_steps=1,
    report_to="wandb",
    optim="adamw_torch",
    save_safetensors=False
)

trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset= trainset,
    eval_dataset= evalset,
    data_collator=datacollator,
    compute_metrics=compute_metrics
)

trainer.train()
save_model(model, training_args.output_dir)

evaluate_model(trainer, testset)
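The commented-out hooks above were meant to inspect the gradient norms. An equivalent one-off check, to see whether the gradients collapse in the backbone or only in the classification head, is a single forward/backward pass before handing the model to the Trainer. This is just a sketch using standard PyTorch calls; "label" is an assumed column name, adjust it to whatever the CSVs actually use.

import torch

# Take a couple of tokenized examples; "label" is an assumed column name.
batch = trainset[:2]
input_ids = torch.tensor(batch["input_ids"])
labels = torch.tensor(batch["label"])

outputs = model(input_ids=input_ids, labels=labels)
outputs.loss.backward()

# Print the per-parameter gradient norms; tiny norms across the whole backbone
# vs. only in the head point to different causes.
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad norm = {param.grad.norm().item():.3e}")
model.zero_grad()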

[two screenshots attached]
