Hi!
I'm trying to fine-tune the CodeT5p-770m model on a text-to-Java code task. I'm working in Google Colab: I copied tune_codet5p_seq2seq.py into a script.py file and modified it as follows:
import os
import pprint
import argparse
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import DatasetDict
def run_training(args, model, train_data):
print(f"Starting main loop")
print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
print(args)
print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
training_args = TrainingArguments(
report_to='tensorboard',
output_dir=args.save_dir,
overwrite_output_dir=False,
do_train=True,
save_strategy='epoch',
num_train_epochs=args.epochs,
per_device_train_batch_size=args.batch_size_per_replica,
gradient_accumulation_steps=args.grad_acc_steps,
learning_rate=args.lr,
weight_decay=0.05,
warmup_steps=args.lr_warmup_steps,
logging_dir=args.save_dir,
logging_first_step=True,
logging_steps=args.log_freq,
save_total_limit=1,
dataloader_drop_last=True,
dataloader_num_workers=4,
local_rank=args.local_rank,
deepspeed=args.deepspeed,
fp16=args.fp16,
)
print('££££££££££££££££££££££££££££££££')
print(training_args)
print('££££££££££££££££££££££££££££££££')
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_data,
)
trainer.train()
if args.local_rank in [0, -1]:
final_checkpoint_dir = os.path.join(args.save_dir, "final_checkpoint")
model.save_pretrained(final_checkpoint_dir)
print(f' ==> Finish training and save to {final_checkpoint_dir}')
def load_tokenize_data(args):
    # Load and tokenize data
    if os.path.exists(args.cache_data):
        train_data = load_from_disk(args.cache_data)
        print(f' ==> Loaded {len(train_data)} samples')
        return train_data
    else:
        # Example code to load and process code_x_glue_ct_code_to_text python dataset for code summarization task
        datasets = load_from_disk('/content/drive/MyDrive/prova/Dataset')
        tokenizer = AutoTokenizer.from_pretrained(args.load)
        def preprocess_function(examples):
            source = [''.join(ex) for ex in examples["code_tokens"]]
            target = [''.join(ex) for ex in examples["docstring_tokens"]]
            model_inputs = tokenizer(source, max_length=args.max_source_len, padding="max_length", truncation=True)
            labels = tokenizer(target, max_length=args.max_target_len, padding="max_length", truncation=True)
            model_inputs["labels"] = labels["input_ids"].copy()
            model_inputs["labels"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
            ]
            return model_inputs
        train_data = datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=datasets.column_names,
            num_proc=64,
            load_from_cache_file=False,
        )
        print(f' ==> Loaded {len(train_data)} samples')
        train_data.save_to_disk(args.cache_data)
        print(f' ==> Saved to {args.cache_data}')
        return train_data
def main(args={}):
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print(args)
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%')
    argsdict = vars(args)
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^')
    print(pprint.pformat(argsdict))
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^')
    # Save command to file
    with open(os.path.join(args.save_dir, "command.txt"), 'w') as f:
        f.write(pprint.pformat(argsdict))
    # Load and tokenize data using the tokenizer from `args.load`. If the data is already cached, load it from there.
    # You can customize this function to load your own data for any Seq2Seq LM tasks.
    train_data = load_tokenize_data(args)
    if args.data_num != -1:
        train_data = train_data.select([i for i in range(args.data_num)])
    # Load model from `args.load`
    model = AutoModelForSeq2SeqLM.from_pretrained(args.load)
    print(f" ==> Loaded model from {args.load}, model size {model.num_parameters()}")
    print('@@@@@@@@@@@@@@@@@@@@@@@')
    print(args)
    print('@@@@@@@@@@@@@@@@@@@@@@@')
    run_training(args, model, train_data)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="CodeT5+ finetuning on Text-2-Code LM task")
    parser.add_argument('--data-num', default=-1, type=int)
    parser.add_argument('--max-source-len', default=320, type=int)
    parser.add_argument('--max-target-len', default=128, type=int)
    parser.add_argument('--cache-data', default='/content/drive/MyDrive/prova/Dataset', type=str)
    parser.add_argument('--load', default='Salesforce/codet5p-770m', type=str)
    # Training
    parser.add_argument('--epochs', default=10, type=int)
    parser.add_argument('--lr', default=5e-5, type=float)
    parser.add_argument('--lr-warmup-steps', default=200, type=int)
    parser.add_argument('--batch-size-per-replica', default=8, type=int)
    parser.add_argument('--grad-acc-steps', default=4, type=int)
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--deepspeed', default=None, type=str)
    parser.add_argument('--fp16', default=False, action='store_true')
    # Logging and stuff
    parser.add_argument('--save-dir', default="saved_models/prova1", type=str)
    parser.add_argument('--log-freq', default=10, type=int)
    parser.add_argument('--save-freq', default=500, type=int)
    args = parser.parse_args()
    print('**************************')
    print(args)
    print('**************************')
    os.makedirs(args.save_dir, exist_ok=True)
    print('#################################')
    print(args)
    print('#################################')
    main(args)
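For context, the folder /content/drive/MyDrive/prova/Dataset was written beforehand with the datasets library's save_to_disk, and it contains the code_tokens and docstring_tokens columns that preprocess_function reads. Roughly like this minimal sketch (a made-up toy sample for illustration, not my real preprocessing):

from datasets import Dataset

# Toy example of building a dataset with the columns the script expects and
# saving it where load_from_disk() will find it. The single sample is made up.
examples = {
    "docstring_tokens": [["Return", "the", "sum", "of", "two", "integers", "."]],
    "code_tokens": [["public", "int", "add", "(", "int", "a", ",", "int", "b", ")",
                     "{", "return", "a", "+", "b", ";", "}"]],
}
toy_dataset = Dataset.from_dict(examples)
toy_dataset.save_to_disk("/content/drive/MyDrive/prova/Dataset")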
I added all those prints for debugging my code :)
But when I run '!python3 script.py' on Colab, I get:
2024-05-21 12:07:32.240561: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 12:07:32.240620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 12:07:32.242264: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-21 12:07:33.539100: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
**************************
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
**************************
#################################
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
#################################
%%%%%%%%%%%%%%%%%%%%%%%%%%
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
%%%%%%%%%%%%%%%%%%%%%%%%%%
^^^^^^^^^^^^^^^^^^^^^^^^^^
{'batch_size_per_replica': 8,
'cache_data': '/content/drive/MyDrive/prova/Dataset',
'data_num': -1,
'deepspeed': None,
'epochs': 10,
'fp16': False,
'grad_acc_steps': 4,
'load': 'Salesforce/codet5p-770m',
'local_rank': -1,
'log_freq': 10,
'lr': 5e-05,
'lr_warmup_steps': 200,
'max_source_len': 320,
'max_target_len': 128,
'save_dir': 'saved_models/prova1',
'save_freq': 500}
^^^^^^^^^^^^^^^^^^^^^^^^^^
==> Loaded 3 samples
==> Loaded model from Salesforce/codet5p-770m, model size 737639424
@@@@@@@@@@@@@@@@@@@@@@@
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
@@@@@@@@@@@@@@@@@@@@@@@
Starting main loop
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
££££££££££££££££££££££££££££££££
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=4,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=no,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=saved_models/prova1,
logging_first_step=True,
logging_nan_inf_filter=True,
logging_steps=10,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=10,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=saved_models/prova1,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['tensorboard'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=saved_models/prova1,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=500,
save_strategy=epoch,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=200,
weight_decay=0.05,
)
££££££££££££££££££££££££££££££££
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:558: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
warnings.warn(_create_warning_msg(
0% 0/10 [00:00<?, ?it/s]/usr/lib/python3.10/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.
self.pid = os.fork()
Traceback (most recent call last):
File "/content/drive/MyDrive/prova/script.py", line 168, in<module>
main(args)
File "/content/drive/MyDrive/prova/script.py", line 135, in main
run_training(args, model, train_data)
File "/content/drive/MyDrive/prova/script.py", line 61, in run_training
trainer.train()
File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1885, in train
return inner_training_loop(
File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2216, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3231, in training_step
inputs = self._prepare_inputs(inputs)
File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3184, in _prepare_inputs
if len(inputs) == 0:
TypeError: object of type 'NoneType' has no len()
0% 0/10 [00:00<?, ?it/s]
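One thing I'm unsure about: the ' ==> Loaded 3 samples' line above is just len(train_data), and load_from_disk can return either a single Dataset or a DatasetDict of splits (for a DatasetDict, len() counts splits rather than rows). A quick sanity check along these lines would show what actually ends up as train_dataset (just a sketch, using the same Drive path as in the script):

from datasets import load_from_disk, DatasetDict

# Inspect what load_from_disk() actually returns for the cached data.
data = load_from_disk('/content/drive/MyDrive/prova/Dataset')
print(type(data))
if isinstance(data, DatasetDict):
    print(data)  # prints the splits and the number of rows in each
else:
    print(len(data), data.column_names)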
Can someone help me?