TypeError: object of type 'NoneType' has no len() #171

Open
SerenaDeAntoni opened this issue May 22, 2024 · 0 comments

SerenaDeAntoni commented May 22, 2024

Hi!
I'm trying to fine-tune the CodeT5p-770m model for text-to-Java code generation. I'm working in Google Colab; I copied the tune_codet5p_seq2seq.py file into a script.py file and modified the script as follows:

import os
import pprint
import argparse
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import DatasetDict
def run_training(args, model, train_data):
    print(f"Starting main loop")
    print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    print(args)
    print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    training_args = TrainingArguments(
        report_to='tensorboard',
        output_dir=args.save_dir,
        overwrite_output_dir=False,

        do_train=True,
        save_strategy='epoch',

        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,

        learning_rate=args.lr,
        weight_decay=0.05,
        warmup_steps=args.lr_warmup_steps,

        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_total_limit=1,

        dataloader_drop_last=True,
        dataloader_num_workers=4,

        local_rank=args.local_rank,
        deepspeed=args.deepspeed,
        fp16=args.fp16,
    )

    print('££££££££££££££££££££££££££££££££')
    print(training_args)
    print('££££££££££££££££££££££££££££££££') 

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )

    trainer.train()

    if args.local_rank in [0, -1]:
        final_checkpoint_dir = os.path.join(args.save_dir, "final_checkpoint")
        model.save_pretrained(final_checkpoint_dir)
        print(f'  ==> Finish training and save to {final_checkpoint_dir}')

def load_tokenize_data(args):
    # Load and tokenize data
    if os.path.exists(args.cache_data):
        train_data = load_from_disk(args.cache_data)
        print(f'  ==> Loaded {len(train_data)} samples')
        return train_data
    else:
        # Example code to load and process code_x_glue_ct_code_to_text python dataset for code summarization task
        datasets = load_from_disk('/content/drive/MyDrive/prova/Dataset')
        tokenizer = AutoTokenizer.from_pretrained(args.load)
       
        def preprocess_function(examples):
            source = [' '.join(ex) for ex in examples["code_tokens"]]
            target = [' '.join(ex) for ex in examples["docstring_tokens"]]

            model_inputs = tokenizer(source, max_length=args.max_source_len, padding="max_length", truncation=True)
            labels = tokenizer(target, max_length=args.max_target_len, padding="max_length", truncation=True)

            model_inputs["labels"] = labels["input_ids"].copy()
            model_inputs["labels"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
            ]
            return model_inputs

        train_data = datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=datasets.column_names,
            num_proc=64,
            load_from_cache_file=False,
        )
        print(f'  ==> Loaded {len(train_data)} samples')
        train_data.save_to_disk(args.cache_data)
        print(f'  ==> Saved to {args.cache_data}')
        return train_data

def main(args={}):
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print(args)
    print('%%%%%%%%%%%%%%%%%%%%%%%%%%')
    argsdict = vars(args)
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^')
    print(pprint.pformat(argsdict))
    print('^^^^^^^^^^^^^^^^^^^^^^^^^^')
    # Save command to file
    with open(os.path.join(args.save_dir, "command.txt"), 'w') as f:
        f.write(pprint.pformat(argsdict))

    # Load and tokenize data using the tokenizer from `args.load`. If the data is already cached, load it from there.
    # You can customize this function to load your own data for any Seq2Seq LM tasks.
    train_data = load_tokenize_data(args)
   
    if args.data_num != -1:
        train_data = train_data.select([i for i in range(args.data_num)])

    # Load model from `args.load`
    model = AutoModelForSeq2SeqLM.from_pretrained(args.load)
    print(f"  ==> Loaded model from {args.load}, model size {model.num_parameters()}")

    print('@@@@@@@@@@@@@@@@@@@@@@@')
    print(args)
    print('@@@@@@@@@@@@@@@@@@@@@@@')
    run_training(args, model, train_data)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="CodeT5+ finetuning on Text-2-Code LM task")
    parser.add_argument('--data-num', default=-1, type=int)
    parser.add_argument('--max-source-len', default=320, type=int)
    parser.add_argument('--max-target-len', default=128, type=int)
    parser.add_argument('--cache-data', default= '/content/drive/MyDrive/prova/Dataset', type=str)
    parser.add_argument('--load', default='Salesforce/codet5p-770m', type=str)

    # Training
    parser.add_argument('--epochs', default=10, type=int)
    parser.add_argument('--lr', default=5e-5, type=float)
    parser.add_argument('--lr-warmup-steps', default=200, type=int)
    parser.add_argument('--batch-size-per-replica', default=8, type=int)
    parser.add_argument('--grad-acc-steps', default=4, type=int)
    parser.add_argument('--local_rank', default=-1, type=int)
    parser.add_argument('--deepspeed', default=None, type=str)
    parser.add_argument('--fp16', default=False, action='store_true')

    # Logging and stuff
    parser.add_argument('--save-dir', default="saved_models/prova1", type=str)
    parser.add_argument('--log-freq', default=10, type=int)
    parser.add_argument('--save-freq', default=500, type=int)

    args = parser.parse_args()
    print('**************************')
    print(args)
    print('**************************')
    os.makedirs(args.save_dir, exist_ok=True)
    print('#################################')
    print(args)
    print('#################################')
    main(args)

I added all the print statements for debugging my code :)
But when I run '!python3 script.py' on Colab, I get:

2024-05-21 12:07:32.240561: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-21 12:07:32.240620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-21 12:07:32.242264: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-21 12:07:33.539100: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
**************************
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
**************************
#################################
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
#################################
%%%%%%%%%%%%%%%%%%%%%%%%%%
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
%%%%%%%%%%%%%%%%%%%%%%%%%%
^^^^^^^^^^^^^^^^^^^^^^^^^^
{'batch_size_per_replica': 8,
 'cache_data': '/content/drive/MyDrive/prova/Dataset',
 'data_num': -1,
 'deepspeed': None,
 'epochs': 10,
 'fp16': False,
 'grad_acc_steps': 4,
 'load': 'Salesforce/codet5p-770m',
 'local_rank': -1,
 'log_freq': 10,
 'lr': 5e-05,
 'lr_warmup_steps': 200,
 'max_source_len': 320,
 'max_target_len': 128,
 'save_dir': 'saved_models/prova1',
 'save_freq': 500}
^^^^^^^^^^^^^^^^^^^^^^^^^^
  ==> Loaded 3 samples
  ==> Loaded model from Salesforce/codet5p-770m, model size 737639424
@@@@@@@@@@@@@@@@@@@@@@@
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
@@@@@@@@@@@@@@@@@@@@@@@
Starting main loop
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Namespace(data_num=-1, max_source_len=320, max_target_len=128, cache_data='/content/drive/MyDrive/prova/Dataset', load='Salesforce/codet5p-770m', epochs=10, lr=5e-05, lr_warmup_steps=200, batch_size_per_replica=8, grad_acc_steps=4, local_rank=-1, deepspeed=None, fp16=False, save_dir='saved_models/prova1', log_freq=10, save_freq=500)
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
££££££££££££££££££££££££££££££££
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=4,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
eval_strategy=no,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=saved_models/prova1,
logging_first_step=True,
logging_nan_inf_filter=True,
logging_steps=10,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=10,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=saved_models/prova1,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['tensorboard'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=saved_models/prova1,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=500,
save_strategy=epoch,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=200,
weight_decay=0.05,
)
££££££££££££££££££££££££££££££££
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:558: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(_create_warning_msg(
  0% 0/10 [00:00<?, ?it/s]/usr/lib/python3.10/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.
  self.pid = os.fork()
Traceback (most recent call last):
  File "/content/drive/MyDrive/prova/script.py", line 168, in <module>
    main(args)
  File "/content/drive/MyDrive/prova/script.py", line 135, in main
    run_training(args, model, train_data)
  File "/content/drive/MyDrive/prova/script.py", line 61, in run_training
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3231, in training_step
    inputs = self._prepare_inputs(inputs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3184, in _prepare_inputs
    if len(inputs) == 0:
TypeError: object of type 'NoneType' has no len()
  0% 0/10 [00:00<?, ?it/s]

Can someone help me?
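
In case it helps narrow this down: the script printed '==> Loaded 3 samples', while I train with per_device_train_batch_size=8 and dataloader_drop_last=True, so I wonder whether the train dataloader ends up empty. Here is a small check I could add right before trainer.train() (a minimal sketch, assuming Trainer.get_train_dataloader() is available in my transformers version):

# Hypothetical debug snippet, not part of the script above: inspect what the
# Trainer would actually feed to the model before calling trainer.train().
dl = trainer.get_train_dataloader()
print('number of train batches:', len(dl))  # 0 would mean drop_last discarded my only (partial) batch
for batch in dl:
    # keys of the first batch should include input_ids / attention_mask / labels
    print('first batch keys:', list(batch.keys()))
    break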
