from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import json

# ----------------------------------------------------------------------------------------
# hugging face model name
model_name = 'FacebookAI/xlm-roberta-base'

# set language
language = 'en-it'

# batch size
batch_size = 32

# epochs
epochs = 8

# learning rate
learning_rate = 5e-5

# use cpu (set to False for gpu)
use_cpu = False
# ----------------------------------------------------------------------------------------

# output folder name
experiment_name = f'{model_name}-fine-tuned-{language}-{epochs}epochs-{batch_size}batch'

# load dataset
dataset = load_dataset(
    'parquet',
    data_files={
        'train': f'data/ner-token-classification/train-{language}.parquet',
        'validation': f'data/ner-token-classification/validation-{language}.parquet',
        'test': f'data/ner-token-classification/test-{language}.parquet'
    }
)
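
# note: each parquet split is assumed to contain a 'text' column and a 'ner_tags' column, where every
# tag is a dict with character offsets into the text and an entity label, e.g. (illustrative values):
# {'text': 'Rome is the capital of Italy', 'ner_tags': [{'start': 0, 'end': 4, 'label': 'LOC'}]}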

# load labels mapping
with open(f'data/ner-token-classification/labels-mapping-tokenization-{language}.json', 'r') as file:
    label2id_tokenization = json.load(file)

with open(f'data/ner-token-classification/labels-mapping-model-{language}.json', 'r') as file:
    label2id = json.load(file)
id2label = {id: label for label, id in label2id.items()}
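# both JSON files are assumed to map label strings to integer ids; label2id_tokenization covers the
# full BIO tag set used to label tokens below (illustrative example: {'O': 0, 'B-LOC': 1, 'I-LOC': 2}),
# while label2id / id2label are attached to the model configuration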

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenize function
def tokenize_text_and_create_bio_token_labels(examples):
    # tokenize the current batch
    tokens_batch = tokenizer(examples['text'], truncation=True)

    # initialize token label ids for the batch
    # this will store the label ids of the tokens of each example in the batch
    tokens_ids_batch = []

    # loop over examples in the batch
    for encodings, ner_tags in zip(tokens_batch.encodings, examples['ner_tags']):
        # by default all tokens are assigned the outside label
        tokens_ids = [label2id_tokenization['O']] * len(encodings)

        # process the ner tags of the current example and update the token labels
        i = 0
        for ner_tag in ner_tags:
            # check whether the token offsets in the original sentence match the current ner tag
            for index, ((offset_start, offset_end), word_id, token) in enumerate(zip(encodings.offsets, encodings.word_ids, encodings.tokens)):
                # if the token doesn't belong to any original word, it is padding or a special token added by the tokenizer
                # the label -100 is ignored by default by pytorch's cross-entropy loss
                if word_id is None:
                    tokens_ids[index] = -100
                # keep the outside label for standalone whitespace tokens ('▁')
                elif token == '▁':
                    continue
                # assign the proper begin label
                elif offset_start == ner_tag['start'] or offset_start < ner_tag['start'] < offset_end:
                    tokens_ids[index] = label2id_tokenization['B-' + ner_tag['label']]
                    i += 1
                # assign the proper inside label
                elif ner_tag['start'] < offset_end <= ner_tag['end']:
                    tokens_ids[index] = label2id_tokenization['I-' + ner_tag['label']]

        # check that all ner tags of the example were processed
        if len(ner_tags) != i:
            raise ValueError('not all ner tags were correctly processed!')

        # append the token labels of the current example to the batch list
        tokens_ids_batch.append(tokens_ids)

    # return labels, input ids and attention mask
    return {'labels': tokens_ids_batch, 'input_ids': tokens_batch['input_ids'], 'attention_mask': tokens_batch['attention_mask']}

# tokenize dataset
tokenized_dataset = dataset.map(tokenize_text_and_create_bio_token_labels, batched=True)
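
# optional sanity check (illustrative, not required for training): print each token of the first
# training example next to its assigned label id; positions labelled -100 are ignored by the loss
first_example = tokenized_dataset['train'][0]
for token, label_id in zip(tokenizer.convert_ids_to_tokens(first_example['input_ids']), first_example['labels']):
    print(token, label_id)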

# data collator for dynamically padding each batch instead of padding the whole dataset to the max length
# this can considerably speed up training when the samples in a batch have short texts
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# training arguments
training_args = TrainingArguments(
    output_dir=f'checkpoints/ner-token-classification/{experiment_name}-checkpoints',
    overwrite_output_dir=True,
    logging_strategy='epoch',
    eval_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_delay=0,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    use_cpu=use_cpu,
    report_to='none'
)

# trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# start training
trainer.train()

# save model
trainer.save_model(f'models/ner-token-classification/{experiment_name}-model')
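
# optional follow-up (a minimal sketch using the splits and paths defined above): evaluate on the
# held-out test split; since no compute_metrics function is passed to the Trainer, this mainly
# reports the evaluation loss
test_metrics = trainer.evaluate(tokenized_dataset['test'])
print(test_metrics)

# optional quick inference check with the saved model (the example sentence is illustrative)
from transformers import pipeline
ner_pipeline = pipeline(
    'token-classification',
    model=f'models/ner-token-classification/{experiment_name}-model',
    tokenizer=tokenizer,
    aggregation_strategy='simple'
)
print(ner_pipeline('Rome is the capital of Italy'))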