
Commit f693a74

Create 01-ner-token-classification-train.py
1 parent e0849b8 commit f693a74

File tree

1 file changed: +137 -0 lines changed

01-ner-token-classification-train.py

Lines changed: 137 additions & 0 deletions
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import json

# ----------------------------------------------------------------------------------------
# hugging face model name
model_name = 'FacebookAI/xlm-roberta-base'

# set language
language = 'en-it'

# batch size
batch_size = 32

# epochs
epochs = 8

# learning rate
learning_rate = 5e-5

# use cpu (set to False for gpu)
use_cpu = False
# ----------------------------------------------------------------------------------------

# output folder name
experiment_name = f'{model_name}-fine-tuned-{language}-{epochs}epochs-{batch_size}batch'

# load dataset
dataset = load_dataset(
    'parquet',
    data_files={
        'train': f'data/ner-token-classification/train-{language}.parquet',
        'validation': f'data/ner-token-classification/validation-{language}.parquet',
        'test': f'data/ner-token-classification/test-{language}.parquet'
    }
)

# load labels mapping
with open(f'data/ner-token-classification/labels-mapping-tokenization-{language}.json', 'r') as file:
    label2id_tokenization = json.load(file)

with open(f'data/ner-token-classification/labels-mapping-model-{language}.json', 'r') as file:
    label2id = json.load(file)
id2label = {id: label for label, id in label2id.items()}
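# for reference, an illustrative sketch (assumed, not read from the actual files) of the
# structures this script expects; the entity label names are hypothetical:
#   label2id_tokenization ≈ {'O': 0, 'B-LOC': 1, 'I-LOC': 2, ...}    # BIO labels used when labelling tokens
#   label2id              ≈ {'O': 0, 'B-LOC': 1, 'I-LOC': 2, ...}    # labels exposed by the trained model
#   dataset row           ≈ {'text': 'Rome is in Italy', 'ner_tags': [{'start': 0, 'end': 4, 'label': 'LOC'}]}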
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenize function
def tokenize_text_and_create_bio_token_labels(examples):
    # tokenize current batch
    tokens_batch = tokenizer(examples['text'], truncation=True)

    # initialize token label ids for the batch
    # this will store the token label ids for each example in the batch
    tokens_ids_batch = []

    # loop over examples in the batch
    for encodings, ner_tags in zip(tokens_batch.encodings, examples['ner_tags']):
        # by default all tokens are assigned the outside label
        tokens_ids = [label2id_tokenization['O']] * len(encodings)

        # process ner tags in the current example and update token labels
        i = 0
        for ner_tag in ner_tags:
            # check whether token offsets in the original sentence match the current ner tag
            for index, ((offset_start, offset_end), word_id, token) in enumerate(zip(encodings.offsets, encodings.word_ids, encodings.tokens)):
                # if the token doesn't belong to any original word, it could be padding or a special token added by the tokenizer
                # the index -100 is ignored by default in pytorch cross-entropy
                if word_id is None:
                    tokens_ids[index] = -100
                # keep the outside label for unicode whitespace
                elif token == '▁':
                    continue
                # assign a proper begin label
                elif offset_start == ner_tag['start'] or offset_start < ner_tag['start'] < offset_end:
                    tokens_ids[index] = label2id_tokenization['B-' + ner_tag['label']]
                    i += 1
                # assign a proper inside label
                elif ner_tag['start'] < offset_end <= ner_tag['end']:
                    tokens_ids[index] = label2id_tokenization['I-' + ner_tag['label']]

        # check that all ner tags were processed in the example
        if len(ner_tags) != i:
            raise ValueError('not all ner tags were correctly processed!')

        # append token labels for the current example to the batch list
        tokens_ids_batch.append(tokens_ids)

    # return labels, input ids and attention mask
    return {'labels': tokens_ids_batch, 'input_ids': tokens_batch['input_ids'], 'attention_mask': tokens_batch['attention_mask']}

# tokenize dataset
tokenized_dataset = dataset.map(tokenize_text_and_create_bio_token_labels, batched=True)
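# optional sanity check (an illustrative sketch, assuming the train split is non-empty):
# print each token of the first training example next to its BIO label,
# mapping label ids back through an inverted label2id_tokenization
id2label_tokenization = {label_id: label for label, label_id in label2id_tokenization.items()}
example = tokenized_dataset['train'][0]
for token, label_id in zip(tokenizer.convert_ids_to_tokens(example['input_ids']), example['labels']):
    print(token, label_id if label_id == -100 else id2label_tokenization[label_id])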
# data collator that dynamically pads each batch instead of padding the whole dataset to the max length
# this can speed up training considerably when the samples in a batch have short texts
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# training arguments
training_args = TrainingArguments(
    output_dir=f'checkpoints/ner-token-classification/{experiment_name}-checkpoints',
    overwrite_output_dir=True,
    logging_strategy='epoch',
    eval_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_delay=0,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    use_cpu=use_cpu,
    report_to='none'
)

# trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# start training
trainer.train()

# save model
trainer.save_model(f'models/ner-token-classification/{experiment_name}-model')
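# optional: a minimal inference sketch with the saved model (illustrative, assumes the training
# above completed; the example sentence is a placeholder).
# aggregation_strategy='simple' merges subword predictions into whole-entity spans
from transformers import pipeline

ner_pipeline = pipeline(
    'token-classification',
    model=f'models/ner-token-classification/{experiment_name}-model',
    tokenizer=tokenizer,
    aggregation_strategy='simple'
)
print(ner_pipeline('Replace this with a sentence to tag.'))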
