# baseline.py
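"""Baseline for WNUT-17 NER: fine-tune distilbert-base-uncased with a token
classification head and score it with seqeval span-level metrics."""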

import numpy as np
from datasets import load_dataset, load_metric  # note: load_metric moved to the `evaluate` library in newer datasets releases
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

wnut = load_dataset("wnut_17")
label_list = wnut["train"].features["ner_tags"].feature.names
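# label_list holds the 13 WNUT-17 tags: "O" plus B-/I- pairs for corporation,
# creative-work, group, location, person, and product.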
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
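
# Example of the alignment (hypothetical tokenization): if "Disneyland" is
# tagged B-location and WordPiece splits it into ["disney", "##land"], the
# aligned labels are [B-location, -100], so the loss only sees the first subword.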


tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)  # dynamically pads each batch to its longest sequence
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_list)
)  # 13 classes for WNUT-17; -100 is the loss ignore index, not an extra label

# accuracy = load_metric("accuracy")
# metric = load_metric("f1")
metric = load_metric("seqeval")
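# seqeval scores at the entity-span level: a prediction counts as correct only
# if both the span boundaries and the entity type match the reference.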


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # # flatten predictions and labels
    # flat_predictions = np.ravel(predictions)
    # flat_labels = np.ravel(labels)
    # # save flat predictions and flat labels to csv
    # np.savetxt("flat_predictions.csv", flat_predictions, delimiter=",")
    # np.savetxt("flat_labels.csv", flat_labels, delimiter=",")
    # # only keep indexes that are not -100
    # filtered_predictions = flat_predictions[flat_labels != -100]
    # filtered_labels = flat_labels[flat_labels != -100]
    # # save filtered predictions and filtered labels to csv
    # np.savetxt("filtered_predictions.csv", filtered_predictions, delimiter=",")
    # np.savetxt("filtered_labels.csv", filtered_labels, delimiter=",")
    # f1 = metric.compute(predictions=filtered_predictions, references=filtered_labels, average='macro')
    # accuracy_score = accuracy.compute(predictions=filtered_predictions, references=filtered_labels)
    # Remove the ignored index (special tokens) before converting ids to tag names.
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = metric.compute(
        predictions=true_predictions, references=true_labels, zero_division=1
    )  # zero_division=1: classes that are never predicted score 1 instead of triggering a warning
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
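
# trainer.evaluate() reruns compute_metrics on eval_dataset and reports the
# span-level precision/recall/F1. Note it evaluates on the test split here;
# wnut_17 also ships a "validation" split.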

# # SPAN F1 EVALUATION (offline, using the CSVs dumped by the commented block in compute_metrics)
# filtered_labels = np.loadtxt("filtered_labels.csv", delimiter=",")
# filtered_predictions = np.loadtxt("filtered_predictions.csv", delimiter=",")
# # map ids to tag names using label_list
# filtered_labels = np.vectorize(label_list.__getitem__)(filtered_labels.astype(int))
# filtered_predictions = np.vectorize(label_list.__getitem__)(filtered_predictions.astype(int))
# # save mapped predictions and labels to csv
# np.savetxt("filtered_predictions_mapped.csv", filtered_predictions, delimiter=",", fmt="%s")
# np.savetxt("filtered_labels_mapped.csv", filtered_labels, delimiter=",", fmt="%s")