-
Notifications
You must be signed in to change notification settings - Fork 7
/
train.py
172 lines (153 loc) · 6.48 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs the Spacy training data which can be used for Spacy training.
#
############################################################################################################
import json
import random
import re
import io
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from spacy.util import minibatch, compounding
from sklearn.metrics import accuracy_score
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
try:
training_data = []
lines=[]
with open(dataturks_JSON_FilePath, 'r',encoding="utf8") as f:
lines = f.read()
lines = json.loads(lines)
for line in lines['object']:
data = line
text = data['content']
entities = []
for annotation in data['annotation']:
#only a single point in text annotation.
point = annotation['points'][0]
labels = annotation['label']
# handle both list of labels or a single label.
if not isinstance(labels, list):
labels = [labels]
for label in labels:
#dataturks indices are both inclusive [start, end] but spacy is not [start, end)
entities.append((point['start'], point['end'] + 1 ,label))
training_data.append((text, {"entities" : entities}))
return training_data
except Exception as e:
logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
return None
def trim_entity_spans(data: list) -> list:
"""Removes leading and trailing white spaces from entity spans.
Args:
data (list): The data to be cleaned in spaCy JSON format.
Returns:
list: The cleaned data.
"""
invalid_span_tokens = re.compile(r'\s')
cleaned_data = []
for text, annotations in data:
entities = annotations['entities']
valid_entities = []
for start, end, label in entities:
valid_start = start
valid_end = end
while valid_start < len(text) and invalid_span_tokens.match(
text[valid_start]):
valid_start += 1
while valid_end > 1 and invalid_span_tokens.match(
text[valid_end - 1]):
valid_end -= 1
valid_entities.append([valid_start, valid_end, label])
cleaned_data.append([text, {'entities': valid_entities}])
return cleaned_data
import spacy
################### Train Spacy NER.###########
def train_spacy():
TRAIN_DATA = convert_dataturks_to_spacy("data/traindata.json")
nlp = spacy.blank('en') # create blank Language class
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'):
ner.add_label(ent[2])
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
optimizer = nlp.begin_training()
for itn in range(10):
print("Statring iteration " + str(itn))
# random.shuffle(TRAIN_DATA)
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
text, annotations = zip(*batch)
# print(text,annotations)
# for text, annotations in TRAIN_DATA:
nlp.update(
text, # batch of texts
annotations, # batch of annotations
drop=0.2, # dropout - make it harder to memorise data
sgd=optimizer, # callable to update weights
losses=losses)
# print(losses)
print(losses)
#test the model and evaluate it
examples = convert_dataturks_to_spacy("data/testdata.json")
tp=0
tr=0
tf=0
ta=0
examples = trim_entity_spans(examples)
c=0
for text,annot in examples:
f=io.open("tested_summaries/resume"+str(c)+".txt","w",encoding="utf-8")
doc_to_test=nlp(text)
d={}
for ent in doc_to_test.ents:
d[ent.label_]=[]
for ent in doc_to_test.ents:
d[ent.label_].append(ent.text)
for i in set(d.keys()):
f.write("\n\n")
f.write(i +":"+"\n")
for j in set(d[i]):
f.write(j.replace('\n','')+"\n")
d={}
for ent in doc_to_test.ents:
d[ent.label_]=[0,0,0,0,0,0]
for ent in doc_to_test.ents:
doc_gold_text= nlp.make_doc(text)
gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
y_true = [ent.label_ if ent.label_ in x else 'Not '+ent.label_ for x in gold.ner]
y_pred = [x.ent_type_ if x.ent_type_ ==ent.label_ else 'Not '+ent.label_ for x in doc_to_test]
if(d[ent.label_][0]==0):
#f.write("For Entity "+ent.label_+"\n")
#f.write(classification_report(y_true, y_pred)+"\n")
(p,r,f,s)= precision_recall_fscore_support(y_true,y_pred,average='weighted')
a=accuracy_score(y_true,y_pred)
d[ent.label_][0]=1
d[ent.label_][1]+=p
d[ent.label_][2]+=r
d[ent.label_][3]+=f
d[ent.label_][4]+=a
d[ent.label_][5]+=1
c+=1
for i in d:
print("\n For Entity "+i+"\n")
print("Accuracy : "+str((d[i][4]/d[i][5])*100)+"%")
print("Precision : "+str(d[i][1]/d[i][5]))
print("Recall : "+str(d[i][2]/d[i][5]))
print("F-score : "+str(d[i][3]/d[i][5]))
nlp.to_disk("model/")
train_spacy()