-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconll2002_metrics.py
421 lines (344 loc) · 15 KB
/
conll2002_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#!/usr/bin/env python
# Python version of the evaluation script from the CoNLL-2000 shared task.
# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported
import sys
import re
import os
from itertools import chain
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from collections import defaultdict, namedtuple
# Sentinel for the delimiter option: when the delimiter equals this value,
# input lines are split on any run of whitespace instead of a fixed character.
ANY_SPACE = "<SPACE>"
class FormatError(Exception):
    """Raised when an input line does not have the expected number of fields."""
# Result record: true-positive, false-positive and false-negative counts
# followed by the derived precision, recall and F-score.
Metrics = namedtuple(
    'Metrics',
    ['tp', 'fp', 'fn', 'prec', 'rec', 'fscore'],
)
class EvalCounts(object):
    """Running tallies gathered while comparing gold and predicted tags."""

    def __init__(self):
        # Corpus-wide tallies.
        self.correct_chunk = 0   # predicted chunks that exactly match a gold chunk
        self.correct_tags = 0    # tokens whose predicted tag and type match gold
        self.found_correct = 0   # chunks present in the gold annotation
        self.found_guessed = 0   # chunks present in the prediction
        self.token_counter = 0   # tokens seen (sentence-boundary lines excluded)

        # The same chunk tallies broken down by chunk type; a type that has
        # not been seen yet reads as 0.
        self.t_correct_chunk = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_found_guessed = defaultdict(int)
def parse_args(argv):
    """Parse command-line style options for the evaluation.

    Args:
        argv: list of argument strings (without the program name).

    Returns:
        The ``argparse`` namespace with ``boundary``, ``delimiter``, ``otag``
        and ``file`` attributes.
    """
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='evaluate tagging results using CoNLL criteria',
    )
    parser.add_argument('-b', '--boundary', default='-X-', metavar='STR',
                        help='sentence boundary')
    parser.add_argument('-d', '--delimiter', default=ANY_SPACE, metavar='CHAR',
                        help='character delimiting items in input')
    parser.add_argument('-o', '--otag', default='O', metavar='CHAR',
                        help='alternative outside tag')
    # Optional input file; None means the caller decides where to read from.
    parser.add_argument('file', nargs='?', default=None)

    options = parser.parse_args(argv)
    return options
def parse_tag(t):
    """Split a chunk tag such as ``B-PER`` into ``('B', 'PER')``.

    The split happens at the first hyphen. A tag without a hyphen is
    returned unchanged together with an empty type.
    """
    match = re.match(r'^([^-]*)-(.*)$', t)
    if match is None:
        return (t, '')
    return match.groups()
# def evaluate(iterable, options=None):
# if options is None:
# options = parse_args([]) # use defaults
# counts = EvalCounts()
# num_features = None # number of features per line
# in_correct = False # currently processed chunks is correct until now
# last_correct = 'O' # previous chunk tag in corpus
# last_correct_type = '' # type of previously identified chunk tag
# last_guessed = 'O' # previously identified chunk tag
# last_guessed_type = '' # type of previous chunk tag in corpus
# for line in iterable:
# line = line.rstrip('\r\n')
# if options.delimiter == ANY_SPACE:
# features = line.split()
# else:
# features = line.split(options.delimiter)
# if num_features is None:
# num_features = len(features)
# elif num_features != len(features) and len(features) != 0:
# raise FormatError('unexpected number of features: %d (%d)' %
# (len(features), num_features))
# if len(features) == 0 or features[0] == options.boundary:
# features = [options.boundary, 'O', 'O']
# if len(features) < 3:
# raise FormatError('unexpected number of features in line %s' % line)
# guessed, guessed_type = parse_tag(features.pop())
# correct, correct_type = parse_tag(features.pop())
# first_item = features.pop(0)
# if first_item == options.boundary:
# guessed = 'O'
# end_correct = end_of_chunk(last_correct, correct,
# last_correct_type, correct_type)
# end_guessed = end_of_chunk(last_guessed, guessed,
# last_guessed_type, guessed_type)
# start_correct = start_of_chunk(last_correct, correct,
# last_correct_type, correct_type)
# start_guessed = start_of_chunk(last_guessed, guessed,
# last_guessed_type, guessed_type)
# if in_correct:
# if (end_correct and end_guessed and
# last_guessed_type == last_correct_type):
# in_correct = False
# counts.correct_chunk += 1
# counts.t_correct_chunk[last_correct_type] += 1
# elif (end_correct != end_guessed or guessed_type != correct_type):
# in_correct = False
# if start_correct and start_guessed and guessed_type == correct_type:
# in_correct = True
# if start_correct:
# counts.found_correct += 1
# counts.t_found_correct[correct_type] += 1
# if start_guessed:
# counts.found_guessed += 1
# counts.t_found_guessed[guessed_type] += 1
# if first_item != options.boundary:
# if correct == guessed and guessed_type == correct_type:
# counts.correct_tags += 1
# counts.token_counter += 1
# last_guessed = guessed
# last_correct = correct
# last_guessed_type = guessed_type
# last_correct_type = correct_type
# if in_correct:
# counts.correct_chunk += 1
# counts.t_correct_chunk[last_correct_type] += 1
# return counts
def evaluate(lines, options=None):
    """Count chunk- and token-level agreement over CoNLL-style tagged lines.

    Every non-empty line holds at least three delimiter-separated fields: the
    token first, the gold tag second to last and the predicted tag last. An
    empty line, or a line whose first field equals ``options.boundary``, is
    treated as a sentence boundary.

    Args:
        lines: iterable of text lines (for example a file object or a list).
        options: parsed options providing ``delimiter`` and ``boundary``;
            the defaults from ``parse_args([])`` are used when omitted.

    Returns:
        An ``EvalCounts`` instance holding the accumulated tallies.

    Raises:
        FormatError: if a non-empty line has fewer than three fields, or a
            different number of fields than the first non-empty line.
    """
    if options is None:
        options = parse_args([])    # use defaults

    counts = EvalCounts()
    num_features = None       # number of fields on the first non-empty line
    in_correct = False        # the chunk being processed is correct so far
    last_correct = 'O'        # previous gold chunk tag
    last_correct_type = ''    # type of the previous gold chunk tag
    last_guessed = 'O'        # previous predicted chunk tag
    last_guessed_type = ''    # type of the previous predicted chunk tag

    for line in lines:
        line = line.rstrip('\r\n')

        if options.delimiter == ANY_SPACE:
            features = line.split()
        elif line:
            features = line.split(options.delimiter)
        else:
            # ''.split(sep) yields [''], which would hide the blank line that
            # marks a sentence boundary.
            features = []

        # Only non-empty lines define or are checked against the field count,
        # so a leading blank line cannot pin the expected count to zero.
        if features:
            if num_features is None:
                num_features = len(features)
            elif num_features != len(features):
                raise FormatError('unexpected number of features: %d (%d)' %
                                  (len(features), num_features))

        if not features or features[0] == options.boundary:
            features = [options.boundary, 'O', 'O']
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)

        guessed, guessed_type = parse_tag(features.pop())
        correct, correct_type = parse_tag(features.pop())
        first_item = features.pop(0)

        if first_item == options.boundary:
            guessed = 'O'

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            if (end_correct and end_guessed and
                    last_guessed_type == last_correct_type):
                # Both chunks ended together with the same type: a match.
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                # Boundaries or types diverged: the open chunk is wrong.
                in_correct = False

        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.found_correct += 1
            counts.t_found_correct[correct_type] += 1
        if start_guessed:
            counts.found_guessed += 1
            counts.t_found_guessed[guessed_type] += 1
        if first_item != options.boundary:
            if correct == guessed and guessed_type == correct_type:
                counts.correct_tags += 1
            counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    # A chunk still open (and still correct) at end of input counts as a match.
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts
def uniq(iterable):
    """Return the distinct items of *iterable* as a list, in first-seen order."""
    seen = set()
    ordered = []
    for item in iterable:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered
def calculate_metrics(correct, guessed, total):
    """Derive precision, recall and F-score from raw chunk counts.

    Args:
        correct: number of predicted chunks that are correct.
        guessed: number of predicted chunks.
        total: number of gold chunks.

    Returns:
        A ``Metrics`` tuple ``(tp, fp, fn, prec, rec, fscore)``. Any ratio
        whose denominator is zero is reported as 0.
    """
    true_pos = correct
    false_pos = guessed - correct
    false_neg = total - correct

    predicted = true_pos + false_pos
    if predicted == 0:
        precision = 0
    else:
        precision = 1. * true_pos / predicted

    actual = true_pos + false_neg
    if actual == 0:
        recall = 0
    else:
        recall = 1. * true_pos / actual

    if precision + recall == 0:
        fscore = 0
    else:
        fscore = 2 * precision * recall / (precision + recall)

    return Metrics(true_pos, false_pos, false_neg, precision, recall, fscore)
def metrics(counts):
    """Compute overall and per-type metrics from accumulated counts.

    Args:
        counts: object exposing the chunk tallies (``correct_chunk``,
            ``found_guessed``, ``found_correct``) and their per-type
            ``t_*`` mappings.

    Returns:
        A pair ``(overall, by_type)`` where ``overall`` is the result of
        ``calculate_metrics`` on the corpus-wide tallies and ``by_type`` maps
        every chunk type seen in gold or prediction to its own result.
    """
    overall = calculate_metrics(
        counts.correct_chunk, counts.found_guessed, counts.found_correct
    )

    # Gold types first, then any types that only occur in the prediction.
    chunk_types = uniq(
        list(counts.t_found_correct) + list(counts.t_found_guessed)
    )
    by_type = {
        chunk_type: calculate_metrics(
            counts.t_correct_chunk[chunk_type],
            counts.t_found_guessed[chunk_type],
            counts.t_found_correct[chunk_type],
        )
        for chunk_type in chunk_types
    }
    return overall, by_type
# def report(counts, out=None):
# if out is None:
# out = sys.stdout
# overall, by_type = metrics(counts)
# c = counts
# out.write('processed %d tokens with %d phrases; ' %
# (c.token_counter, c.found_correct))
# out.write('found: %d phrases; correct: %d.\n' %
# (c.found_guessed, c.correct_chunk))
# if c.token_counter > 0:
# out.write('accuracy: %6.2f%%; ' %
# (100.*c.correct_tags/c.token_counter))
# out.write('precision: %6.2f%%; ' % (100.*overall.prec))
# out.write('recall: %6.2f%%; ' % (100.*overall.rec))
# out.write('FB1: %6.2f\n' % (100.*overall.fscore))
# for i, m in sorted(by_type.items()):
# out.write('%17s: ' % i)
# out.write('precision: %6.2f%%; ' % (100.*m.prec))
# out.write('recall: %6.2f%%; ' % (100.*m.rec))
# out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i]))
def report(counts, out=None):
    """Write a short evaluation summary and return the overall scores.

    Args:
        counts: tallies as produced by ``evaluate``.
        out: writable text stream; defaults to ``sys.stdout``.

    Returns:
        A dict with the keys ``"fb1"``, ``"precision"`` and ``"recall"``
        (percentages) when at least one token was processed, otherwise an
        empty dict.
    """
    if out is None:
        out = sys.stdout

    overall, _by_type = metrics(counts)

    out.write('processed %d tokens with %d phrases; ' %
              (counts.token_counter, counts.found_correct))
    out.write('found: %d phrases; correct: %d.\n' %
              (counts.found_guessed, counts.correct_chunk))

    results = {}
    if counts.token_counter > 0:
        results["fb1"] = 100.*overall.fscore
        results['precision'] = 100.*overall.prec
        results['recall'] = 100.*overall.rec
        # The score line is written only when the scores exist; formatting it
        # for an empty evaluation would fail on the missing keys.
        out.write('Precision:{}\t Recall:{}\tF1-score:{}\n'.format(
            results['precision'], results['recall'], results["fb1"]))

    # Per-type details (available from metrics() as by_type) are deliberately
    # not printed here.
    return results
# out.write('accuracy: %6.2f%%; ' %
# (100.*c.correct_tags/c.token_counter))
# out.write('precision: %6.2f%%; ' % (100.*overall.prec))
# out.write('recall: %6.2f%%; ' % (100.*overall.rec))
# out.write('FB1: %6.2f\n' % (100.*overall.fscore))
# for i, m in sorted(by_type.items()):
# out.write('%17s: ' % i)
# out.write('precision: %6.2f%%; ' % (100.*m.prec))
# out.write('recall: %6.2f%%; ' % (100.*m.rec))
# out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i]))
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk ended between the previous and the current word.

    Args:
        prev_tag: chunk tag of the previous word (e.g. ``'B'``, ``'I'``, ``'O'``).
        tag: chunk tag of the current word.
        prev_type: chunk type of the previous word.
        type_: chunk type of the current word.

    Returns:
        True if the previous word closed a chunk, False otherwise.
    """
    # 'E' and 'S' always close a chunk; bracket tags are assumed to mark
    # chunks of length 1.
    if prev_tag in ('E', 'S', ']', '['):
        return True

    # An open chunk ('B' or 'I') ends when a new chunk starts or we go outside.
    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
        return True

    # Otherwise only a change of type after a non-outside tag ends the chunk.
    return prev_tag not in ('O', '.') and prev_type != type_
def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk started between the previous and the current word.

    Args:
        prev_tag: chunk tag of the previous word (e.g. ``'B'``, ``'I'``, ``'O'``).
        tag: chunk tag of the current word.
        prev_type: chunk type of the previous word.
        type_: chunk type of the current word.

    Returns:
        True if the current word opens a chunk, False otherwise.
    """
    # 'B' and 'S' always open a chunk; bracket tags are assumed to mark
    # chunks of length 1.
    if tag in ('B', 'S', '[', ']'):
        return True

    # A continuation tag ('E' or 'I') with no open chunk before it starts one.
    if tag in ('E', 'I') and prev_tag in ('E', 'S', 'O'):
        return True

    # Otherwise only a change of type on a non-outside tag starts a chunk.
    return tag not in ('O', '.') and prev_type != type_
def main(argv):
    """Evaluate the file named on the command line (or stdin) and print a report.

    Args:
        argv: full argument vector; the first element (program name) is skipped.
    """
    args = parse_args(argv[1:])

    if args.file is not None:
        with open(args.file) as handle:
            counts = evaluate(handle, args)
    else:
        counts = evaluate(sys.stdin, args)

    report(counts)
def conll2002_measure(lines, verbose=False):
    """Evaluate tagged *lines* with default options and return the report dict.

    ``verbose`` is accepted but not used by this function.
    """
    return report(evaluate(lines, None))
def conlleval(label_predict, label_path, metric_path):
    """Score predictions with the external ``conlleval_rev.pl`` Perl script.

    Each sample is a whitespace-separated triple. The fields are written to
    ``label_path`` as "first third second", with a third field of ``'O'``
    replaced by ``'0'``. The Perl script is then run with that file on stdin
    and its stdout captured in ``metric_path``.

    :param label_predict: iterable of strings, three whitespace-separated
        fields each
    :param label_path: path of the label file to write (UTF-8)
    :param metric_path: path of the file receiving the script output
    :return: list of stripped lines read back from ``metric_path``
    """
    import io
    import subprocess

    eval_perl = "conlleval_rev.pl"

    rows = []
    for sample in label_predict:
        char, tag_, tag = sample.split()
        tag = '0' if tag == 'O' else tag
        # Format the text itself; formatting encoded bytes would write their
        # repr (b'...') into the file on Python 3.
        rows.append(u"{} {} {}\n".format(char, tag, tag_))

    with io.open(label_path, "w", encoding="utf-8") as fw:
        fw.writelines(rows)

    # Pass an argument list and redirect via file handles rather than a shell
    # string, so paths with spaces or shell metacharacters work.
    with open(label_path, "rb") as fin, open(metric_path, "wb") as fout:
        try:
            subprocess.call(["perl", eval_perl], stdin=fin, stdout=fout)
        except OSError as err:
            # A missing interpreter used to yield a shell error message and an
            # empty metric file; keep that outcome instead of raising.
            sys.stderr.write("could not run perl: %s\n" % err)

    with open(metric_path) as fr:
        return [row.strip() for row in fr]
def Evaluation(y_true, y_pred):
    """Build a per-tag classification report for sequences of tag sequences.

    Both arguments are flattened, binarized with a ``LabelBinarizer`` fitted
    on the gold tags, and passed to sklearn's ``classification_report``. The
    ``'O'`` tag is left out of the report; the remaining tags are ordered by
    the part after the first hyphen, then by the part before it.

    Args:
        y_true: iterable of gold tag sequences.
        y_pred: iterable of predicted tag sequences.

    Returns:
        Whatever ``classification_report`` returns for the selected tags
        (reported with 4 digits).
    """
    def _type_then_prefix(tag):
        # 'B-PER' -> ['PER', 'B'], so tags group by type before prefix.
        return tag.split('-', 1)[::-1]

    binarizer = LabelBinarizer()
    gold_matrix = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    pred_matrix = binarizer.transform(list(chain.from_iterable(y_pred)))

    reported_tags = sorted(set(binarizer.classes_) - {'O'},
                           key=_type_then_prefix)
    column_of = {cls: idx for idx, cls in enumerate(binarizer.classes_)}
    label_columns = [column_of[cls] for cls in reported_tags]

    return classification_report(
        gold_matrix,
        pred_matrix,
        labels=label_columns,
        target_names=reported_tags,
        digits=4
    )
def conlleval2(label_predict):
    """Build a classification report from "token predicted gold" strings.

    :param label_predict: iterable of strings with three whitespace-separated
        fields; the second is used as the predicted tag, the third as the
        gold tag
    :return: the result of ``Evaluation`` on the collected tags, treated as
        one single sequence
    """
    gold_tags = []
    predicted_tags = []
    for sent_result in label_predict:
        _token, predicted, gold = sent_result.split()
        gold_tags.append(gold)
        predicted_tags.append(predicted)
    return Evaluation([gold_tags], [predicted_tags])