-
Notifications
You must be signed in to change notification settings - Fork 0
/
evalsegmenter.py
60 lines (45 loc) · 2.02 KB
/
evalsegmenter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
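"""Evaluate a segmenter's predictions against a gold-standard segmentation.

Both inputs are text files in which segments are separated by whitespace.
The script compares the segment boundaries of the two files and appends one
tab-separated row (segmenter name, gold corpus name, character set,
accuracy, precision, recall) to ``segs.tsv`` in the current directory.

Usage:
    $ python3 evalsegmenter.py <segmenter's output file> <gold standard file>
"""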
import re
import sys
# Both paths are required; exit with a usage hint rather than a bare
# IndexError when one of them is missing.
if len(sys.argv) < 3:
    sys.exit("Usage: python3 evalsegmenter.py "
             "<segmenter's output file> <gold standard file>")

# the segmenter in question's output
in_filename = sys.argv[1]
# Decode explicitly so the result does not depend on the platform's default
# locale encoding (a wrong single-byte codec can turn payload bytes into
# whitespace and therefore into bogus segment breaks).
# NOTE(review): assumes both files are UTF-8 -- confirm for your corpora.
with open(in_filename, encoding='utf-8') as f:
    in_file = f.read()

# the gold standard for comparison
gold_standard_filename = sys.argv[2]
with open(gold_standard_filename, encoding='utf-8') as g:
    gold_file = g.read()

# Turn each text into a list of segments for evaluation.
# str.split() with no separator splits on runs of whitespace of any kind and
# ignores leading/trailing whitespace, so no empty entries are produced
# (e.g. for the newline that normally ends a file).
test_list = in_file.split()
gold_list = gold_file.split()
def s_and_c(ls):
    """Encode a segmentation as one label per gap between adjacent characters.

    Each segment of length n contributes n - 1 ``'c'`` labels (the segment
    "continues" across that gap), and consecutive segments are joined by a
    single ``'s'`` label (a "split" falls in that gap).  For a text of N
    characters the result therefore has N - 1 labels.

    Empty segments are skipped: they contain no characters, so letting them
    take part in the join would add ``'s'`` labels that correspond to no gap.

    Args:
        ls: iterable of segment strings.

    Returns:
        A string over the alphabet {'s', 'c'}; empty if there are fewer than
        two characters in total.

    >>> s_and_c(['ab', 'cd'])
    'csc'
    >>> s_and_c(['', 'ab', 'cd', ''])
    'csc'
    """
    return 's'.join('c' * (len(segment) - 1) for segment in ls if segment)
# Encode both segmentations as strings of 's' (segment break) and
# 'c' (segment continues) labels so they can be compared position by position.
sc_test_segs = s_and_c(test_list)
sc_gold_segs = s_and_c(gold_list)

# zip() below stops at the shorter string, so a length mismatch would
# otherwise go unnoticed; it usually means the two files do not contain the
# same underlying text.
if len(sc_test_segs) != len(sc_gold_segs):
    print('warning: label sequences differ in length '
          '(test: {}, gold: {}); are both files the same text?'
          .format(len(sc_test_segs), len(sc_gold_segs)), file=sys.stderr)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# compare the two in a single pass
matching_labels = 0   # positions where prediction and gold agree
true_positives = 0    # positions where both mark a segment break
for gold_label, pred_label in zip(sc_gold_segs, sc_test_segs):
    if gold_label == pred_label:
        matching_labels += 1
        if pred_label == 's':
            true_positives += 1

accuracy = _ratio(matching_labels, len(sc_gold_segs))
# precision: share of predicted breaks that are also breaks in the gold file
precision = _ratio(true_positives, sc_test_segs.count('s'))
# recall: share of gold breaks that were predicted, i.e. TP / (TP + FN).
# The denominator is the number of gold breaks, not the number of predicted
# breaks plus the misses (which would also count false positives).
recall = _ratio(true_positives, sc_gold_segs.count('s'))

# assuming filename format "CORPUSsegs/segmenter-trad|simp-segmented.txt"
# Only the last path component is parsed, so the directory part may have any
# depth (or be absent) and may itself contain '-'.
segmenter_name, _, name_rest = in_filename.rsplit('/', 1)[-1].partition('-')
segmenter = segmenter_name.upper()
chars = name_rest.split('-')[0]

# assuming filename format "CORPUSsegs/corpus-trad|simp-segmented.txt"
gold = gold_standard_filename.rsplit('/', 1)[-1].partition('-')[0].upper()

# Append, so that successive runs accumulate rows in one results table.
with open('segs.tsv', 'a', encoding='utf-8') as outfile:
    print(segmenter, gold, chars, accuracy, precision, recall,
          sep='\t', file=outfile)