-
Notifications
You must be signed in to change notification settings - Fork 1
/
napeas_anns.py
143 lines (114 loc) · 3.94 KB
/
napeas_anns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import utils
import json
class NapEasyAnn(object):
    """
    NapEasy annotation class

    A plain record describing one annotation: the selector it was found
    under, the index of the text node, the start/end offsets, the annotated
    text and a type label.  The numeric fields start at -1 and the other
    fields at None until a caller assigns them.
    """

    def __init__(self, id):
        # The parameter is named ``id`` (shadowing the builtin) because that
        # is the established constructor signature.
        self._id = id
        self._selector = None
        self._text_node_index = -1
        self._offset_start = -1
        self._offset_end = -1
        self._text = None
        self._type = None

    @property
    def id(self):
        """Identifier given at construction time (read-only)."""
        return self._id

    @property
    def selector(self):
        """Selector string of the annotated element (None until set)."""
        return self._selector

    @selector.setter
    def selector(self, value):
        self._selector = value

    @property
    def text_node_index(self):
        """Index of the annotated text node (-1 until set)."""
        return self._text_node_index

    @text_node_index.setter
    def text_node_index(self, value):
        self._text_node_index = value

    @property
    def offset_start(self):
        """Start offset of the annotation (-1 until set)."""
        return self._offset_start

    @offset_start.setter
    def offset_start(self, value):
        self._offset_start = value

    @property
    def offset_end(self):
        """End offset of the annotation (-1 until set)."""
        return self._offset_end

    @offset_end.setter
    def offset_end(self, value):
        self._offset_end = value

    @property
    def text(self):
        """Annotated text (None until set)."""
        return self._text

    @text.setter
    def text(self, value):
        self._text = value

    @property
    def type(self):
        """Type label of the annotation (None until set)."""
        return self._type

    @type.setter
    def type(self, value):
        self._type = value

    def __repr__(self):
        # %-formatting keeps this usable on the Python 2 interpreter the
        # rest of the file targets.
        return (
            '%s(id=%r, selector=%r, text_node_index=%r, offset_start=%r, '
            'offset_end=%r, text=%r, type=%r)'
        ) % (
            self.__class__.__name__,
            self._id,
            self._selector,
            self._text_node_index,
            self._offset_start,
            self._offset_end,
            self._text,
            self._type,
        )
def output_ann(ann):
    """
    Render one annotation as a single tab-separated line.

    The six columns are, in order: selector, text node index, start offset,
    end offset, text and type.  No trailing newline is appended.
    """
    columns = (
        ann.selector,
        ann.text_node_index,
        ann.offset_start,
        ann.offset_end,
        ann.text,
        ann.type,
    )
    return '%s\t%s\t%s\t%s\t%s\t%s' % columns
def _read_name_mapping(short_name_file):
    """
    Build the alias -> short name lookup from a tab-separated file.

    The first column of each line is the short name; every non-blank later
    column is an alias that maps to it.  Returns an empty dict when
    ``short_name_file`` is None.
    """
    name_mapping = {}
    if short_name_file is None:
        return name_mapping
    for line in utils.read_text_file(short_name_file):
        columns = line.split('\t')
        short_name = columns[0]
        for alias in columns[1:]:
            # Store the stripped alias: blankness is judged on the stripped
            # value, so keeping stray whitespace (e.g. a trailing newline on
            # the last column) would make the key impossible to match.
            alias = alias.strip()
            if alias:
                name_mapping[alias] = short_name
    return name_mapping


def process_anns(ann_file, short_name_file=None):
    """
    Print a summary report of the annotations stored in ``ann_file``.

    Each non-blank line of ``ann_file`` is ``<article>\\t<json>``, where the
    JSON object maps an annotation id to a dict with a ``meta`` label and a
    ``selected`` list.  Only the first element of ``selected`` is used; it
    supplies ``loc``, ``r_index``, ``r_start``, ``r_end`` and optionally
    ``text`` (``'---'`` is used when absent).  Annotations whose ``r_index``
    is -1 are skipped.

    ``short_name_file`` is optional; when given, ``meta`` labels listed in it
    are replaced by their short name.

    Printed, in order: the name mapping, the report (per-article annotation
    lines, the sorted unique texts per type, the number of distinct text
    nodes per type), then one line per type with its mention count and the
    number of distinct articles it appears in.  Returns None.
    """
    name_mapping = _read_name_mapping(short_name_file)
    print(name_mapping)

    # Report fragments are collected in a list and joined once at the end.
    report = []
    type2item = {}          # type -> annotated texts, in encounter order
    type2textnodes = {}     # type -> set of "article selector index" ids
    type2num_mentions = {}  # type -> number of annotations
    type2articles = {}      # type -> set of articles containing the type

    for line in utils.read_text_file(ann_file):
        if not line.strip():
            # A blank line (e.g. at the end of the file) has no JSON column.
            continue
        columns = line.split('\t')
        article = columns[0]
        id2anns = json.loads(columns[1])
        report.append(article + '\n')
        for ann_id in id2anns:
            raw = id2anns[ann_id]
            selection = raw['selected'][0]
            if selection['r_index'] == -1:
                continue
            ann = NapEasyAnn(ann_id)
            ann.selector = selection['loc']
            ann.text_node_index = selection['r_index']
            ann.offset_start = selection['r_start']
            ann.offset_end = selection['r_end']
            ann.text = selection['text'] if 'text' in selection else '---'
            meta = raw['meta']
            # Look up with surrounding whitespace removed to agree with the
            # stripped alias keys; fall back to the untouched label.
            lookup_key = meta.strip() if hasattr(meta, 'strip') else meta
            ann.type = name_mapping.get(lookup_key, meta)
            report.append(output_ann(ann) + '\n')

            type2num_mentions[ann.type] = type2num_mentions.get(ann.type, 0) + 1
            type2articles.setdefault(ann.type, set()).add(article)
            tnode_id = article + ' ' + ann.selector + ' ' + str(ann.text_node_index)
            type2textnodes.setdefault(ann.type, set()).add(tnode_id)
            type2item.setdefault(ann.type, []).append(ann.text)
        report.append('\n')

    report.append('vocabularies\n')
    for ann_type in type2item:
        vocabulary = sorted(set(text.strip() for text in type2item[ann_type]))
        report.append('%s\t%s\n' % (ann_type, len(vocabulary)))
        report.append('\n'.join(vocabulary))
        report.append('\n\n')

    report.append('\ntype to node number\n')
    for ann_type in type2textnodes:
        report.append('%s\t%s\n' % (ann_type, len(type2textnodes[ann_type])))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(''.join(report))
    for ann_type in type2num_mentions:
        num_articles = len(type2articles[ann_type])
        print('%s\t%s\t%s' % (ann_type, type2num_mentions[ann_type], num_articles))
def main(ann_file='./resources/napeasy_anns_v2.txt',
         short_name_file='./resources/short_names.txt'):
    """
    Print the annotation report for the given input files.

    Both paths default to the bundled resource files, so calling ``main()``
    with no arguments behaves as before; pass other paths to report on a
    different annotation dump or short-name table.
    """
    process_anns(ann_file, short_name_file)
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()