-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathloader.py
152 lines (131 loc) · 4.96 KB
/
loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import re
import codecs
from data_utils import iob2, iob_iobes, create_dico, create_mapping, get_seg_features, get_sub_features,get_doc_features
def load_sentences(path):
    """Load sentences from a whitespace-delimited, CoNLL-style file.

    Each non-blank line holds one token's columns (character/word first,
    tag last) separated by whitespace; blank lines separate sentences.
    A line whose first character is a space has that space replaced by
    "$" so the token column survives ``split()``.

    Args:
        path: path to the UTF-8 encoded data file.

    Returns:
        A list of sentences, each a list of per-token column lists.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = line.rstrip()
        if not line:
            # Blank line: close the current sentence, if any.
            if sentence:
                sentences.append(sentence)
                sentence = []
        else:
            if line[0] == " ":
                # The token itself is a space; substitute a "$" placeholder
                # so the column count is preserved after splitting.
                line = "$" + line[1:]
            sentence.append(line.split())
    # Flush the last sentence when the file doesn't end with a blank line.
    if sentence:
        sentences.append(sentence)
    return sentences
def update_tag_scheme(sentences, tag_scheme):
    """Validate every sentence's tags as IOB and convert them in place.

    Args:
        sentences: list of sentences; each token is a column list whose
            last element is the tag. Tags are rewritten in place.
        tag_scheme: target scheme, either 'iob' or 'iobes'.

    Raises:
        Exception: if a sentence is not valid IOB, or if `tag_scheme` is
            not one of the supported values.
    """
    for i, s in enumerate(sentences):
        tags = [w[-1] for w in s]
        # iob2() validates the tag sequence (and presumably normalizes
        # IOB1 -> IOB2 in place -- see data_utils).
        if not iob2(tags):
            s_str = '\n'.join(' '.join(w) for w in s)
            # Fixed: the two concatenated literals previously ran together
            # as "format!please" with no separating space.
            raise Exception('Sentences should be given in IOB format! '
                            + 'Please check sentence %i:\n%s' % (i, s_str))
        if tag_scheme == 'iob':
            # Write back the (possibly normalized) IOB tags.
            for word, new_tag in zip(s, tags):
                word[-1] = new_tag
        elif tag_scheme == 'iobes':
            new_tags = iob_iobes(tags)
            for word, new_tag in zip(s, new_tags):
                word[-1] = new_tag
        else:
            # Fixed typo: "Unknow" -> "Unknown".
            raise Exception('Unknown tagging scheme!')
def char_mapping(sentences, lower):
    """Build a character-frequency dictionary and index mappings.

    Args:
        sentences: list of sentences of per-token column lists; column 0
            holds the character/word.
        lower: when true, lowercase every character before counting.

    Returns:
        (frequency dict, char-to-id mapping, id-to-char mapping).
    """
    def normalize(token):
        return token.lower() if lower else token

    chars = [[normalize(w[0]) for w in sentence] for sentence in sentences]
    dico = create_dico(chars)
    # Huge counts pin the special symbols to the top of the frequency order.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(len(sentence_chars) for sentence_chars in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """Augment `dictionary` with words that have a pretrained embedding.

    If `chars` is None, every pretrained word is added; otherwise only words
    from `chars` are added, and only when their surface form, lowercase form,
    or digit-normalized lowercase form appears in the pretrained vocabulary.

    Args:
        dictionary: word -> frequency dict, mutated in place (added words
            get frequency 0).
        ext_emb_path: path to a pretrained-embedding text file whose first
            whitespace-separated column is the word.
        chars: iterable of candidate words, or None to add everything.

    Returns:
        (dictionary, word_to_id, id_to_word).
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)
    # First column of each embedding line is the word itself.
    pretrained = {
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
    }
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if char in dictionary:
                continue
            # Fixed: regex pattern is now a raw string -- '\d' in a plain
            # literal is an invalid escape (SyntaxWarning on Python >= 3.12).
            variants = (
                char,
                char.lower(),
                re.sub(r'\d', '0', char.lower()),
            )
            if any(v in pretrained for v in variants):
                dictionary[char] = 0
    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def tag_mapping(sentences):
    """Collect every named-entity tag and build tag/id mappings.

    Args:
        sentences: list of sentences of per-token column lists; the last
            column of each token is the tag.

    Returns:
        (tag-frequency dict, tag-to-id mapping, id-to-tag mapping).
    """
    all_tags = []
    for sentence in sentences:
        all_tags.append([token[-1] for token in sentence])
    dico = create_dico(all_tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def prepare_dataset_(sentences, char_to_id, tag_to_id, train = True):
    """Turn token/tag sentences into index-based feature examples.

    Each output item is [string, doc_chars, chars, segs, subtypes, tags]:
    raw characters, document-level features, character ids, entity-type
    segment features, entity-subtype features, and tag ids.

    Args:
        sentences: list of sentences; each token row is expected to be
            [char, doc_id, ..., entity_type, entity_subtype, tag].
        char_to_id: char -> id mapping; unknown chars fall back to '<UNK>'.
        tag_to_id: tag -> id mapping; must contain 'O'.
        train: when False, every tag is replaced by the 'O' index.

    Returns:
        A list of per-sentence feature lists.
    """
    none_index = tag_to_id['O']
    data = []
    import json
    # Document-level lookup table: one JSON object on the first line.
    doc_file = "doc.utf8"
    with codecs.open(doc_file, 'r', 'utf-8') as f:
        data_doc = f.readlines()
        doc_dict = json.loads(data_doc[0])
    for s in sentences:
        string = [w[0] for w in s]
        doc_id = [w[1] for w in s]
        # Map characters to ids, falling back to the <UNK> id.
        chars = [char_to_id[w if w in char_to_id else '<UNK>']
                 for w in string]
        # NOTE(review): this variant passes `string` (raw chars) to
        # get_doc_features while prepare_dataset passes `chars` (ids) --
        # confirm which argument get_doc_features actually expects.
        doc_chars = get_doc_features(doc_id, char_to_id, doc_dict, string)
        entity_tags = [w[-3] for w in s]
        segs = get_seg_features(string, entity_tags)
        entity_subtype = [w[-2] for w in s]
        subtypes = get_sub_features(string, entity_subtype)
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            # Inference mode: fill with the 'O' tag id.
            tags = [none_index for _ in chars]
        data.append([string, doc_chars, chars, segs, subtypes, tags])
    return data
def prepare_dataset(sentences, char_to_id, tag_to_id, train = True):
    """Build index-based examples, skipping "..." tokens and short sentences.

    Reads the document-level feature dictionary from "doc_dict.utf8" (one
    JSON object on the first line), then converts each sentence into
    [string, doc_chars, chars, types, subtypes, tags]. Tokens whose surface
    form is "..." are dropped, and any sentence left with four or fewer
    characters is skipped entirely.

    Args:
        sentences: list of sentences; each token row is expected to be
            [char, doc_id, ..., entity_type, entity_subtype, tag].
        char_to_id: char -> id mapping; unknown chars fall back to '<UNK>'.
        tag_to_id: tag -> id mapping; must contain 'O'.
        train: when False, every tag is replaced by the 'O' index.

    Returns:
        A list of per-sentence feature lists.
    """
    none_index = tag_to_id['O']
    import json
    # Document-level lookup table: one JSON object on the first line.
    with codecs.open("doc_dict.utf8", 'r', 'utf-8') as f:
        doc_dict = json.loads(f.readlines()[0])

    data = []
    for s in sentences:
        string, doc_id, entity_types, entity_subtype, tags = [], [], [], [], []
        for w in s:
            # Ellipsis placeholder tokens are dropped from the sentence.
            if w[0] == "...":
                continue
            string.append(w[0])
            doc_id.append(w[1])
            entity_types.append(w[-3])
            entity_subtype.append(w[-2])
            tags.append(w[-1])
        # Skip sentences with four or fewer remaining characters.
        if len(string) <= 4:
            continue
        chars = [char_to_id[c] if c in char_to_id else char_to_id['<UNK>']
                 for c in string]
        doc_chars = get_doc_features(doc_id, char_to_id, doc_dict, chars)
        types = get_seg_features(string, entity_types)
        subtypes = get_sub_features(string, entity_subtype)
        if train:
            tags = [tag_to_id[t] for t in tags]
        else:
            # Inference mode: fill with the 'O' tag id.
            tags = [none_index for _ in chars]
        data.append([string, doc_chars, chars, types, subtypes, tags])
    return data