-
Notifications
You must be signed in to change notification settings - Fork 0
/
obt.py
146 lines (119 loc) · 4.25 KB
/
obt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from os import path, getenv, remove, devnull
from subprocess import check_output
# Write-only handle on the null device; tag_bm passes it as the tagger's stderr.
# It is opened once at import time and stays open for the life of the module.
FNULL = open(devnull, mode='w')

# Installation directory of the Oslo-Bergen-Tagger, read from the environment.
# An unset variable and an empty one are treated the same: importing fails.
OBT_PATH = getenv("OBT_PATH", "")
if not OBT_PATH:
    raise EnvironmentError("Path to Oslo-Bergen-Tagger installation dir 'OBT_PATH' not set.")
# Known tagger features, grouped first by word class ("ordklasse") and then by
# feature category. assign_tags() walks the categories of a word class in the
# order written here, so the key order is significant.
# Category names are Norwegian: kjønn = gender, tall = number,
# best = definiteness, grad = degree, kasus = case, tid = tense.
TAGS = {
    "adj": {
        "kjønn": ["m/f", "nøyt", "fem"],
        "tall": ["ent", "fl"],
        "type": [
            "<adv>",
            "<ordenstall>",
            "<perf-part>",
            "<pres-part>",
            "fork",
        ],
        "best": ["ub", "be"],
        "grad": ["pos", "kom", "sup"],
    },
    "adv": {
        "type": ["fork"],
    },
    "det": {
        "kjønn": ["fem", "nøyt", "mask"],
        "tall": ["ent", "fl"],
        "type": [
            "dem",
            "dem <adj>",
            "<adj> forst",
            "<adj> kvant",
            "kvant",
            "poss",
            "poss res",
            "poss høflig",
            "sp",
            "forst",
        ],
        "best": ["ub", "be"],
    },
    "konj": {
        "type": ["<adv>", "clb"],
    },
    "prep": {
        "type": ["fork"],
    },
    "pron": {
        "kjønn": ["fem", "mask", "mask fem", "nøyt"],
        "tall": ["ent", "fl"],
        "type": [
            "hum res",
            "hum sp",
            "pers",
            "pers hum",
            "pers høflig",
            "poss hum sp",
            "refl",
            "sp",
            "res",
        ],
        "person": ["1", "2", "3"],
        "kasus": ["nom", "akk"],
    },
    "sbu": {
        "type": ["<spørreartikkel>"],
    },
    "subst": {
        "kjønn": ["nøyt", "fem", "mask"],
        "tall": ["ent", "fl"],
        "type": ["appell fork", "appell", "prop", "fork"],
        "best": ["ub", "be"],
        "kasus": ["gen"],
    },
    "verb": {
        "tid": ["pres inf pass", "pres", "inf", "pret", "perf-part", "imp"],
    },
}
def write_file(data, filepath, encoding="utf-8"):
    """Write ``data`` to ``filepath`` as text, replacing any existing content.

    Args:
        data: The string to write.
        filepath: Destination path. The file is created if it does not exist
            and truncated if it does.
        encoding: Text encoding used for the file. Defaults to UTF-8 so that
            non-ASCII characters are written the same way on every platform
            instead of following the locale's default encoding.
    """
    # Plain "w" is enough: the file is only written, never read back here.
    with open(filepath, "w", encoding=encoding) as f:
        f.write(data)
def assign_tags(word_tags):
    """Sort the features of one tagger reading into named categories.

    The first element of ``word_tags`` is taken as the word class; the
    remaining elements are its features. Features are matched greedily against
    the categories listed in ``TAGS`` for that word class: the longest run of
    remaining features (joined by single spaces) is tried first and shortened
    from the right until it matches. A single feature that matches nothing is
    collected under ``"tilleggstagger"``.

    Args:
        word_tags: Non-empty list of strings, word class first.

    Returns:
        A dict with ``"ordklasse"`` (the word class), ``"raw_tags"`` (all of
        ``word_tags`` joined by spaces), one entry per matched category, and
        ``"tilleggstagger"`` (a list) when unmatched features were found.

    Raises:
        IndexError: If ``word_tags`` is empty.
    """
    word_class = word_tags[0]
    features = word_tags[1:]
    result = {'ordklasse': word_class, 'raw_tags': ' '.join(word_tags)}

    # Word classes missing from TAGS simply have no categories to match.
    categories = TAGS.get(word_class, {})
    total = len(features)

    # features[start:end] is the run currently being tried; everything before
    # ``start`` has already been placed.
    start, end = 0, total
    while start < end:
        candidate = ' '.join(features[start:end])
        category = next(
            (name for name, values in categories.items() if candidate in values),
            None,
        )

        if category is not None:
            result[category] = candidate
            start, end = end, total
        elif end - start == 1:
            # A lone feature that fits no category is kept as an extra tag.
            result.setdefault("tilleggstagger", []).append(candidate)
            start, end = end, total
        else:
            # Nothing matched: drop the last feature and try a shorter run.
            end -= 1

    return result
def check_input(text, file):
    """Validate that exactly one usable input source was supplied.

    Args:
        text: The text to tag, or ``None``.
        file: Path of a file to tag, or ``None``.

    Raises:
        ValueError: If neither or both of ``text`` and ``file`` are given.
        FileNotFoundError: If ``file`` is given but is not an existing file.
    """
    has_text = text is not None
    has_file = file is not None

    if not (has_text or has_file):
        raise ValueError("No argument passed. Either pass a string or a filename using the file= kwarg")
    if has_text and has_file:
        raise ValueError("Both a string and file were passed as argument. Please only use one.")
    if has_file and not path.isfile(file):
        raise FileNotFoundError(f'Could not find file called "{file!s}"')
def save_json(tags, filename):
    """Serialize ``tags`` as JSON with two-space indentation and save it.

    Args:
        tags: JSON-serializable data, e.g. the list returned by ``tag_bm``.
        filename: Path of the file to write; passed on to ``write_file``.
    """
    import json

    serialized = json.dumps(tags, indent=2)
    write_file(serialized, filename)
def tag_bm(text=None, file=None, encoding="UTF-8"):
    """Run the tagger's ``tag-bm.sh`` script and parse its output.

    Exactly one of ``text`` and ``file`` must be given. Text is first written
    to a temporary file, because the script is invoked with a file path.

    Args:
        text: The text to tag, or ``None``.
        file: Path of an existing file to tag, or ``None``.
        encoding: Encoding used to decode the script's standard output.

    Returns:
        A list with one dict per ``<word>`` line in the output, in output
        order. Each dict is the result of ``assign_tags`` for that word plus
        ``"word"``, ``"word_tag"`` and ``"base"``.

    Raises:
        ValueError: If neither or both of ``text`` and ``file`` are given.
        FileNotFoundError: If ``file`` is not an existing file.
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        IndexError: If a ``<word>`` line is not followed by the two lines the
            parser reads, or the reading line carries no word class.
    """
    from tempfile import NamedTemporaryFile

    check_input(text, file)
    script = path.join(OBT_PATH, "tag-bm.sh")

    if text is not None:
        # A unique temporary file avoids clashes between concurrent callers,
        # which a fixed path such as /tmp/obtfile.txt cannot.
        with NamedTemporaryFile(prefix="obt", suffix=".txt", delete=False) as handle:
            temp_file = handle.name
        try:
            write_file(text, temp_file)
            raw_output = check_output([script, temp_file], stderr=FNULL)
        finally:
            # Clean up even when writing or the tagger call fails.
            remove(temp_file)
    else:
        raw_output = check_output([script, file], stderr=FNULL)

    lines = raw_output.decode(encoding).split("\n")

    tags = []
    # Enumerate instead of lines.index(): index() returns the first match, so
    # a repeated word would be paired with its first occurrence's analysis.
    for index, line in enumerate(lines):
        if not line.startswith("<word>"):
            continue

        # Drop the surrounding "<word>" (6 chars) and "</word>" (7 chars).
        word = line.strip()[6:-7]
        # The next line holds the word tag wrapped in one character each side.
        word_tag = lines[index + 1].strip()[1:-1]
        # The line after that holds the wrapped base form followed by the
        # word class and its features, separated by whitespace.
        reading = lines[index + 2].strip().split()
        base = reading[0][1:-1]

        tag = assign_tags(reading[1:])
        tag["word"] = word
        tag["word_tag"] = word_tag
        tag["base"] = base
        tags.append(tag)

    return tags