forked from spyysalo/standoff2conll
-
Notifications
You must be signed in to change notification settings - Fork 2
/
standoff2conll.py
executable file
·168 lines (133 loc) · 5.84 KB
/
standoff2conll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python
from __future__ import print_function
import sys
import os
import codecs
from logging import error
from document import Document
from common import pairwise
from asciify import document_to_ascii
from unicode2ascii import log_missing_ascii_mappings
from tagsequence import TAGSETS, IO_TAGSET, IOBES_TAGSET, DEFAULT_TAGSET
from tagsequence import BIO_to_IO, BIO_to_IOBES
from standoff import OVERLAP_RULES, load_postags_into_document
# Numeric identifiers for the supported output formats; the keys match the
# values accepted by the --process command-line option.
OUTPUT_TYPES = dict(CONLL=0, ROTHANDYIH=1)
def argparser():
    """Build the command-line argument parser for the converter.

    Returns:
        An ``argparse.ArgumentParser`` accepting one positional DIRECTORY
        argument plus the conversion options defined below.
    """
    from argparse import ArgumentParser

    tagset_help = 'tagset (default %s)' % DEFAULT_TAGSET

    parser = ArgumentParser(
        description='Convert standoff to CoNLL format',
        usage='%(prog)s [OPTIONS] DIRECTORY',
    )
    add = parser.add_argument

    add('directory')
    add('-1', '--singletype', default=None, metavar='TYPE',
        help='replace all annotation types with TYPE')
    add('-a', '--asciify', default=None, action='store_true',
        help='map input to ASCII')
    add('-n', '--no-sentence-split', default=False, action='store_true',
        help='do not perform sentence splitting')
    add('-o', '--overlap-rule', choices=OVERLAP_RULES,
        default=OVERLAP_RULES[0],
        help='rule to apply to resolve overlapping annotations')
    add('-s', '--tagset', choices=TAGSETS, default=None, help=tagset_help)
    # NOTE(review): --postag reuses the --tagset choices and help text, which
    # looks like a copy-paste leftover -- confirm the intended semantics.
    add('-p', '--postag', choices=TAGSETS, default=None, help=tagset_help)
    add('--process', choices=['CONLL','ROTHANDYIH'], default='CONLL',
        help='switch between processes for the output format CONLL, or ROTHANDYIH')
    add('--process_pos_tag_input',
        help='the pos tag input file used for ROTHANDYIH')
    return parser
def is_standoff_file(fn):
    """Return True when the file name ``fn`` has the ``.ann`` extension."""
    _stem, extension = os.path.splitext(fn)
    return extension == '.ann'
def txt_for_ann(filename):
    """Return ``filename`` with its extension replaced by ``.txt``.

    A name without an extension simply gets ``.txt`` appended.
    """
    stem, _extension = os.path.splitext(filename)
    return stem + '.txt'
def read_ann(filename, options, encoding='utf-8', filepos=False):
    """Read a standoff annotation file and its text file into a Document.

    Args:
        filename: path of the annotation file; the text file path is
            derived from it with ``txt_for_ann``.
        options: object providing ``no_sentence_split`` and
            ``overlap_rule`` attributes (e.g. the parsed command-line
            options).
        encoding: character encoding used to decode both files.
        filepos: passed through unchanged to ``Document.from_standoff``.

    Returns:
        The value returned by ``Document.from_standoff``.
    """
    txtfilename = txt_for_ann(filename)
    # Plain 'r' rather than 'rU': the 'U' flag is rejected by open() and
    # codecs.open() on Python 3.11+. codecs.open() with an encoding reads the
    # underlying file in binary mode, so dropping the flag does not change
    # what is read on Python 3.
    with codecs.open(txtfilename, 'r', encoding=encoding) as t_in:
        text = t_in.read()
    with codecs.open(filename, 'r', encoding=encoding) as a_in:
        annotations = a_in.read()
    return Document.from_standoff(
        text, annotations,
        sentence_split=not options.no_sentence_split,
        overlap_rule=options.overlap_rule,
        filepos=filepos
    )
def replace_types_with(document, type_):
    """Rewrite, in place, every non-outside tag so that its type is ``type_``.

    Tokens whose tag equals ``OUT_TAG`` are left alone. For all others the
    first element returned by ``parse_tag`` is kept and combined with
    ``type_`` through ``make_tag``.
    """
    from tagsequence import OUT_TAG, parse_tag, make_tag

    for sent in document.sentences:
        for tok in sent.tokens:
            if tok.tag != OUT_TAG:
                # presumably the B/I position marker -- verify against
                # tagsequence.parse_tag
                leading_part = parse_tag(tok.tag)[0]
                tok.tag = make_tag(leading_part, type_)
def retag_document(document, tagset):
    """Convert, in place, the token tags of ``document`` to ``tagset``.

    Only ``IO_TAGSET`` and ``IOBES_TAGSET`` are supported as targets.

    Raises:
        ValueError: if ``tagset`` is neither of the supported tagsets.
    """
    if tagset not in (IO_TAGSET, IOBES_TAGSET):
        raise ValueError('tagset {}'.format(tagset))
    convert = BIO_to_IO if tagset == IO_TAGSET else BIO_to_IOBES

    for sent in document.sentences:
        # Each token is converted together with the tag of the token that
        # follows it; the last token of a sentence is paired with None.
        for current, following in pairwise(sent.tokens, include_last=True):
            following_tag = None if following is None else following.tag
            current.tag = convert(current.tag, following_tag)
def convert_directory_conll(directory, options):
    """Convert every ``.ann`` file in ``directory`` to CoNLL output.

    Files are processed in sorted path order. Depending on ``options``,
    each document optionally has its annotation types replaced
    (``singletype``), is retagged (``tagset``) and is mapped to ASCII
    (``asciify``) before being serialised with ``Document.to_conll``.

    Args:
        directory: path of the directory to scan (not recursive).
        options: parsed options; must provide ``singletype``, ``tagset``
            and ``asciify`` plus whatever ``read_ann`` needs.

    Returns:
        The concatenated output encoded as UTF-8 bytes, or ``None`` (after
        logging an error) when the directory holds no standoff files.
    """
    files = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if is_standoff_file(name)
    )
    if not files:
        error('No standoff files in {}'.format(directory))
        return None

    # Collect per-document output and join once instead of repeatedly
    # concatenating onto an ever-growing string.
    chunks = []
    for fn in files:
        document = read_ann(fn, options)
        if options.singletype:
            replace_types_with(document, options.singletype)
        if options.tagset:
            retag_document(document, options.tagset)
        if options.asciify:
            document_to_ascii(document)
        chunks.append(document.to_conll())
    return ''.join(chunks).encode('utf-8')
def convert_directory_rothandyih(directory, options, filepos):
    """Convert every ``.ann`` file in ``directory`` to Roth-and-Yih output.

    Files are processed in sorted path order. Each document goes through
    the same optional steps as the CoNLL conversion (``singletype``,
    ``tagset``, ``asciify``), then has POS tags loaded from the ``filepos``
    file before being serialised with ``Document.to_rothandyih``.

    Args:
        directory: path of the directory to scan (not recursive).
        options: parsed options; must provide ``singletype``, ``tagset``
            and ``asciify`` plus whatever ``read_ann`` needs.
        filepos: path of the POS tag input file. It is also forwarded to
            ``read_ann`` and ``load_postags_into_document``.

    Returns:
        The concatenated output encoded as UTF-8 bytes, or ``None`` (after
        logging an error) when the directory holds no standoff files.

    Raises:
        ValueError: if standoff files exist but no POS tag file was given.
    """
    files = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if is_standoff_file(name)
    )
    if not files:
        error('No standoff files in {}'.format(directory))
        return None

    # Without this check open(False) would silently read file descriptor 0
    # (stdin) and open(None) would fail with an unhelpful TypeError.
    if filepos is None or filepos is False:
        raise ValueError(
            'a POS tag input file is required for ROTHANDYIH output')

    with open(filepos) as f:
        lines = f.readlines()

    chunks = []
    # Value returned by load_postags_into_document, fed back into the next
    # call; presumably a cursor into `lines` -- verify against standoff.py.
    previous_position = 0
    for fn in files:
        document = read_ann(fn, options, filepos=filepos)
        if options.singletype:
            replace_types_with(document, options.singletype)
        if options.tagset:
            retag_document(document, options.tagset)
        if options.asciify:
            document_to_ascii(document)
        previous_position = load_postags_into_document(
            document, filepos, previous_position, lines)
        chunks.append(document.to_rothandyih())
    return ''.join(chunks).encode('utf-8')
def conversion_entry(argv, which, filepos=False):
    """Programmatic entry point: run a conversion from an option list.

    Args:
        argv: command-line style arguments *without* the program name,
            e.g. ``['-a', 'some/directory']``. An already parsed options
            object (anything with a ``directory`` attribute) is accepted
            as well and used as is.
        which: one of the ``OUTPUT_TYPES`` values selecting the format.
        filepos: POS tag input file path for the ROTHANDYIH format.

    Returns:
        Whatever ``convert_and_return`` returns.

    Note:
        Invalid arguments make ``argparse`` exit the process.
    """
    # convert_and_return reads attributes such as ``.directory``, so a raw
    # argument list has to be parsed first; passing the list straight
    # through fails with AttributeError.
    if hasattr(argv, 'directory'):
        options = argv
    else:
        options = argparser().parse_args(list(argv))
    return convert_and_return(options, which, filepos)
def convert_and_return(argv, which, filepos):
    """Run the selected conversion over ``argv.directory``.

    Args:
        argv: parsed options object (despite the name). It must provide
            ``directory`` and ``asciify`` and is passed on as the options
            of the directory converters.
        which: one of the ``OUTPUT_TYPES`` values selecting the format.
        filepos: POS tag input file path, used only for ROTHANDYIH.

    Returns:
        The converter's result, or the integer ``1`` (after logging an
        error) when ``argv.directory`` is not a directory.

    Raises:
        ValueError: if ``which`` is not a known output type.
    """
    if not os.path.isdir(argv.directory):
        error('Not a directory: {}'.format(argv.directory))
        return 1

    if which == OUTPUT_TYPES['CONLL']:
        data = convert_directory_conll(argv.directory, argv)
    elif which == OUTPUT_TYPES['ROTHANDYIH']:
        data = convert_directory_rothandyih(argv.directory, argv, filepos)
    else:
        # Without this branch `data` would be unbound below and the caller
        # would get an UnboundLocalError.
        raise ValueError('unknown output type {}'.format(which))

    if argv.asciify:
        log_missing_ascii_mappings()
    return data
def main(argv):
    """Command-line entry point.

    Args:
        argv: full argument vector including the program name at index 0
            (e.g. ``sys.argv``).

    Returns:
        The process exit status: ``0`` after the converted data was
        written to standard output, non-zero when the conversion produced
        nothing to write.
    """
    options = argparser().parse_args(argv[1:])
    # The POS tag input file is only relevant for the ROTHANDYIH format.
    if options.process == 'ROTHANDYIH':
        filepos = options.process_pos_tag_input
    else:
        filepos = False
    data = convert_and_return(options, OUTPUT_TYPES[options.process], filepos)

    if data is None:
        # The converter already logged why there is no output.
        return 1
    if isinstance(data, int):
        # convert_and_return signals failure with an integer status.
        return data

    # The converters return UTF-8 encoded bytes. On Python 3 these must go
    # to the binary buffer behind sys.stdout; Python 2's sys.stdout has no
    # ``buffer`` attribute and accepts byte strings directly.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    out.write(data)
    return 0
if __name__ == '__main__':
    # Run as a script: use main()'s return value as the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)