# TokenLimitedColumnDataset.py
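"""Length-limited readers for CoNLL-style column data, built on flair.

TokenLimitedColumnCorpus locates train/dev/test files in a data folder and wraps
them in a flair Corpus; TokenLimitedColumnDataset reads a single column-formatted
file, optionally converts tags to the BIOES scheme, and skips or window-splits
sentences that fall outside the configured minimum/maximum length.
"""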
import re
from logging import INFO
from pathlib import Path
from typing import Dict, List, Union

from flair.data import Sentence, Token, Corpus
from flair.datasets import log
from torch.utils.data import Dataset, random_split


class TokenLimitedColumnCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_bioes=None,
        in_memory: bool = True,
        min_sentence_length: int = -1,
        max_sentence_length: int = -1,
        stride: int = 4,
        evaluation: bool = False,
        log_level=INFO,
    ):
        """
        Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param tag_to_bioes: whether to convert to BIOES tagging scheme
        :return: a TaggedCorpus with annotated train, dev and test data
        """
        if type(data_folder) == str:
            data_folder: Path = Path(data_folder)

        if train_file is not None:
            train_file = data_folder / train_file
        if test_file is not None:
            test_file = data_folder / test_file
        if dev_file is not None:
            dev_file = data_folder / dev_file

        # automatically identify train / test / dev files
        if train_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if file_name.endswith(".gz"):
                    continue
                if "train" in file_name and "54019" not in file_name:
                    train_file = file
                if "dev" in file_name:
                    dev_file = file
                if "testa" in file_name:
                    dev_file = file
                if "testb" in file_name:
                    test_file = file

            # if no test file is found, take any file with 'test' in name
            if not evaluation and test_file is None:
                for file in data_folder.iterdir():
                    file_name = file.name
                    if file_name.endswith(".gz"):
                        continue
                    if "test" in file_name:
                        test_file = file

        log.log(log_level, "Reading data from {}".format(data_folder))
        log.log(log_level, "Train: {}".format(train_file))
        log.log(log_level, "Dev: {}".format(dev_file))
        log.log(log_level, "Test: {}".format(test_file))

        # get train data
        train = TokenLimitedColumnDataset(
            train_file, column_format, tag_to_bioes, in_memory=in_memory,
            min_sentence_length=min_sentence_length, max_sentence_length=max_sentence_length,
            stride=stride
        )

        # read in test file if exists, otherwise sample 10% of train data as test dataset
        if test_file is not None:
            test = TokenLimitedColumnDataset(
                test_file, column_format, tag_to_bioes, in_memory=in_memory,
                min_sentence_length=min_sentence_length, max_sentence_length=max_sentence_length,
                stride=stride
            )
        elif not evaluation:
            train_length = len(train)
            test_size: int = round(train_length / 10)
            splits = random_split(train, [train_length - test_size, test_size])
            train = splits[0]
            test = splits[1]
        else:
            test = Dataset()

        # read in dev file if exists, otherwise sample 10% of train data as dev dataset
        if dev_file is not None:
            dev = TokenLimitedColumnDataset(
                dev_file, column_format, tag_to_bioes, in_memory=in_memory,
                min_sentence_length=min_sentence_length, max_sentence_length=max_sentence_length,
                stride=stride
            )
        elif not evaluation:
            train_length = len(train)
            dev_size: int = round(train_length / 10)
            splits = random_split(train, [train_length - dev_size, dev_size])
            train = splits[0]
            dev = splits[1]
        else:
            dev = Dataset()

        super(TokenLimitedColumnCorpus, self).__init__(train, dev, test, name=data_folder.name)


class TokenLimitedColumnDataset(Dataset):
    def __init__(
        self,
        path_to_column_file: Path,
        column_name_map: Dict[int, str],
        tag_to_bioes: str = None,
        in_memory: bool = True,
        min_sentence_length: int = -1,
        max_sentence_length: int = -1,
        stride: int = 4,
    ):
        """
        :param max_sentence_length: The maximum sequence length. Set to -1 to disable the length check.
        :param stride: The stride (in tokens) with which sequences exceeding the maximum length are split into overlapping windows.
        """
        assert path_to_column_file.exists()

        self.path_to_column_file = path_to_column_file
        self.tag_to_bioes = tag_to_bioes
        self.column_name_map = column_name_map

        # store either Sentence objects in memory, or only file offsets
        self.in_memory = in_memory
        if self.in_memory:
            self.sentences: List[Sentence] = []
        else:
            self.indices: List[int] = []

        self.total_sentence_count: int = 0

        # most data sets have the token text in the first column, if not, pass 'text' as column
        self.text_column: int = 0
        self.start_column: int = -1
        self.end_column: int = -1
        for column in self.column_name_map:
            if column_name_map[column] == "text":
                self.text_column = column
            if column_name_map[column] == "begin" or column_name_map[column] == "start":
                self.start_column = column
            if column_name_map[column] == "end":
                self.end_column = column
        # determine encoding of text file
        encoding = "utf-8"
        try:
            lines: List[str] = open(str(path_to_column_file), encoding="utf-8").read(
                10
            ).strip().split("\n")
        except:
            log.info(
                'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(
                    path_to_column_file
                )
            )
            encoding = "latin1"
        sentence: Sentence = Sentence()
        with open(str(self.path_to_column_file), encoding=encoding) as file:
            skipped_sentences = 0
            for line in file:
                if line.startswith("#"):
                    continue
                # an empty line (ignoring a possible byte-order mark) ends the current sentence
                if line.strip().replace("\ufeff", "") == "":
                    if len(sentence) > 0:
                        sentence.infer_space_after()
                        if self.in_memory:
                            if self.tag_to_bioes is not None:
                                sentence.convert_tag_scheme(
                                    tag_type=self.tag_to_bioes, target_scheme="iobes"
                                )
                            # skip sentences that fall outside the configured length bounds
                            sentence_length = len(sentence)
                            if 0 < max_sentence_length <= sentence_length or 0 < min_sentence_length > sentence_length:
                                skipped_sentences += 1
                            else:
                                self.sentences.append(sentence)
                                self.total_sentence_count += 1
                        else:
                            raise NotImplementedError(
                                "The SequenceLimitedDataset currently only supports in memory operation!")
                    sentence: Sentence = Sentence()
                else:
                    fields: List[str] = re.split(r"\s+", line.strip())
                    token = self.create_token(fields, column_name_map)
                    sentence.add_token(token)
        # handle the final sentence (no trailing blank line). If it exceeds the maximum
        # length in characters, split it into overlapping windows that each start `stride`
        # tokens after the previous one and hold at most max_sentence_length characters.
        if len(sentence.tokens) > 0:
            sentence.infer_space_after()
            if self.in_memory:
                if self.tag_to_bioes is not None:
                    sentence.convert_tag_scheme(
                        tag_type=self.tag_to_bioes, target_scheme="iobes"
                    )
                sentence_length = len(" ".join([t.text for t in sentence.tokens]))
                if 0 < max_sentence_length < sentence_length:
                    skipped_sentences += 1
                    for i in range(0, int(len(sentence) / stride)):
                        split_sentence: Sentence = Sentence()
                        offset = i * stride
                        curr_len = 0
                        for token in sentence.tokens[offset:]:
                            if curr_len + (1 + len(token.text)) <= max_sentence_length:
                                split_sentence.add_token(token)
                                curr_len += len(token.text) + 1
                            else:
                                break
                        self.sentences.append(split_sentence)
                        self.total_sentence_count += 1
                else:
                    self.sentences.append(sentence)
                    self.total_sentence_count += 1
            else:
                raise NotImplementedError(
                    "The SequenceLimitedDataset currently only supports in memory operation!")

        if skipped_sentences > 0:
            log.info(f'Skipped {skipped_sentences} sentences with '
                     f'less than {min_sentence_length} or '
                     f'more than {max_sentence_length} tokens.')

    def create_token(self, fields: List[Union[int, str]], column_name_map: Dict[int, str]):
        if self.start_column != -1:
            token = Token(fields[self.text_column], start_position=int(fields[self.start_column]))
        else:
            token = Token(fields[self.text_column])
        for column in column_name_map:
            if len(fields) > column:
                if column != self.text_column and column != self.start_column and column != self.end_column:
                    token.add_tag(
                        self.column_name_map[column], fields[column]
                    )
        return token

    def __len__(self):
        return self.total_sentence_count

    def __getitem__(self, index: int = 0) -> Sentence:
        if self.in_memory:
            sentence = self.sentences[index]
        else:
            raise NotImplementedError("The SequenceLimitedDataset currently only supports in memory operation!")
        return sentence
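

# Minimal usage sketch, not part of the original module: the folder name "data" and the
# column mapping below are assumptions; point them at your own CoNLL-style files, where
# files containing "train"/"dev"/"test" in their names are picked up automatically.
if __name__ == "__main__":
    columns = {0: "text", 1: "ner"}
    corpus = TokenLimitedColumnCorpus(
        "data",
        column_format=columns,
        tag_to_bioes="ner",
        max_sentence_length=100,
        stride=4,
    )
    print(len(corpus.train), "training sentences")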