-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess.py
249 lines (216 loc) · 11.7 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
from constants.disco_eval_constants import DiscoEvalConstants
from constants.glue_constants import GlueConstants
import constants.glue_constants as glue_constants
import typing
from constants.base_constants import *
# Module-level DiscoEvalConstants instance; preprocess_function_n_inputs reads its TEXT_COLUMN_NAMES.
dec = DiscoEvalConstants()
def preprocess_function_n_inputs(
        examples: typing.Dict[str, typing.Any],
        label_names: typing.Dict[str, str],
        task_name: str,
        label_column_name: str,
) -> typing.Dict[str, typing.List[str]]:
    """
    Pre-processes batches of examples with an arbitrary number of textual inputs for an encoder-decoder model.

    Every column of the batch other than the label column is treated as a text column. For each example, the value
    of the i-th text column is rendered as "<dec.TEXT_COLUMN_NAMES[i]>: <value>", the rendered columns are joined
    with tab characters, and the result is prefixed with "<task_name>: ".

    The input batch is not modified.

    Args:
        examples: A batch in the form of a dictionary mapping, mapping column names to their respective values.
        label_names: A dictionary mapping from the label to its string representation. Labels are looked up by
            their `str(...)` form, so the keys are expected to be strings (e.g. "0", "1").
        task_name: The name of the task (i.e. SPArxiv/RST/etc.).
        label_column_name: Name of the column within the input dictionary that contains the labels.

    Returns:
        A dictionary with two keys: 'processed_inputs' (the formatted input strings) and 'processed_outputs' (the
        label names), each holding one entry per example in the batch.

    Raises:
        KeyError: If `label_column_name` is missing from `examples`, or a stringified label is not a key of
            `label_names`.
    """
    outputs = [label_names[str(label)] for label in examples[label_column_name]]

    # Build a filtered copy instead of popping the label column, so the caller's batch is left untouched.
    text_columns = [
        column for name, column in dict(examples).items() if name != label_column_name
    ]

    # zip(*columns) transposes the column-major batch into one tuple of texts per example.
    inputs = [
        f"{task_name}: " + "\t".join(
            f"{dec.TEXT_COLUMN_NAMES[i]}: {sent}" for i, sent in enumerate(row)
        )
        for row in zip(*text_columns)
    ]
    return {'processed_inputs': inputs, 'processed_outputs': outputs}
def preprocess_function_one_input(
        examples: typing.Dict[str, typing.Any],
        label_names: typing.Dict[int, str],
        prefix: str,
        text_column_name: str = GlueConstants.SENTENCE,
        label_column_name: str = TokenizedExampleColumnNames.LABEL.value,
) -> typing.Dict[str, typing.List[str]]:
    """
    Pre-process a batch of single-sentence examples for an encoder-decoder model.

    Each text is prepended with `prefix` (no separator is inserted), and each label is replaced by its entry in
    `label_names`.

    Args:
        examples: A batch given as a mapping from column names to the list of values in that column.
        label_names: Mapping from a label value, as stored in the label column, to its string name.
        prefix: Task-specific string placed directly in front of every text.
        text_column_name: Name of the column in `examples` holding the input text.
        label_column_name: Name of the column in `examples` holding the labels.

    Returns:
        A dictionary with two keys: 'processed_inputs' (the prefixed texts) and 'processed_outputs' (the label
        names), each holding one entry per example in the batch.
    """
    return {
        'processed_inputs': [f"{prefix}{text}" for text in examples[text_column_name]],
        'processed_outputs': [label_names[label] for label in examples[label_column_name]],
    }
def preprocess_function_two_inputs(
        examples: typing.Dict[str, typing.Any],
        label_names: typing.Dict[int, str],
        prefix_1: str,
        prefix_2: str,
        text_column_name_1: str,
        text_column_name_2: str,
        label_column_name: str,
        is_regression: bool = False,
) -> typing.Dict[str, typing.List[str]]:
    """
    Pre-process a batch of sentence-pair examples for an encoder-decoder model.

    For every example the two texts are each prepended with their own prefix and then joined with a single space,
    i.e. "<prefix_1><text_1> <prefix_2><text_2>".

    Args:
        examples: A batch given as a mapping from column names to the list of values in that column.
        label_names: Mapping from a label value, as stored in the label column, to its string name. Not consulted
            when `is_regression` is true.
        prefix_1: Task-specific string placed directly in front of the first text.
        prefix_2: String placed directly in front of the second text.
        text_column_name_1: Name of the column in `examples` holding the first text.
        text_column_name_2: Name of the column in `examples` holding the second text.
        label_column_name: Name of the column in `examples` holding the labels.
        is_regression: Whether the task is a regression task or not.

    Returns:
        A dictionary with two keys: 'processed_inputs' (the combined texts) and 'processed_outputs' (the target
        strings), each holding one entry per example in the batch.
    """
    first_texts = examples[text_column_name_1]
    second_texts = examples[text_column_name_2]
    combined = [
        f"{prefix_1}{first} {prefix_2}{second}"
        for first, second in zip(first_texts, second_texts)
    ]

    labels = examples[label_column_name]
    if not is_regression:
        # Classification: the target is the name of the label.
        targets = [label_names[label] for label in labels]
    else:
        # Regression: the target is the value rounded to one decimal place, rendered as text.
        targets = [str(round(value, 1)) for value in labels]

    return {'processed_inputs': combined, 'processed_outputs': targets}
def create_preprocess_function_one_input(
        label_names: typing.Dict[int, str],
        prefix: str,
        text_column_name,
        label_column_name,
) -> typing.Callable[[typing.Dict[str, typing.Any]], typing.Dict[str, typing.List[str]]]:
    """
    Build a batch pre-processing function for single-sentence tasks.

    The returned function takes only a batch of examples and forwards it, together with the arguments given here,
    to `preprocess_function_one_input`.

    Args:
        label_names: Mapping from a label value to its string name.
        prefix: Task-specific string placed in front of every text.
        text_column_name: Name of the batch column holding the input text.
        label_column_name: Name of the batch column holding the labels.

    Returns:
        A function mapping a batch of examples to a dictionary of processed inputs and outputs.
    """
    # Everything except the batch itself is fixed at creation time.
    fixed_arguments = dict(
        label_names=label_names,
        prefix=prefix,
        text_column_name=text_column_name,
        label_column_name=label_column_name,
    )

    def preprocess_function(examples: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.List[str]]:
        return preprocess_function_one_input(examples=examples, **fixed_arguments)

    return preprocess_function
def create_preprocess_function_two_inputs(
        label_names: typing.Dict[int, str],
        prefix_1: str,
        prefix_2: str,
        text_column_name_1,
        text_column_name_2,
        label_column_name,
        is_regression: bool = False,
) -> typing.Callable[[typing.Dict[str, typing.Any]], typing.Dict[str, typing.List[str]]]:
    """
    Build a batch pre-processing function for sentence-pair tasks.

    The returned function takes only a batch of examples and forwards it, together with the arguments given here,
    to `preprocess_function_two_inputs`.

    Args:
        label_names: Mapping from a label value to its string name.
        prefix_1: Task-specific string placed in front of the first text.
        prefix_2: String placed in front of the second text.
        text_column_name_1: Name of the batch column holding the first text.
        text_column_name_2: Name of the batch column holding the second text.
        label_column_name: Name of the batch column holding the labels.
        is_regression: Whether the task is a regression task or not.

    Returns:
        A function mapping a batch of examples to a dictionary of processed inputs and outputs.
    """
    # Everything except the batch itself is fixed at creation time.
    fixed_arguments = dict(
        label_names=label_names,
        prefix_1=prefix_1,
        prefix_2=prefix_2,
        text_column_name_1=text_column_name_1,
        text_column_name_2=text_column_name_2,
        label_column_name=label_column_name,
        is_regression=is_regression,
    )

    def preprocess_function(examples: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.List[str]]:
        return preprocess_function_two_inputs(examples=examples, **fixed_arguments)

    return preprocess_function
def create_preprocess_function_n_inputs(
        label_names: typing.Dict[int, str],
        task_name: str,
        label_column_name,
) -> typing.Callable[[typing.Dict[str, typing.Any]], typing.Dict[str, typing.List[str]]]:
    """
    Build a batch pre-processing function for tasks with an arbitrary number of textual inputs.

    The returned function takes only a batch of examples and forwards it, together with the arguments given here,
    to `preprocess_function_n_inputs`.

    Args:
        label_names: Mapping from a label to its string name, passed through unchanged.
        task_name: The name of the task, passed through unchanged.
        label_column_name: Name of the batch column holding the labels.

    Returns:
        A function mapping a batch of examples to a dictionary of processed inputs and outputs.
    """
    # Everything except the batch itself is fixed at creation time.
    fixed_arguments = dict(
        label_names=label_names,
        task_name=task_name,
        label_column_name=label_column_name,
    )

    def preprocess_function(examples: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.List[str]]:
        return preprocess_function_n_inputs(examples=examples, **fixed_arguments)

    return preprocess_function
def create_preprocess_function(
        dataset_info: typing.Union[glue_constants.TaskConfigOneInput, glue_constants.TaskConfigTwoInput],
        dataset_name: str,
        is_regression: bool = False,
) -> typing.Callable[
    [typing.Dict[str, typing.Any]],
    typing.Dict[str, typing.List[str]]
]:
    """
    Create a function to pre-process the examples within the specified dataset.

    Preprocessing often involves the following steps:
        1. Adding prefixes to the input/s (still represented as strings, yet to be tokenized)
        2. Converting the label from a numerical value to the predetermined string equivalent. For example, in SST2,
            the label 0 corresponds with 'negative' and the label '1' corresponds with 'positive'.

    Args:
        dataset_info: The task configuration object. `LABELS` and `LABEL_COLUMN_NAME` are always read from it; the
            prefix and text-column attributes are read for GLUE tasks according to its type
            (`PREFIX`/`TEXT_COLUMN_NAME` for a one-input config, `PREFIX_1`/`PREFIX_2`/`TEXT_COLUMN_NAME_1`/
            `TEXT_COLUMN_NAME_2` for a two-input config).
        dataset_name: The name of the dataset that is processed by this function. Must be listed in either
            `GlueConstants.TASKS` or `DiscoEvalConstants.TASKS`.
        is_regression: Whether the task is a regression task or not. Only consulted for two-input GLUE tasks,
            where the 'stsb' dataset is always treated as regression regardless of this flag.

    Returns:
        A function that takes in a batch of input examples, and returns a dictionary with the processed inputs and
        labels.

    Raises:
        RuntimeError: If `dataset_name` is a GLUE task but `dataset_info` is neither a one-input nor a two-input
            task configuration, or if `dataset_name` belongs to none of the supported benchmarks.
    """
    label_names = dataset_info.LABELS
    label_column_name = dataset_info.LABEL_COLUMN_NAME

    if dataset_name in GlueConstants.TASKS:  # This refers to the GLUE and SUPERGLUE benchmarks.
        if isinstance(dataset_info, glue_constants.TaskConfigOneInput):
            return create_preprocess_function_one_input(
                label_names=label_names,
                label_column_name=label_column_name,
                prefix=dataset_info.PREFIX,
                text_column_name=dataset_info.TEXT_COLUMN_NAME,
            )
        if isinstance(dataset_info, glue_constants.TaskConfigTwoInput):
            return create_preprocess_function_two_inputs(
                label_names=label_names,
                label_column_name=label_column_name,
                prefix_1=dataset_info.PREFIX_1,
                prefix_2=dataset_info.PREFIX_2,
                text_column_name_1=dataset_info.TEXT_COLUMN_NAME_1,
                text_column_name_2=dataset_info.TEXT_COLUMN_NAME_2,
                is_regression=(is_regression or dataset_name == 'stsb')
            )
        raise RuntimeError(
            "Unsupported prefix structure. Must contain either `prefix` for single input tasks or `prefix_1` and "
            "`prefix_2` for two input tasks"
        )

    if dataset_name in DiscoEvalConstants.TASKS:  # This refers to the DiscoEval benchmark.
        return create_preprocess_function_n_inputs(
            label_names=label_names,
            task_name=dataset_name,
            label_column_name=label_column_name,
        )

    # Fail loudly instead of falling off the end and implicitly returning None, which would only surface later
    # (and far from the cause) when the caller tries to use the result as a function.
    raise RuntimeError(
        f"Unsupported dataset name '{dataset_name}'. Expected a task listed in GlueConstants.TASKS or "
        f"DiscoEvalConstants.TASKS."
    )