-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata_utils.py
173 lines (154 loc) · 7.99 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import pandas as pd
import json
import pickle
import numpy as np
from utils import ROOT_DIR
def load_cola():
    """Load the CoLA (grammatical-acceptability) train/dev splits.

    Each line of the tab-separated files is "<sentence>\t<label>".
    The dev split is used as the test set.

    Returns:
        (train_sentences, train_labels, test_sentences, test_labels),
        labels as ints (1 = grammatical, 0 = not, per the int column).
    """
    train_sentences, train_label, test_sentences, test_label = [], [], [], []
    # Context managers close the handles; the original opened both files
    # without `with` and leaked them.
    with open(f"{ROOT_DIR}/glue/CoLA/train.txt", "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip('\n').split('\t')
            train_sentences.append(parts[0])
            train_label.append(int(parts[1]))
    with open(f"{ROOT_DIR}/glue/CoLA/dev.txt", "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip('\n').split('\t')
            test_sentences.append(parts[0])
            test_label.append(int(parts[1]))
    return train_sentences, train_label, test_sentences, test_label
def load_sst2():
    """Load the binary SST-2 sentiment dataset (stsa.binary files).

    Returns:
        (train_sentences, train_labels, test_sentences, test_labels),
        labels as ints parsed from the first character of each line.
    """

    def parse(lines):
        """Split raw lines into (sentences, labels); char 0 holds the label digit."""
        labels = [int(ln[0]) for ln in lines]
        sentences = [ln[2:].strip() for ln in lines]
        return sentences, labels

    with open(f"{ROOT_DIR}/data/sst2/stsa.binary.train", "r", encoding="utf-8") as f:
        train_sentences, train_labels = parse(f.readlines())
    with open(f"{ROOT_DIR}/data/sst2/stsa.binary.test", "r", encoding="utf-8") as f:
        test_sentences, test_labels = parse(f.readlines())
    return train_sentences, train_labels, test_sentences, test_labels
def load_agnews():
    """Load the AG News topic-classification train/test CSVs.

    Returns:
        (train_sentences, train_labels, test_sentences, test_labels) where
        sentences are "Title. Description" strings and labels are shifted
        from 1..4 down to 0..3.
    """

    def clean(text):
        # Undo a few HTML-entity artifacts left in the raw CSV text.
        return (text.replace(' #39;s', '\'s')
                    .replace(' quot;', "\"")
                    .replace('\\', " ")
                    .replace(' #39;ll', "'ll"))

    def prepare(frame):
        sentences = [clean(item) for item in frame['Title'] + ". " + frame['Description']]
        labels = [l - 1 for l in frame['Class Index']]  # 1, 2, 3, 4 -> 0, 1, 2, 3
        return sentences, labels

    train_sentences, train_labels = prepare(pd.read_csv(f'{ROOT_DIR}/data/agnews/train.csv'))
    test_sentences, test_labels = prepare(pd.read_csv(f'{ROOT_DIR}/data/agnews/test.csv'))
    return train_sentences, train_labels, test_sentences, test_labels
def load_trec():
    """Load the TREC question-classification train/test splits.

    Each line is "<COARSE>:<fine> <question words...>"; only the coarse
    label is kept and mapped to an int.

    Returns:
        (train_sentences, train_labels, test_sentences, test_labels).
    """
    inv_label_dict = {'NUM': 0, 'LOC': 1, 'HUM': 2, 'DESC': 3, 'ENTY': 4, 'ABBR': 5}

    def _read_split(path):
        # One pass over a split file; the original duplicated this loop
        # verbatim for train and test.
        sentences, labels = [], []
        with open(path, 'r', encoding="utf-8") as f:
            for line in f:
                tokens = line.split(' ')
                labels.append(inv_label_dict[tokens[0].split(':')[0]])
                sentence = ' '.join(tokens[1:]).strip()
                # basic detokenization cleanup
                sentence = (sentence.replace(" 's", "'s").replace('`` ', '"')
                            .replace(" ''", '"').replace(' ?', '?').replace(' ,', ','))
                sentences.append(sentence)
        return sentences, labels

    train_sentences, train_labels = _read_split(f'{ROOT_DIR}/data/trec/train.txt')
    test_sentences, test_labels = _read_split(f'{ROOT_DIR}/data/trec/test.txt')
    return train_sentences, train_labels, test_sentences, test_labels
def load_rte():
    """Load the RTE entailment dataset from jsonl files.

    Each example becomes "<premise>\nquestion: <hypothesis> True or False?"
    with answer 1 for 'entailment' and 0 for 'not_entailment'. The val
    split serves as the test set.

    Returns:
        (train_questions, train_answers, test_questions, test_answers).

    Raises:
        ValueError: on an unrecognized label. (The original called
        exit('answer'), which raises SystemExit and is unsuitable for
        library code.)
    """
    label_map = {'not_entailment': 0, 'entailment': 1}

    def _read_split(path):
        questions, answers = [], []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                myjson = json.loads(line)
                label = myjson['label']
                if label not in label_map:
                    raise ValueError(f"unexpected RTE label: {label!r}")
                answers.append(label_map[label])
                questions.append(myjson['premise'] + '\n' + 'question: '
                                 + myjson['hypothesis'] + ' True or False?')
        return questions, answers

    train_questions, train_answers = _read_split(f"{ROOT_DIR}/data/rte/train.jsonl")
    test_questions, test_answers = _read_split(f"{ROOT_DIR}/data/rte/val.jsonl")
    return train_questions, train_answers, test_questions, test_answers
def load_dataset(params):
    """Load the dataset named by params['dataset'] and configure prompting.

    Side effects: mutates `params` in place with prompt_prefix, q_prefix,
    a_prefix, label_dict, inv_label_dict, task_format and
    num_tokens_to_predict (plus num_user_input for rte).

    Args:
        params: experiment settings dict; 'dataset' selects the loader.

    Returns:
        (orig_train_sentences, orig_train_labels,
         orig_test_sentences, orig_test_labels).

    Raises:
        NotImplementedError: if the dataset name is not recognized.
    """
    name = params['dataset']
    if name == 'sst2':
        data = load_sst2()
        params.update({
            'prompt_prefix': "",
            'q_prefix': "Review: ",
            'a_prefix': "Sentiment: ",
            'label_dict': {0: ['Negative'], 1: ['Positive']},
            'inv_label_dict': {'Negative': 0, 'Positive': 1},
            'task_format': 'classification',
            'num_tokens_to_predict': 1,
        })
    elif name == 'agnews':
        data = load_agnews()
        params.update({
            'prompt_prefix': "Classify the news articles into the categories of World, Sports, Business, and Technology.\n\n",
            'q_prefix': "Article: ",
            'a_prefix': "Answer: ",
            'label_dict': {0: ['World'], 1: ['Sports'], 2: ['Business'], 3: ['Technology', 'Science']},
            # note: source labels start at 1; both surface forms map to class 3
            'inv_label_dict': {'World': 0, 'Sports': 1, 'Business': 2, 'Technology': 3, 'Science': 3},
            'task_format': 'classification',
            'num_tokens_to_predict': 1,
        })
    elif name == 'trec':
        data = load_trec()
        params.update({
            'prompt_prefix': "Classify the questions based on whether their answer type is a Number, Location, Person, Description, Entity, or Abbreviation.\n\n",
            'q_prefix': "Question: ",
            'a_prefix': "Answer Type: ",
            'label_dict': {0: ['Number'], 1: ['Location'], 2: ['Person'], 3: ['Description'], 4: ['Entity'], 5: ['Ab']},
            'inv_label_dict': {'Number': 0, 'Location': 1, 'Person': 2, 'Description': 3, 'Entity': 4, 'Ab': 5},
            'task_format': 'classification',
            'num_tokens_to_predict': 1,
        })
    elif name == 'rte':
        data = load_rte()
        params.update({
            'prompt_prefix': "",
            'q_prefix': " ",
            'a_prefix': "answer: ",
            'label_dict': {0: ['False'], 1: ['True']},
            'inv_label_dict': {'False': 0, 'True': 1},
            'num_user_input': 2,
            'task_format': 'classification',
            'num_tokens_to_predict': 1,
        })
    elif name == 'cola':
        data = load_cola()
        params.update({
            'prompt_prefix': "",
            'q_prefix': "Sentence: ",
            'a_prefix': "Hypothesis: the sentence is grammatical, true or false? ",
            'label_dict': {0: ['false'], 1: ['true']},
            'inv_label_dict': {'false': 0, 'true': 1},
            'task_format': 'classification',
            'num_tokens_to_predict': 1,
        })
    else:
        raise NotImplementedError
    orig_train_sentences, orig_train_labels, orig_test_sentences, orig_test_labels = data
    return orig_train_sentences, orig_train_labels, orig_test_sentences, orig_test_labels