-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_loader.py
141 lines (102 loc) · 4.18 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
data_loader.py: provides functions to load the provided data into memory.
This code can load datasets with the following package structure:
    DATASET_NAME/
    |
    |_ README.md
    |
    |_ eval/
    |  |
    |  |_ DATASET_NAME__TEST.csv
    |  |
    |  |_ DATASET_NAME__DEV.csv (optional)
    |
    |_ training/
       |
       |_ DATASET_NAME__FULL.csv
All provided datasets already have this format.
"""
# ======================================================================================================================
#
# CODE SETUP
#
# ======================================================================================================================
# ====>> Python Native Imports <<====
import os
import sys
import csv
csv.field_size_limit(sys.maxsize) # For the Sougou dataset
# ====>> Authorship Info <<====
__author__ = ["Ed Collins", "Nikolai Rozanov", "Bingbing Zhang"]
__licence__ = "MIT"
__version__ = "0.0.1"
# ======================================================================================================================
# ======================================================================================================================
#
# FUNCTIONS
#
# ======================================================================================================================
def load_two_column_csv_data(fullPath):
    """
    Loads data from a .csv file with precisely two columns and returns them as two lists.

    Blank rows are skipped. Every other row must contain exactly two fields.

    :param fullPath        : the full path to the .csv file to load.
    :type fullPath         : str
    :return                : list of column 1 from the .csv file, list of column 2 from the .csv file
    :raises AssertionError : if a non-blank row does not have exactly two columns.
    """
    listOne, listTwo = [], []

    # newline="" leaves line-ending handling to the csv module, as its documentation requires. Without it
    # the file object's newline translation alters newlines embedded inside quoted fields.
    with open(fullPath, "r", newline="") as f:
        for line in csv.reader(f):
            if not line:
                # csv.reader yields an empty list for a blank line.
                continue
            if len(line) != 2:
                # Raised explicitly (instead of via an assert statement) so the check still runs under
                # "python -O", while keeping the exception type that callers already see.
                raise AssertionError("This function is only for two column .csv data.")
            listOne.append(line[0])
            listTwo.append(line[1])

    return listOne, listTwo
def _load_all(trainPath, validPath, testPath):
    """
    Reads every available split of a dataset from disk.

    The validation split is optional: it is only read (and only appears in the result) when validPath
    exists on disk.

    :param trainPath : path to training data.
    :type trainPath  : str
    :param validPath : path to the validation data; the file may be absent.
    :type validPath  : str
    :param testPath  : path to testing data.
    :type testPath   : str
    :return          : dictionary with keys "TRAIN", "TEST" and, when present, "VALID", each mapped to the
                       (column 1 list, column 2 list) tuple loaded from the corresponding file.
    """
    # Decide on the validation split first, then read the files in TRAIN, VALID, TEST order.
    splits = [("TRAIN", trainPath)]
    if os.path.exists(validPath):
        splits.append(("VALID", validPath))
    splits.append(("TEST", testPath))

    return {splitName: load_two_column_csv_data(splitPath) for splitName, splitPath in splits}
def load_dataset(datasetName):
    """
    Loads a dataset into a dictionary. Note that the dataset directory must have the following format:

        DATASET_NAME/
        |
        |_ README.md
        |
        |_ eval/
        |  |
        |  |_ DATASET_NAME__TEST.csv
        |  |
        |  |_ DATASET_NAME__DEV.csv (optional)
        |
        |_ training/
           |
           |_ DATASET_NAME__FULL.csv

    A bare dataset name is resolved relative to the CURRENT WORKING DIRECTORY. A path to the dataset
    directory (relative or absolute, with or without trailing slashes) is also accepted; the last path
    component is then used as DATASET_NAME for the file names.

    :param datasetName : the name of the dataset to load (which is the same name as the top level
                         directory), or a path to that directory.
    :type datasetName  : str
    :return            : a dictionary containing train, valid (optional) and test sets. Keys are the set
                         name, values a tuple of lists, the first one being sentences and the second labels.
    """
    # Only trailing slashes are removed: stripping both ends would turn an absolute path into a relative one.
    datasetDir = datasetName.rstrip("/")

    # The .csv files are prefixed with the directory's own name, not with the whole path leading to it.
    baseName = os.path.basename(datasetDir)

    trainPath = os.path.join(datasetDir, "training", baseName + "__FULL.csv")
    validPath = os.path.join(datasetDir, "eval", baseName + "__DEV.csv")
    testPath = os.path.join(datasetDir, "eval", baseName + "__TEST.csv")

    return _load_all(trainPath, validPath, testPath)
# ======================================================================================================================