# Preprocessing.py
import gc

import joblib
import keras
import numpy as np
from scipy import sparse

from deep_semantic_similarity_model import create_model
import utils
# Preprocessed training examples, serialized by an earlier step.
train = joblib.load("train.pkl")
encoder = utils.ClaimEncoder()

# FEVER training data: claims, their labels, and the mapping from each
# claim to the Wikipedia articles that serve as its evidence.
claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")
def stack_uneven(arrays, fill_value=0.):
    '''
    Stacks a list of numpy arrays into a single array, even if they are
    different sizes, padding with `fill_value` where needed.

    Args:
        arrays: list of np arrays of various sizes
            (must be same rank, but not necessarily same size)
        fill_value (float, optional): value used to pad the smaller
            arrays out to the maximum size. Defaults to 0.

    Returns:
        np.ndarray of shape (len(arrays),) + the elementwise maximum
        of the input shapes
    '''
    sizes = [a.shape for a in arrays]
    max_sizes = np.max(list(zip(*sizes)), -1)
    # The resulting array is stacked on the first dimension
    result = np.full((len(arrays),) + tuple(max_sizes), fill_value)
    for i, a in enumerate(arrays):
        # The shape of this array `a`, turned into slices
        slices = tuple(slice(0, s) for s in sizes[i])
        # Overwrite a block slice of `result` with this array `a`
        result[i][slices] = a
    return result
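
# A minimal sketch of stack_uneven's padding behaviour (illustrative only,
# not part of the original script): stacking a (2, 3) array with a (1, 2)
# array yields a (2, 2, 3) result, where positions not covered by an input
# hold `fill_value`.
#
#   a = np.ones((2, 3))
#   b = np.ones((1, 2))
#   stacked = stack_uneven([a, b])
#   assert stacked.shape == (2, 2, 3)
#   assert stacked[1, 0, 2] == 0.0  # padded entry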
class DataGenerator(keras.utils.Sequence):
    """
    Generates one batch per claim for training our model: each item pairs a
    single claim with all of its candidate evidence documents.
    """
    def __init__(self, data, batch_size=32, split=None):
        """
        Stores the dataset and creates an indices array, which allows a
        subset of the data to be selected (via `split`) and the ordering
        to be randomized between epochs.
        """
        if split:
            self.indices = split
        else:
            self.indices = list(range(len(data)))
        self.data = data
        self.encoder = utils.ClaimEncoder()
        self.batch_size = batch_size
        _, _, _, _, self.claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")
    def __len__(self):
        # One batch per index, so the epoch length is the number of indices.
        return len(self.indices)

    def __getitem__(self, index):
        return self.get_item(index)
    def get_item(self, index):
        d = self.data[self.indices[index]]
        # Token-encode the claim into a dense matrix.
        claim = sparse.vstack(self.encoder.tokenize_claim(d['claim'])).toarray()
        evidences = []
        ys = []
        for e in d['evidence']:
            # Recover the article name from its Wikipedia URL and encode it.
            processed = utils.preprocess_article_name(e.split("http://wikipedia.org/wiki/")[1])
            evidences.append(sparse.vstack(self.encoder.tokenize_claim(processed)).toarray())
            # Label is 1 if this article is actual evidence for the claim, else 0.
            if processed in self.claim_to_article[d['claim']]:
                ys.append(1)
            else:
                ys.append(0)
        # Pad the variable-length evidence encodings into one stacked array.
        evidences = stack_uneven(evidences)
        gc.collect()
        # Tile the claim so there is one copy per candidate evidence document.
        return {"claim": np.repeat(claim[np.newaxis, :, :], len(evidences), axis=0),
                "document": evidences}, np.array(ys)
    def on_epoch_end(self):
        # Shuffling between epochs is currently disabled.
        # np.random.shuffle(self.indices)
        pass
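
# A usage sketch: `split` accepts an explicit list of indices, so a
# train/validation split can be expressed as two generators over the same
# data. The 90/10 cut below is illustrative, not from the original code.
#
#   cut = int(0.9 * len(train))
#   train_gen = DataGenerator(train, split=list(range(cut)))
#   val_gen = DataGenerator(train, split=list(range(cut, len(train))))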
gen = DataGenerator(train)
model = create_model()
gc.collect()

# Single-worker, single-process training: each generator item is already a
# full batch (one claim with all of its candidate documents).
model.fit_generator(gen, workers=1, max_queue_size=10, use_multiprocessing=False)
model.save("preprocessed_model.h5")
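
# The saved model can be reloaded later for inference. If create_model()
# registers custom layers, they must be supplied via custom_objects; the
# exact names depend on deep_semantic_similarity_model, so this is an
# assumption:
#
#   model = keras.models.load_model("preprocessed_model.h5")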