-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathall_data.py
72 lines (55 loc) · 2.1 KB
/
all_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import torch as t
from torch.utils.data import Dataset, DataLoader
import pickle
from config import opt
from sklearn.model_selection import train_test_split
import numpy as np
class CoreDataset(Dataset):
    """Map-style dataset that converts parallel per-sample sequences to tensors.

    Every item is assembled from the entries of ``data``, ``labels`` and
    ``masks`` found at the same index.  Depending on the configuration an
    item additionally carries the matching entry of ``X_lengths`` (when
    ``opt.dialog_data_mode`` is truthy) or of ``segs`` (when
    ``opt.sentence_mode`` is anything other than ``'one'``).
    """

    def __init__(self, data, labels, masks, num_labels, opt, segs=None, X_lengths=None):
        """Store the sample sequences and the configuration values used later.

        Args:
            data: Sized, indexable collection; each entry is passed to
                ``torch.tensor``.
            labels: Indexable collection of labels, interpreted according to
                ``opt.data_mode`` (see ``__getitem__``).
            masks: Indexable collection; each entry is passed to
                ``torch.tensor``.
            num_labels: Length of the label vector built when
                ``opt.data_mode`` is not ``'single'``.
            opt: Configuration object exposing ``maxlen``, ``data_mode``,
                ``sentence_mode`` and ``dialog_data_mode``.
            segs: Indexable collection read only when an item needs a fourth
                "segments" element.
            X_lengths: Indexable collection read only when
                ``opt.dialog_data_mode`` is truthy.
        """
        # Parallel per-sample sequences.
        self.data, self.labels, self.masks = data, labels, masks
        self.segs, self.X_lengths = segs, X_lengths
        self.num_data = len(data)
        self.num_labels = num_labels
        # ``data_mode`` and ``sentence_mode`` are snapshotted here, whereas
        # ``dialog_data_mode`` is looked up on ``opt`` at every item access.
        self.opt = opt
        self.maxlen = opt.maxlen  # kept for callers; not used inside this class
        self.mode = opt.data_mode
        self.sentence_mode = opt.sentence_mode

    def _encode_label(self, raw_label):
        """Return the label tensor for one sample according to ``self.mode``."""
        as_array = np.array(raw_label)
        if self.mode == 'single':
            return t.from_numpy(as_array)
        # Otherwise the label holds indices: set those positions to 1 in a
        # zero vector of length ``num_labels``.
        positions = t.LongTensor(as_array)
        return t.zeros(self.num_labels).scatter_(0, positions, 1)

    def __getitem__(self, index):
        """Build the tensors for the sample at ``index``.

        Returns:
            ``(caps, labels, masks, X_lengths)`` when
            ``opt.dialog_data_mode`` is truthy; otherwise
            ``(caps, labels, masks)`` when ``sentence_mode == 'one'``;
            otherwise ``(caps, labels, masks, segs)``.
        """
        sample = t.tensor(self.data[index])
        target = self._encode_label(self.labels[index])
        mask = t.tensor(self.masks[index])

        # The dialog layout takes precedence over the sentence-mode layouts.
        if self.opt.dialog_data_mode:
            return sample, target, mask, t.tensor(self.X_lengths[index])
        if self.sentence_mode == 'one':
            return sample, target, mask
        return sample, target, mask, t.tensor(self.segs[index])

    def __len__(self):
        """Return the number of samples in ``data``."""
        return len(self.data)
def get_dataloader(data, labels, masks, num_labels, opt, segs=None, X_lengths=None,
                   shuffle=False):
    """Wrap the given sequences in a ``CoreDataset`` and return a ``DataLoader``.

    Args:
        data, labels, masks, num_labels, opt, segs, X_lengths: Forwarded
            unchanged to ``CoreDataset``.
        shuffle: Forwarded to ``DataLoader``.  Defaults to ``False``, which
            keeps the samples in their original order; pass ``True`` to
            reshuffle every epoch (e.g. for training).

    Returns:
        A ``torch.utils.data.DataLoader`` over the constructed dataset.  The
        batch size is ``opt.dialog_batch_size`` when ``opt.dialog_data_mode``
        is truthy and ``opt.batch_size`` otherwise.
    """
    dataset = CoreDataset(data, labels, masks, num_labels, opt, segs, X_lengths)
    # Dialog data uses its own batch-size setting.
    batch_size = opt.dialog_batch_size if opt.dialog_data_mode else opt.batch_size
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
######################################################################
if __name__ == '__main__':
    # Manual smoke run: load the pickled records named by the config and
    # perform a default train/test split on them.
    # NOTE(review): unpickling can execute arbitrary code embedded in the
    # file, so ``opt.data_path`` must only ever point at trusted data.
    with open(opt.data_path, 'rb') as pickle_file:
        records = pickle.load(pickle_file)
    # Each record is unpacked as a 3-tuple; transpose them into three
    # parallel tuples.
    X, y, entities = zip(*records)
    # The split is unseeded, so it differs between runs; the results are not
    # used any further in this script.
    X_train, X_test, y_train, y_test = train_test_split(X, y)