"""
This file is based on DeepConvSep.
Copyright (c) 2014-2017 Marius Miron <miron.marius at gmail.com>
DeepConvSep is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
DeepConvSep is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with DeepConvSep. If not, see <http://www.gnu.org/licenses/>.
"""
import numpy as np
import scipy
import os, sys
import util
import sklearn
from sklearn.model_selection import train_test_split


class MyDataset(object):
"""
The class to load data in chunks and prepare batches for training neural networks
Parameters
----------
feature_dir : string
The path for the folder from where to load the input data to the network
batch_size : int, optional
The number of examples in a batch
time_context : int, optional
The time context modeled by the network.
The data files are split into segments of this size
step : int, optional
The number of stepping frames between adjacent segments
floatX: dtype
Type of the arrays for the output
"""

    def __init__(self, feature_dir=None, batch_size=32, time_context=100, step=25,
                 suffix_in='_mel_', suffix_out='_label_', floatX=np.float32, train_percent=1.):
        self.batch_size = batch_size
        self.floatX = floatX
        self.suffix_in = suffix_in
        self.suffix_out = suffix_out
        self.time_context = int(time_context)
        # if the requested step is larger than the time context, fall back to half the context
        if step > self.time_context:
            self.step = int(0.5 * self.time_context)
        else:
            self.step = step
        # clamp the training fraction to the interval [0.1, 1.0]
        self.train_percent = np.maximum(0.1, np.minimum(1., train_percent))
        if feature_dir is not None:
            self.initDir(feature_dir)

    def getNumInstances(self, infile, time_context=100, step=25):
        """
        For a single .data file, compute the number of examples of size 'time_context' that can be created.
        """
        shape = util.get_shape(infile.replace('.data', '.shape'))
        length_file = float(shape[0])
        return np.maximum(1, int(np.ceil((length_file - time_context) / step)))
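
    # Worked example (hypothetical numbers): a feature file with 1000 frames,
    # time_context=100 and step=25 yields ceil((1000 - 100) / 25) = 36 segments,
    # so getNumInstances reports 36 training examples for that file.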

    def getFeatureSize(self, infile):
        """
        For a single .data file, return the number of features, e.g. the number of spectrogram bins.
        """
        shape = util.get_shape(infile.replace('.data', '.shape'))
        return shape[1]

    def initDir(self, feature_dir):
        assert os.path.isdir(feature_dir), "path to feature directory does not exist"
        # list of .data file names that contain the features and have a matching label file
        self.file_list = [f for f in os.listdir(feature_dir) if f.endswith(self.suffix_in + '.data') and
                          os.path.isfile(os.path.join(feature_dir, f.replace(self.suffix_in, self.suffix_out)))]
        self.total_files = len(self.file_list)
        assert self.total_files > 0, "there are no feature files in the input directory"
        self.feature_dir = feature_dir
        # cumulative number of training examples created for each file;
        # total_noinstances[i] is the index of the first example of file i
        self.total_noinstances = np.cumsum(np.array([0] + [self.getNumInstances(os.path.join(self.feature_dir, infile),
                                                                                time_context=self.time_context,
                                                                                step=self.step)
                                                           for infile in self.file_list], dtype=int))
        self.total_points = self.total_noinstances[-1]
        # reduce the batch size if there are fewer points than the requested batch size
        self.batch_size = np.minimum(self.batch_size, self.total_points)
        # how many batches fit in the dataset
        self.iteration_size = int(np.floor(self.total_points / self.batch_size))
        self.iteration_step = -1
        # feature size (last dimension of the output)
        self.feature_size = self.getFeatureSize(infile=os.path.join(self.feature_dir, self.file_list[0]))
        # allocate the output arrays
        self.features = np.zeros((self.total_points, self.time_context, self.feature_size), dtype=self.floatX)
        self.labels = np.zeros((self.total_points, 1), dtype=self.floatX)
        # fetch all data from disk
        for file_id in range(self.total_files):
            self.fetchFile(file_id)
        self.shuffleBatches()
        if self.train_percent < 1.:
            # train_test_split returns (train, test); with test_size=train_percent the "test"
            # partition holds train_percent of the data and is kept as the training set,
            # while the remaining fraction becomes the validation set
            self.features_valid, self.features, self.labels_valid, self.labels = train_test_split(
                self.features, self.labels, test_size=self.train_percent, random_state=42)
            self.total_points = len(self.features)
            self.iteration_size = int(np.floor(self.total_points / self.batch_size))
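
    # After initDir the dataset is fully in memory: self.features has shape
    # (total_points, time_context, feature_size) and self.labels has shape (total_points, 1).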

    def shuffleBatches(self):
        idxrand = np.random.permutation(self.total_points)
        self.features = self.features[idxrand]
        self.labels = self.labels[idxrand]

    def fetchFile(self, file_id):
        # load the feature and label tensors for this file
        spec = util.loadTensor(out_path=os.path.join(self.feature_dir, self.file_list[file_id]))
        lab = util.loadTensor(out_path=os.path.join(self.feature_dir,
                                                    self.file_list[file_id].replace(self.suffix_in, self.suffix_out)))
        # the examples of this file are stored in self.features starting at this index
        idx_start = self.total_noinstances[file_id]
        # and ending at this index
        idx_end = self.total_noinstances[file_id + 1]
        # copy each block of size (time_context, feature_size) into self.features;
        # this assumes every file has at least time_context frames
        idx = 0    # index of the current block
        start = 0  # starting frame of the current block
        while idx < (idx_end - idx_start):
            self.features[idx_start + idx] = spec[start:start + self.time_context]
            start = start + self.step
            idx = idx + 1
        # all examples of a file share the same label
        self.labels[idx_start:idx_end] = lab[0]
        spec = None
        lab = None

    def getData(self):
        assert self.total_points > 0, "no data points in dataset"
        if self.train_percent < 1.:
            return self.features, self.labels, self.features_valid, self.labels_valid
        else:
            # without a validation split, the training data doubles as the validation data
            return self.features, self.labels, self.features, self.labels

    def getValidation(self):
        assert self.train_percent < 1., "no validation examples; decrease train_percent"
        return self.features_valid, self.labels_valid

    def iterate(self):
        self.iteration_step += 1
        # restart and reshuffle once every batch has been served
        if self.iteration_step == self.iteration_size:
            self.iteration_step = 0
            self.shuffleBatches()
        # one-hot encode the labels of the current batch
        y = self.labels[self.iteration_step * self.batch_size:(self.iteration_step + 1) * self.batch_size]
        categorical = np.zeros((self.batch_size, int(self.labels.max() + 1)))
        y = np.array(y, dtype='int').ravel()
        categorical[np.arange(self.batch_size), y] = 1
        # add a channel axis and swap to shape (batch_size, feature_size, time_context, 1)
        features = self.features[self.iteration_step * self.batch_size:(self.iteration_step + 1) * self.batch_size, np.newaxis]
        features = np.swapaxes(features, 1, 3)
        return (features, categorical)

    def __len__(self):
        return self.iteration_size

    # convenience aliases: each of these returns the next batch
    def __call__(self):
        return self.iterate()

    def __iter__(self):
        return self.iterate()

    def next(self):
        return self.iterate()

    def batches(self):
        return self.iterate()
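

# Minimal usage sketch (not part of the original module): the directory path,
# parameter values and training-loop placeholder below are hypothetical and only
# illustrate how MyDataset is meant to be driven.
if __name__ == "__main__":
    # assumes a folder with paired '<name>_mel_.data' / '<name>_label_.data' files
    dataset = MyDataset(feature_dir='features/', batch_size=32,
                        time_context=100, step=25, train_percent=0.8)
    print("batches per epoch:", len(dataset))
    for _ in range(len(dataset)):
        features, labels_onehot = dataset.iterate()
        # features: (batch_size, feature_size, time_context, 1)
        # labels_onehot: (batch_size, n_classes)
        # ...feed the batch to a model here...
    x_valid, y_valid = dataset.getValidation()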