data_loader.py
import os
import logging
import numpy as np
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)
from pyts.image import GramianAngularField

# global input shape used by pack_features_vector and data_gen; set via set_input_shape_global()
input_shape_global = None


def set_input_shape_global(input_shape):
    global input_shape_global
    input_shape_global = input_shape


def pack_features_vector(features, labels):
    """
    Pack the features into a single array.
    @param features: dictionary of feature tensors (x)
    @param labels: label tensor (y)
    @return: tuple of packed features and labels
    """
features = tf.stack(list(features.values()), axis=1)
features = tf.reshape(features, (-1, input_shape_global[1], input_shape_global[2], input_shape_global[3]))
labels = tf.reshape(labels, (-1, input_shape_global[1] * input_shape_global[2],))
labels = tf.reduce_mean(labels, axis=1)
return features, labels
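

# Illustrative usage sketch (an assumption, not part of the original pipeline): this helper
# is typically mapped over a dataset built from a dict of feature columns, e.g.:
#
#   set_input_shape_global((None, 5, 20, 8))
#   ds = ds.map(pack_features_vector)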


def data_gen(dir, split, region, deterministic=True, transpose_flag=False, gaf_flag=False):
    """
    Data generator method.
    @param dir: path to data directory
    @param split: train, val, or test split
    @param region: target region of files that should be loaded
    @param deterministic: whether to load the data deterministically or not
    @param transpose_flag: whether to transpose the tensors or not
    @param gaf_flag: whether to load the dataset in the GAF format
    @yield: tuple of features and labels
    """
with np.load(os.path.join(dir, split, region)) as data:
if deterministic:
files = data.files
else:
files = np.random.choice(data.files, size=len(data.files), replace=False)
for file in files:
y = tf.cast(data[file][:, :, -1], tf.dtypes.int32)
x = data[file][:, :, :-1]
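            # collapse the per-cell label grid to a single scalar label by averaging over both axes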
y = tf.math.reduce_mean(y, axis=0)
y = tf.math.reduce_mean(y, axis=0)
if transpose_flag:
x = np.transpose(x, axes=(1, 0, 2))
if gaf_flag:
x = np.reshape(x, (input_shape_global[1], input_shape_global[3]))
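                # encode each channel as a 2-D Gramian Angular Field image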
transformer = GramianAngularField(sample_range=None)
subs = []
for i in range(input_shape_global[3]):
                    # the transformer expects values between -1 and 1;
                    # this should generally hold due to the scaling in the preprocessing,
                    # however, in some cases values can fall outside these boundaries
                    x = np.clip(x, -1, 1)
x_transformed = transformer.transform(x[np.newaxis, :, i])
subs.append(x_transformed.squeeze())
x = np.stack(subs, axis=2)
yield x, y


def create_ds(dir, region, split, batch_size=32, in_memory_flag=True, count=False, deterministic=True, transpose_flag=False, gaf_flag=False,
              class_counts_file='class_counts.csv', filter_fn=None, cache_dir=None):
    """
    Method to create the tensorflow dataset.
    @param dir: path to data directory
    @param region: target region of files that should be loaded
    @param split: train, val, or test split
    @param batch_size: batch size for training
    @param in_memory_flag: whether the data is stored in one array or not
    @param count: whether to return the class counters or not
    @param deterministic: whether to load the data deterministically or not
    @param transpose_flag: whether to transpose the tensors or not
    @param gaf_flag: whether to load the dataset in the GAF format
    @param class_counts_file: path to class counts file
    @param filter_fn: filter function to apply
    @param cache_dir: cache directory
    @return: dataset, plus the positive and negative class counters if count is True
    """
if in_memory_flag:
with np.load(os.path.join(dir, split, region + '.npz')) as data:
y = tf.cast(data['arr_0'][:, :, :, -1], tf.dtypes.int32)
x = data['arr_0'][:, :, :, :-1]
y = tf.math.reduce_mean(y, axis=1)
y = tf.math.reduce_mean(y, axis=1)
y = y[:, tf.newaxis]
if transpose_flag:
x = tf.transpose(x, perm=(0, 2, 1, 3))
ds = tf.data.Dataset.from_tensor_slices((x, y))
else:
ds = tf.data.Dataset.from_generator(data_gen, args=[dir, split, region + '.npz', deterministic, transpose_flag, gaf_flag],
output_signature=(
tf.TensorSpec(shape=(input_shape_global[1], input_shape_global[2],
input_shape_global[3]), dtype=tf.float32),
tf.TensorSpec(shape=(), dtype=tf.int32)
))
ds = ds.filter(filter_fn) if filter_fn is not None else ds
ds = ds.cache(cache_dir + '_' + split) if cache_dir is not None else ds
ds = ds.batch(batch_size, num_parallel_calls=tf.data.AUTOTUNE, deterministic=deterministic)
ds = ds.prefetch(tf.data.AUTOTUNE)
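    # class counts come from a precomputed CSV; each '<split>_<region>' column holds the positive and negative counts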
if count:
class_counts_df = pd.read_csv(os.path.join(dir, class_counts_file))
pos_counter, neg_counter = class_counts_df[split + '_' + region]
return ds, pos_counter, neg_counter
else:
return ds


def load_data(dir, region, input_shape=(None, 5, 20, 8), batch_size=32, in_memory_flag=True, deterministic=True, transpose_flag=False,
              gaf_flag=False, class_counts_file='class_counts.csv', cache_dir=None):
    """
    Method to load data as a tensorflow dataset for training.
    @param dir: path to data directory
    @param region: target region of files that should be loaded
    @param input_shape: shape of the buckets
    @param batch_size: batch size for training
    @param in_memory_flag: whether the data is stored in one array or not
    @param deterministic: whether to load the data deterministically or not
    @param transpose_flag: whether to transpose the tensors or not
    @param gaf_flag: whether to load the dataset in the GAF format
    @param class_counts_file: path to class counts file
    @param cache_dir: cache directory
    @return: tuple of train, validation, and test dataset plus the class weight dictionary
    """
set_input_shape_global(input_shape)
train_ds, pos_train_counter, neg_train_counter = create_ds(dir=dir, region=region, split='train', batch_size=batch_size, in_memory_flag=in_memory_flag, count=True,
deterministic=deterministic, transpose_flag=transpose_flag, gaf_flag=gaf_flag,
class_counts_file=class_counts_file, filter_fn=None, cache_dir=cache_dir)
val_ds = create_ds(dir=dir, region=region, split='val', batch_size=batch_size, in_memory_flag=in_memory_flag, count=False, deterministic=deterministic,
transpose_flag=transpose_flag, gaf_flag=gaf_flag, class_counts_file=class_counts_file, filter_fn=None, cache_dir=cache_dir)
test_ds = create_ds(dir=dir, region=region, split='test', batch_size=batch_size, in_memory_flag=in_memory_flag, count=False, deterministic=deterministic,
transpose_flag=transpose_flag, gaf_flag=gaf_flag, class_counts_file=class_counts_file, filter_fn=None, cache_dir=cache_dir)
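    # inverse-frequency class weights: weight_for_0 * neg + weight_for_1 * pos == pos + neg, so each class contributes equally on average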
weight_for_0 = (1 / neg_train_counter) * ((pos_train_counter + neg_train_counter) / 2.0)
weight_for_1 = (1 / pos_train_counter) * ((pos_train_counter + neg_train_counter) / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}
return train_ds, val_ds, test_ds, class_weight
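

# Minimal usage sketch (illustrative only; the data directory and region name below are
# assumptions, not part of this module):
#
#   train_ds, val_ds, test_ds, class_weight = load_data(
#       dir='data/buckets', region='region_0',
#       input_shape=(None, 5, 20, 8), batch_size=32)
#   model.fit(train_ds, validation_data=val_ds, class_weight=class_weight)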