import os
import numpy as np
import pandas as pd
from itertools import product
from tqdm import tqdm
from iteration_utilities import unique_everseen
import time
import gc
from keras import backend as K
from data_generator import DataGenerator
from utils import Utils
from components import Components
# TODO
# should we use the absolute anomaly score in the minus, inverse and hinge losses?
# These loss functions were originally designed for representation learning,
# e.g., Euclidean distances or reconstruction errors, which are always positive.
# batch normalization
# add other network architectures that may outperform the tree-based methods, e.g., MoE or TabNet
class ADGym():
def __init__(self,
seed_list: list=[1, 2, 3],
la=5,
suffix='',
grid_mode='small',
grid_size=100,
dataset_specific=None):
'''
:param seed_list: list of random seeds for repeated experiments
:param la: number (int) or ratio (float) of labeled anomalies
:param suffix: suffix for saving experimental results
:param grid_mode: whether to use the large or small scale of component combinations
:param grid_size: maximum number of sampled component combinations (to save computational cost)
:param dataset_specific: run on a specific dataset only (None means all datasets in the datasets folder)
'''
self.la = la
if dataset_specific is not None:
self.suffix = '-'.join([suffix, dataset_specific, str(la), grid_mode, str(grid_size)])
else:
self.suffix = '-'.join([suffix, str(la), grid_mode, str(grid_size)])
self.seed_list = seed_list
self.grid_mode = grid_mode
self.grid_size = grid_size
self.dataset_specific = dataset_specific
self.utils = Utils()
if isinstance(la, int):
self.mode = 'nla'
elif isinstance(la, float):
self.mode = 'rla'
else:
raise NotImplementedError
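# e.g., an integer la=5 means five labeled anomalies are available ('nla' mode),
# while a float such as la=0.05 presumably means 5% of the anomalies are labeled ('rla' mode)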
self.generate_duplicates = False # whether to generate duplicates for small datasets
self.n_samples_lower_bound = 1000 # lower bound of sample size
self.n_samples_upper_bound = 3000 # upper bound of sample size
self.data_generator = DataGenerator(generate_duplicates=self.generate_duplicates,
n_samples_lower_bound=self.n_samples_lower_bound,
n_samples_upper_bound=self.n_samples_upper_bound)
# filtering out datasets that do not meet the requirements
def dataset_filter(self, dataset_list_org):
dataset_list, dataset_size = [], []
for dataset in dataset_list_org:
add = True
for seed in self.seed_list:
self.data_generator.dataset = dataset
self.data_generator.seed = seed
try:
data = self.data_generator.generator(la=1.00, at_least_one_labeled=True)
except Exception:
# exclude the dataset if data generation fails for the current seed
add = False
continue
if not self.generate_duplicates and \
len(data['y_train']) + len(data['y_test']) < self.n_samples_lower_bound:
add = False
else:
if self.mode == 'nla' and sum(data['y_train']) >= self.la:
pass
elif self.mode == 'rla' and sum(data['y_train']) > 0:
pass
else:
add = False
if add:
dataset_list.append(dataset)
dataset_size.append(len(data['y_train']) + len(data['y_test']))
else:
print(f"remove the dataset {dataset}")
# sort datasets by their sample size
dataset_list = [dataset_list[_] for _ in np.argsort(np.array(dataset_size))]
return dataset_list
def generate_gyms(self):
self.utils.set_seed(42)
# generate combinations of different components
com = Components()
print(com.gym(mode=self.grid_mode))  # show all components in the current grid mode (either large or small)
gyms_comb = list(product(*list(com.gym(mode=self.grid_mode).values())))
keys = list(com.gym(mode=self.grid_mode).keys())
gyms = []
for _ in tqdm(gyms_comb):
gym = {} # save components in dict
for j, __ in enumerate(_):
gym[keys[j]] = __
# todo: BCE and focal loss can be batch_resample=False!
# for inverse loss, we do not perform batch resampling strategy
if gym['loss_name'] == 'inverse' and gym['batch_resample']:
continue
# for other loss functions, we use batch resampling strategy
if gym['loss_name'] != 'inverse' and not gym['batch_resample']:
continue
# todo: ordinal loss for other network architectures
if gym['loss_name'] == 'ordinal' and gym['network_architecture'] != 'MLP':
continue
# skip combinations where the network architecture is ResNet or FTT but the activation function is not ReLU
if gym['network_architecture'] in ['ResNet', 'FTT'] and gym['act_fun'] != 'ReLU':
continue
gyms.append(gym)
print(f'The total size of grids: {len(gyms)}')
# randomly sample grids to reduce computational cost
if len(gyms) > self.grid_size:
idx = np.random.choice(np.arange(len(gyms)), self.grid_size, replace=False)
gyms = [gyms[_] for _ in idx]
# remove duplicated components
gyms = list(unique_everseen(gyms))
return gyms
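# For reference, each entry returned by generate_gyms() is a dict keyed by the component names used in
# run() below; the concrete values shown here are illustrative placeholders only, not the actual grid values:
# {'augmentation': ..., 'preprocess': ..., 'network_architecture': 'MLP', 'hidden_size_list': [100, 20],
#  'act_fun': 'ReLU', 'dropout': 0.1, 'network_initialization': ..., 'training_strategy': ...,
#  'loss_name': 'minus', 'optimizer_name': ..., 'batch_resample': True,
#  'epochs': 50, 'batch_size': 256, 'lr': 1e-3, 'weight_decay': 1e-2}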
def run(self):
if self.dataset_specific is None or not isinstance(self.dataset_specific, str):
# dataset list
dataset_list = [os.path.splitext(_)[0] for _ in os.listdir('datasets') if os.path.splitext(_)[1] == '.npz']
else:
dataset_list = [self.dataset_specific]
# filter datasets
dataset_list = self.dataset_filter(dataset_list)
# generate components
gyms = self.generate_gyms()
# create save path
if not os.path.exists('datasets/meta-features'):
os.makedirs('datasets/meta-features')
if not os.path.exists('result'):
os.makedirs('result')
for seed in self.seed_list:
# save results
df_results_AUCROC_train = pd.DataFrame(data=None, index=[str(_) for _ in gyms], columns=dataset_list)
df_results_AUCROC_test = pd.DataFrame(data=None, index=[str(_) for _ in gyms], columns=dataset_list)
df_results_AUCPR_train = pd.DataFrame(data=None, index=[str(_) for _ in gyms], columns=dataset_list)
df_results_AUCPR_test = pd.DataFrame(data=None, index=[str(_) for _ in gyms], columns=dataset_list)
df_results_runtime = pd.DataFrame(data=None, index=[str(_) for _ in gyms], columns=dataset_list)
for dataset in dataset_list:
# configure the data generator for the current dataset and seed
self.data_generator.dataset = dataset
self.data_generator.seed = seed
for j, gym in tqdm(enumerate(gyms)):
# generate data and save meta-features
if j == 0:
data = self.data_generator.generator(la=self.la, meta=True)
np.savez_compressed(os.path.join('datasets/meta-features', 'meta-features-' + dataset +
'-' + str(self.la) + '-' + str(seed) + '.npz'),
data=data['meta_features'])
# note: pass a copy of data into Components; otherwise, modifications inside the class would also change the original data
com = Components(seed=seed,
data=data.copy(),
augmentation=gym['augmentation'],
gan_specific_path=dataset + '-' + str(self.la) + '-' + str(seed) + '.npz',
preprocess=gym['preprocess'],
network_architecture=gym['network_architecture'],
hidden_size_list=gym['hidden_size_list'],
act_fun=gym['act_fun'],
dropout=gym['dropout'],
network_initialization=gym['network_initialization'],
training_strategy=gym['training_strategy'],
loss_name=gym['loss_name'],
optimizer_name=gym['optimizer_name'],
batch_resample=gym['batch_resample'],
epochs=gym['epochs'],
batch_size=gym['batch_size'],
lr=gym['lr'],
weight_decay=gym['weight_decay'])
try:
# training
start_time = time.time()
com.f_train()
end_time = time.time()
fit_time = end_time - start_time
# predicting
_, (metrics_train, metrics_test) = com.f_predict_score()
print(f'Dataset: {dataset}, Current combination: {gym}, training successfully.'
f' Performance (train): {metrics_train}, Performance (test): {metrics_test}\n')
except Exception as error:
print(f'Dataset: {dataset}, Current combination: {gym}, training failure. Error: {error}\n')
metrics_train, metrics_test, fit_time = None, None, None
# do not skip to the next combination here, so that the Keras session is still cleared and the failure is recorded below
K.clear_session()
del com; gc.collect()
# save results
if metrics_train is not None and metrics_test is not None and fit_time is not None:
df_results_AUCROC_train.loc[str(gym), dataset] = metrics_train['aucroc']
df_results_AUCROC_test.loc[str(gym), dataset] = metrics_test['aucroc']
df_results_AUCPR_train.loc[str(gym), dataset] = metrics_train['aucpr']
df_results_AUCPR_test.loc[str(gym), dataset] = metrics_test['aucpr']
df_results_runtime.loc[str(gym), dataset] = fit_time
print(f'Dataset: {dataset}, Current combination: {gym}, training successfully.')
else:
print(f'Dataset: {dataset}, Current combination: {gym}, training failure.')
# output
df_results_AUCROC_train.to_csv(os.path.join('result', 'result-AUCROC-train-' + self.suffix + '-' + str(seed) + '.csv'), index=True)
df_results_AUCROC_test.to_csv(os.path.join('result', 'result-AUCROC-test-' + self.suffix + '-' + str(seed) + '.csv'), index=True)
df_results_AUCPR_train.to_csv(os.path.join('result', 'result-AUCPR-train-' + self.suffix + '-' + str(seed) + '.csv'), index=True)
df_results_AUCPR_test.to_csv(os.path.join('result', 'result-AUCPR-test-' + self.suffix + '-' + str(seed) + '.csv'), index=True)
df_results_runtime.to_csv(os.path.join('result', 'result-runtime-' + self.suffix + '-' + str(seed) + '.csv'), index=True)
del data
if __name__ == '__main__':
    adgym = ADGym(suffix='test', la=10, grid_mode='large', grid_size=1000, seed_list=[1, 2, 3])
    adgym.run()
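# A minimal usage sketch (commented out): running ADGym on a single dataset with a smaller grid and a single seed.
# The dataset name 'cardio' is a placeholder assumption; any .npz file under the datasets/ folder should work.
#
# adgym_single = ADGym(suffix='demo',
#                      la=10,                      # ten labeled anomalies
#                      grid_mode='small',          # small-scale component combinations
#                      grid_size=100,              # sample at most 100 combinations
#                      seed_list=[1],              # one seed for a quick run
#                      dataset_specific='cardio')  # evaluate only this dataset
# adgym_single.run()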