-
-
Notifications
You must be signed in to change notification settings - Fork 136
/
Copy pathexec.py
138 lines (115 loc) · 4.4 KB
/
exec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import logging
import os
import sys
from sklearn.model_selection import StratifiedKFold
import numpy as np
# Put "<directory of this file>/lib/oboe/automl" on the import path *before*
# importing `oboe` below; the order of these statements is therefore significant.
# NOTE(review): presumably this is where the framework setup places the Oboe
# sources -- confirm against the setup script.
sys.path.append(
"{}/lib/oboe/automl".format(os.path.realpath(os.path.dirname(__file__)))
)
from oboe import AutoLearner
from frameworks.shared.callee import call_run, result
from frameworks.shared.utils import Timer
# Module-level logger named after this module.
log = logging.getLogger(__name__)
def kfold_fit_validate(self, x_train, y_train, n_folds, random_state=None):
    """Cross-validate this model with shuffled, stratified k-fold splits.

    Written as a method (it takes ``self``) so that it can be assigned onto
    ``oboe.model.Model`` as a replacement for its ``kfold_fit_validate``.

    Besides returning its results, the call records them on the instance:
    ``self.cv_error`` (mean of the per-fold errors), ``self.cv_predictions``
    (the out-of-fold predictions) and ``self.sampled = True``.

    Args:
        x_train (np.ndarray): Features of the training dataset; rows are
            selected with ``x_train[idx, :]``, so it must be 2-D.
        y_train (np.ndarray): Labels of the training dataset. Predictions are
            stored as a column (``np.expand_dims(..., axis=1)``), so this is
            presumably shaped ``(n_samples, 1)`` -- confirm with the caller.
        n_folds (int): Number of folds to use for cross validation.
        random_state: Seed forwarded to ``StratifiedKFold``.

    Returns:
        np.ndarray: The error of each fold (length ``n_folds``).
        np.ndarray: Out-of-fold predictions, same shape as ``y_train``.
    """
    fold_errors = np.empty(n_folds)
    oof_predictions = np.empty(y_train.shape)
    splitter = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)

    for fold, (fit_rows, held_out_rows) in enumerate(
        splitter.split(x_train, y_train)
    ):
        fit_labels = y_train[fit_rows]
        estimator = self.instantiate()

        if len(np.unique(fit_labels)) <= 1:
            # Only one class in the fitting split: nothing to learn, so
            # predict that single label for every held-out row.
            oof_predictions[held_out_rows] = fit_labels[0]
        else:
            estimator.fit(x_train[fit_rows, :], fit_labels)
            held_out_pred = estimator.predict(x_train[held_out_rows, :])
            oof_predictions[held_out_rows] = np.expand_dims(held_out_pred, axis=1)

        fold_errors[fold] = self.error(
            y_train[held_out_rows], oof_predictions[held_out_rows]
        )

    # Cache the outcome on the model instance.
    self.cv_error = fold_errors.mean()
    self.cv_predictions = oof_predictions
    self.sampled = True

    if self.verbose:
        print("{} {} complete.".format(self.algorithm, self.hyperparameters))

    return fold_errors, oof_predictions
def run(dataset, config):
    """Train Oboe's ``AutoLearner`` on ``dataset`` and report test predictions.

    Steps, in order: patch ``oboe.model.Model.kfold_fit_validate`` with the
    module-level ``kfold_fit_validate``, build an ``AutoLearner`` from
    ``config``, fit it on ``dataset.train`` (timed), predict on
    ``dataset.test`` (timed) and hand everything to ``result(...)``.

    Args:
        dataset: Object exposing ``train.X``, ``train.y``, ``test.X`` and
            ``test.y``.
        config: Object exposing ``type``, ``framework_version``,
            ``framework_params`` (dict), ``cores``, ``max_runtime_seconds``,
            ``metric`` and ``output_predictions_file``.

    Returns:
        Whatever ``result(...)`` returns for this run.

    Raises:
        ValueError: If ``config.type`` is not ``"classification"``, or if
            fitting fails with an ``IndexError`` while the ensemble holds no
            base learners (i.e. no model was produced in time).
        IndexError: Re-raised unchanged when fitting fails with an
            ``IndexError`` although the ensemble does hold base learners.
    """
    log.info("\n**** Applying monkey patch ****\n")
    from oboe.model import Model

    Model.kfold_fit_validate = kfold_fit_validate

    log.info(f"\n**** Oboe [{config.framework_version}] ****\n")

    is_classification = config.type == "classification"
    if not is_classification:
        # regression currently fails (as of 26.02.2019: still under development state by oboe team)
        raise ValueError("Regression is not yet supported (under development).")

    X_train = dataset.train.X
    y_train = dataset.train.y

    # Parameters whose name starts with "_" are consumed by this function
    # (e.g. "_n_cores") and are not forwarded to AutoLearner.
    training_params = {
        k: v for k, v in config.framework_params.items() if not k.startswith("_")
    }
    n_cores = config.framework_params.get("_n_cores", config.cores)

    log.info(
        "Running oboe with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, n_cores
        )
    )
    log.warning(
        "We completely ignore the advice to optimize towards metric: {}.".format(
            config.metric
        )
    )

    aml = AutoLearner(
        p_type="classification" if is_classification else "regression",
        n_cores=n_cores,
        runtime_limit=config.max_runtime_seconds,
        **training_params,
    )
    # NOTE(review): presumably error_matrix is a pandas object here and Oboe
    # expects a plain array afterwards -- confirm against the Oboe version used.
    aml.error_matrix = aml.error_matrix.to_numpy()

    def aml_models():
        """Return the ensemble plus its base learners, or [] if there are none.

        The ensemble is inspected at *call* time. Deciding this once before
        ``aml.fit`` would freeze the pre-training state, so the model count
        and the IndexError handling below would never see the fitted models.
        """
        base_learners = aml.ensemble.base_learners
        if len(base_learners) > 0:
            return [aml.ensemble, *base_learners]
        return []

    with Timer() as training:
        try:
            aml.fit(X_train, y_train)
        except IndexError as e:
            # incorrect handling of some IndexError in oboe if ensemble is empty
            if len(aml_models()) == 0:
                raise ValueError(
                    "Oboe could not produce any model in the requested time."
                ) from e
            raise

    log.info("Predicting on the test set.")
    X_test = dataset.test.X
    y_test = dataset.test.y
    with Timer() as predict:
        predictions = aml.predict(X_test)
    # Flatten to one prediction per test row.
    predictions = predictions.reshape(len(X_test))

    if is_classification:
        probabilities = "predictions"  # encoding is handled by caller in `__init__.py`
    else:
        probabilities = None

    return result(
        output_file=config.output_predictions_file,
        predictions=predictions,
        truth=y_test,
        probabilities=probabilities,
        target_is_encoded=is_classification,
        models_count=len(aml_models()),
        training_duration=training.duration,
        predict_duration=predict.duration,
    )
# Script entry point: when this file is executed directly, hand `run` to the
# shared `call_run` helper imported from `frameworks.shared.callee`.
if __name__ == "__main__":
call_run(run)