Simplify InputValidator: Allows pandas frame to directly reach the pipeline #1135

Merged · 18 commits · Jun 25, 2021
Changes from 2 commits
6 changes: 2 additions & 4 deletions autosklearn/automl.py
@@ -537,10 +537,8 @@ def fit(
         self._dataset_name = dataset_name
         self._stopwatch.start_task(self._dataset_name)

-        if feat_type is None and self.InputValidator.feature_validator.feat_type:
-            self._feat_type = self.InputValidator.feature_validator.feat_type
-        elif feat_type is not None:
-            self._feat_type = feat_type
+        # Take the feature types from the validator
+        self._feat_type = self.InputValidator.feature_validator.feat_type

         # Produce debug information to the logfile
         self._logger.debug('Starting to print environment information')
2 changes: 1 addition & 1 deletion autosklearn/data/abstract_data_manager.py
@@ -24,7 +24,7 @@ def perform_one_hot_encoding(
     rvals = []
     if any(categorical):
         encoder = DataPreprocessor(
-            categorical_features=categorical, force_sparse_output=sparse)
+            feat_type=categorical, force_sparse_output=sparse)
         rvals.append(encoder.fit_transform(data[0]))
         for d in data[1:]:
            rvals.append(encoder.transform(d))
225 changes: 78 additions & 147 deletions autosklearn/data/feature_validator.py

Large diffs are not rendered by default.
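The validator rewrite itself is collapsed above. As a rough, hypothetical sketch of the direction only (not the actual implementation): rather than one-hot encoding the frame up front, the validator can infer a per-column feat_type mapping from the DataFrame's dtypes and let the raw frame travel on to the pipeline.

```python
import pandas as pd

def infer_feat_type(X: pd.DataFrame) -> dict:
    # Hypothetical helper: map column position -> 'categorical'/'numerical',
    # mirroring the feat_type dicts used elsewhere in this PR.
    return {
        i: 'categorical' if isinstance(dtype, pd.CategoricalDtype) else 'numerical'
        for i, dtype in enumerate(X.dtypes)
    }
```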

23 changes: 8 additions & 15 deletions autosklearn/data/xy_data_manager.py
@@ -35,7 +35,10 @@ def __init__(
             self.info['has_missing'] = np.all(np.isfinite(X.data))
         else:
             self.info['is_sparse'] = 0
-            self.info['has_missing'] = np.all(np.isfinite(X))
+            if hasattr(X, 'iloc'):
+                self.info['has_missing'] = X.isnull().values.any()
+            else:
+                self.info['has_missing'] = np.all(np.isfinite(X))

         label_num = {
             REGRESSION: 1,
@@ -54,14 +57,10 @@ def __init__(
         if y_test is not None:
             self.data['Y_test'] = y_test

-        if feat_type is not None:
-            for feat in feat_type:
-                allowed_types = ['numerical', 'categorical']
-                if feat.lower() not in allowed_types:
-                    raise ValueError("Entry '%s' in feat_type not in %s" %
-                                     (feat.lower(), str(allowed_types)))
-
-        self.feat_type = feat_type
+        if feat_type is None:
+            self.feat_type = {i: 'Numerical' for i in range(np.shape(X)[1])}
+        else:
+            self.feat_type = feat_type

         # TODO: try to guess task type!

@@ -73,9 +72,3 @@ def __init__(
             raise ValueError('X and y must have the same number of '
                              'datapoints, but have %d and %d.' % (X.shape[0],
                                                                   y.shape[0]))
-        if self.feat_type is None:
-            self.feat_type = ['Numerical'] * X.shape[1]
-        if X.shape[1] != len(self.feat_type):
-            raise ValueError('X and feat_type must have the same number of columns, '
-                             'but are %d and %d.' %
-                             (X.shape[1], len(self.feat_type)))
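A minimal, self-contained sketch of the `hasattr(X, 'iloc')` dispatch introduced above (`has_missing` is a hypothetical helper; the data manager stores the flags inline):

```python
import numpy as np
import pandas as pd

def has_missing(X) -> bool:
    # pandas objects report missing values via isnull(); plain numpy
    # arrays are scanned for non-finite entries instead
    if hasattr(X, 'iloc'):
        return bool(X.isnull().values.any())
    return bool(np.any(~np.isfinite(X)))

X_df = pd.DataFrame({'a': [1.0, None], 'b': [3.0, 4.0]})
print(has_missing(X_df))                       # True
print(has_missing(np.asarray([[1.0, 2.0]])))   # False
# default feat_type when the user supplies none, as in the diff above:
print({i: 'Numerical' for i in range(np.shape(X_df)[1])})
```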
2 changes: 2 additions & 0 deletions autosklearn/estimators.py
@@ -565,6 +565,7 @@ def get_configuration_space(
         X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
         y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
         dataset_name: Optional[str] = None,
+        feat_type: Optional[List[str]] = None,
     ):
         """
         Returns the Configuration Space object, from which Auto-Sklearn
@@ -590,6 +591,7 @@ def get_configuration_space(
             X, y,
             X_test=X_test, y_test=y_test,
             dataset_name=dataset_name,
+            feat_type=feat_type,
             only_return_configuration_space=True,
         ) if self.automl_.configuration_space is None else self.automl_.configuration_space
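A hedged usage sketch of the new keyword (the data and estimator settings are illustrative only):

```python
import pandas as pd
from autosklearn.classification import AutoSklearnClassifier

X = pd.DataFrame({'color': pd.Categorical(['red', 'blue', 'red']),
                  'size': [1.0, 2.5, 3.0]})
y = [0, 1, 0]

automl = AutoSklearnClassifier(time_left_for_this_task=60)
# one entry per column, matching the feat_type: Optional[List[str]] signature
cs = automl.get_configuration_space(X, y, feat_type=['categorical', 'numerical'])
```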
20 changes: 5 additions & 15 deletions autosklearn/evaluation/abstract_evaluator.py
@@ -235,21 +235,11 @@ def __init__(
             self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline
             self.predict_function = self._predict_proba

-        categorical_mask = []
-        for feat in self.datamanager.feat_type:
-            if feat.lower() == 'numerical':
-                categorical_mask.append(False)
-            elif feat.lower() == 'categorical':
-                categorical_mask.append(True)
-            else:
-                raise ValueError(feat)
-        if np.sum(categorical_mask) > 0:
-            self._init_params = {
-                'data_preprocessing:categorical_features':
-                    categorical_mask
-            }
-        else:
-            self._init_params = {}
+        self._init_params = {
+            'data_preprocessing:feat_type':
+                self.datamanager.feat_type
+        }

         if init_params is not None:
             self._init_params.update(init_params)
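The evaluator no longer reconstructs a boolean mask; it forwards the data manager's feat_type unchanged. A sketch of the routing convention (values are illustrative; '<step>:<param>' keys are how init_params entries reach individual pipeline steps):

```python
# Illustrative values: feat_type flows into the pipeline's
# data_preprocessing step via an init_params entry keyed by step name.
feat_type = {0: 'categorical', 1: 'numerical'}
init_params = {'data_preprocessing:feat_type': feat_type}
# the evaluator then builds its model roughly as:
#   model = self.model_class(config=..., init_params=init_params)
```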
123 changes: 89 additions & 34 deletions autosklearn/evaluation/train_evaluator.py
@@ -8,6 +8,9 @@
 from ConfigSpace import Configuration

 import numpy as np
+
+import pandas as pd
+
 from smac.tae import TAEAbortException, StatusType

 from sklearn.base import BaseEstimator
@@ -100,7 +103,10 @@ def subsample_indices(
     # required to subsample because otherwise scikit-learn will complain

     if task_type in CLASSIFICATION_TASKS and task_type != MULTILABEL_CLASSIFICATION:
-        stratify = Y_train[train_indices]
+        stratify: Optional[
+            Union[pd.DataFrame, np.ndarray]
+        ] = Y_train.iloc[train_indices] if hasattr(
+            Y_train, 'iloc') else Y_train[train_indices]
     else:
         stratify = None

@@ -134,18 +140,25 @@ def _fit_with_budget(
     ):
         if model.estimator_supports_iterative_fit():
             budget_factor = model.get_max_iter()
-            Xt, fit_params = model.fit_transformer(X_train[train_indices],
-                                                   Y_train[train_indices])
+            Xt, fit_params = model.fit_transformer(
+                X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else X_train[train_indices],
+                Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices],
+            )

             n_iter = int(np.ceil(budget / 100 * budget_factor))
-            model.iterative_fit(Xt, Y_train[train_indices], n_iter=n_iter, refit=True,
-                                **fit_params)
+            model.iterative_fit(
+                Xt,
+                Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices],
+                n_iter=n_iter,
+                refit=True,
+                **fit_params
+            )
         else:
             _fit_and_suppress_warnings(
                 logger,
                 model,
-                X_train[train_indices],
-                Y_train[train_indices],
+                X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else X_train[train_indices],
+                Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices],
             )

     elif (
@@ -322,19 +335,27 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:

                 if iterations[i] == 1:
                     self.Y_train_targets[train_indices] = \
-                        self.Y_train[train_indices]
+                        self.Y_train.iloc[train_indices] if hasattr(
+                            self.Y_train, 'iloc') else self.Y_train[train_indices]
                     self.Y_targets[i] = self.Y_train[test_indices]

                     Xt, fit_params = model.fit_transformer(
-                        self.X_train[train_indices],
-                        self.Y_train[train_indices])
+                        self.X_train.iloc[train_indices] if hasattr(
+                            self.X_train, 'iloc') else self.X_train[train_indices],
+                        self.Y_train.iloc[train_indices] if hasattr(
+                            self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    )
                     Xt_array[i] = Xt
                     fit_params_array[i] = fit_params
                 n_iter = int(2 ** iterations[i] / 2) if iterations[i] > 1 else 2
                 total_n_iterations[i] = total_n_iterations[i] + n_iter

-                model.iterative_fit(Xt_array[i], self.Y_train[train_indices],
-                                    n_iter=n_iter, **fit_params_array[i])
+                model.iterative_fit(
+                    Xt_array[i],
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    n_iter=n_iter, **fit_params_array[i]
+                )

                 (
                     train_pred,
@@ -356,7 +377,8 @@
                 # Compute train loss of this fold and store it. train_loss could
                 # either be a scalar or a dict of scalars with metrics as keys.
                 train_loss = self._loss(
-                    self.Y_train_targets[train_indices],
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
                     train_pred,
                 )
                 train_losses[i] = train_loss
@@ -738,10 +760,15 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int]
         file_output = True if self.num_cv_folds == 1 else False

         if model.estimator_supports_iterative_fit():
-            Xt, fit_params = model.fit_transformer(self.X_train[train_indices],
-                                                   self.Y_train[train_indices])
+            Xt, fit_params = model.fit_transformer(
+                self.X_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.X_train[train_indices],
+                self.Y_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.Y_train[train_indices],
+            )

-            self.Y_train_targets[train_indices] = self.Y_train[train_indices]
+            self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices]

             iteration = 1
             total_n_iteration = 0
@@ -759,8 +786,12 @@
             ):
                 n_iter = int(2**iteration/2) if iteration > 1 else 2
                 total_n_iteration += n_iter
-                model.iterative_fit(Xt, self.Y_train[train_indices],
-                                    n_iter=n_iter, **fit_params)
+                model.iterative_fit(
+                    Xt,
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    n_iter=n_iter, **fit_params
+                )
                 (
                     Y_train_pred,
                     Y_optimization_pred,
@@ -775,7 +806,11 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int]
                 if add_model_to_self:
                     self.model = model

-                train_loss = self._loss(self.Y_train[train_indices], Y_train_pred)
+                train_loss = self._loss(
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    Y_train_pred
+                )
                 loss = self._loss(self.Y_train[test_indices], Y_optimization_pred)
                 additional_run_info = model.get_additional_run_info()

@@ -814,7 +849,11 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int]
                 additional_run_info
             ) = self._partial_fit_and_predict_standard(fold, train_indices, test_indices,
                                                        add_model_to_self)
-            train_loss = self._loss(self.Y_train[train_indices], Y_train_pred)
+            train_loss = self._loss(
+                self.Y_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.Y_train[train_indices],
+                Y_train_pred
+            )
             loss = self._loss(self.Y_train[test_indices], Y_optimization_pred)
             if self.model.estimator_supports_iterative_fit():
                 model_max_iter = self.model.get_max_iter()
@@ -852,17 +891,21 @@ def _partial_fit_and_predict_standard(
         _fit_and_suppress_warnings(
             self.logger,
             model,
-            self.X_train[train_indices],
-            self.Y_train[train_indices],
+            self.X_train.iloc[train_indices] if hasattr(
+                self.X_train, 'iloc') else self.X_train[train_indices],
+            self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices],
         )

         if add_model_to_self:
             self.model = model
         else:
             self.models[fold] = model

-        self.Y_targets[fold] = self.Y_train[test_indices]
-        self.Y_train_targets[train_indices] = self.Y_train[train_indices]
+        self.Y_targets[fold] = self.Y_train.iloc[test_indices] if hasattr(
+            self.Y_train, 'iloc') else self.Y_train[test_indices]
+        self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(
+            self.Y_train, 'iloc') else self.Y_train[train_indices]

         train_pred, opt_pred, valid_pred, test_pred = self._predict(
             model=model,
@@ -893,7 +936,8 @@ def _partial_fit_and_predict_budget(
         model = self._get_model()
         self.indices[fold] = ((train_indices, test_indices))
         self.Y_targets[fold] = self.Y_train[test_indices]
-        self.Y_train_targets[train_indices] = self.Y_train[train_indices]
+        self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(
+            self.Y_train, 'iloc') else self.Y_train[train_indices],

         _fit_with_budget(
             X_train=self.X_train,
@@ -929,13 +973,21 @@ def _partial_fit_and_predict_budget(
     def _predict(self, model: BaseEstimator, test_indices: List[int],
                  train_indices: List[int]) -> Tuple[np.ndarray, np.ndarray,
                                                     np.ndarray, np.ndarray]:
-        train_pred = self.predict_function(self.X_train[train_indices],
-                                           model, self.task_type,
-                                           self.Y_train[train_indices])
+        train_pred = self.predict_function(
+            self.X_train.iloc[train_indices] if hasattr(
+                self.X_train, 'iloc') else self.X_train[train_indices],
+            model, self.task_type,
+            self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices]
+        )

-        opt_pred = self.predict_function(self.X_train[test_indices],
-                                         model, self.task_type,
-                                         self.Y_train[train_indices])
+        opt_pred = self.predict_function(
+            self.X_train.iloc[test_indices] if hasattr(
+                self.X_train, 'iloc') else self.X_train[test_indices],
+            model, self.task_type,
+            self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices]
+        )

         if self.X_valid is not None:
             X_valid = self.X_valid.copy()
@@ -947,9 +999,12 @@ def _predict(self, model: BaseEstimator, test_indices: List[int],

         if self.X_test is not None:
             X_test = self.X_test.copy()
-            test_pred = self.predict_function(X_test, model,
-                                              self.task_type,
-                                              self.Y_train[train_indices])
+            test_pred = self.predict_function(
+                X_test, model,
+                self.task_type,
+                self.Y_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.Y_train[train_indices]
+            )
         else:
             test_pred = None
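Every call site above repeats the same conditional. A possible consolidation (not part of this PR) would be a single helper:

```python
import numpy as np
import pandas as pd

def subset(data, indices):
    # Hypothetical helper: row-select from either a pandas object
    # (positional .iloc) or a numpy array (plain indexing).
    if hasattr(data, 'iloc'):
        return data.iloc[indices]
    return data[indices]

y_np = np.array([0, 1, 0, 1])
y_pd = pd.Series(y_np)
# identical rows come back for both container types
assert (subset(y_pd, [0, 2]).to_numpy() == subset(y_np, [0, 2])).all()
```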
4 changes: 3 additions & 1 deletion autosklearn/metalearning/metafeatures/metafeatures.py
@@ -1002,7 +1002,9 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger,
     # sparse matrices because of wrong sparse format)
     sparse = scipy.sparse.issparse(X)
     DPP = DataPreprocessor(
-        categorical_features=categorical, force_sparse_output=True)
+        feat_type={i: 'categorical' if feat else 'numerical'
+                   for i, feat in enumerate(categorical)},
+        force_sparse_output=True)
     X_transformed = DPP.fit_transform(X)
     categorical_transformed = [False] * X_transformed.shape[1]
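DataPreprocessor now takes the dict form, so the metafeature code converts its boolean mask inline. The conversion on its own (standalone illustration):

```python
categorical = [True, False, False]
feat_type = {i: 'categorical' if is_cat else 'numerical'
             for i, is_cat in enumerate(categorical)}
assert feat_type == {0: 'categorical', 1: 'numerical', 2: 'numerical'}
```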