Simplify InputValidator: Allows pandas frame to directly reach the pipeline #1135

Merged · 18 commits · Jun 25, 2021
Changes from 2 commits
6 changes: 2 additions & 4 deletions autosklearn/automl.py
@@ -537,10 +537,8 @@ def fit(
         self._dataset_name = dataset_name
         self._stopwatch.start_task(self._dataset_name)

-        if feat_type is None and self.InputValidator.feature_validator.feat_type:
-            self._feat_type = self.InputValidator.feature_validator.feat_type
-        elif feat_type is not None:
-            self._feat_type = feat_type
+        # Take the feature types from the validator
+        self._feat_type = self.InputValidator.feature_validator.feat_type

         # Produce debug information to the logfile
         self._logger.debug('Starting to print environment information')
2 changes: 1 addition & 1 deletion autosklearn/data/abstract_data_manager.py
@@ -24,7 +24,7 @@ def perform_one_hot_encoding(
     rvals = []
     if any(categorical):
         encoder = DataPreprocessor(
-            categorical_features=categorical, force_sparse_output=sparse)
+            feat_type=categorical, force_sparse_output=sparse)
         rvals.append(encoder.fit_transform(data[0]))
         for d in data[1:]:
            rvals.append(encoder.transform(d))
225 changes: 78 additions & 147 deletions autosklearn/data/feature_validator.py

Large diffs are not rendered by default.
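The validator rewrite itself is collapsed above. As a rough, hypothetical sketch of the direction only (not the actual implementation): rather than one-hot encoding the frame up front, the validator can infer a per-column feat_type mapping from the DataFrame's dtypes and let the raw frame travel on to the pipeline.

```python
import pandas as pd

def infer_feat_type(X: pd.DataFrame) -> dict:
    # Hypothetical helper: map column position -> 'categorical'/'numerical',
    # mirroring the feat_type dicts used elsewhere in this PR.
    return {
        i: 'categorical' if isinstance(dtype, pd.CategoricalDtype) else 'numerical'
        for i, dtype in enumerate(X.dtypes)
    }
```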

23 changes: 8 additions & 15 deletions autosklearn/data/xy_data_manager.py
@@ -35,7 +35,10 @@ def __init__(
             self.info['has_missing'] = np.all(np.isfinite(X.data))
         else:
             self.info['is_sparse'] = 0
-            self.info['has_missing'] = np.all(np.isfinite(X))
+            if hasattr(X, 'iloc'):
+                self.info['has_missing'] = X.isnull().values.any()
+            else:
+                self.info['has_missing'] = np.all(np.isfinite(X))

         label_num = {
             REGRESSION: 1,
@@ -54,14 +57,10 @@ def __init__(
         if y_test is not None:
             self.data['Y_test'] = y_test

-        if feat_type is not None:
-            for feat in feat_type:
-                allowed_types = ['numerical', 'categorical']
-                if feat.lower() not in allowed_types:
-                    raise ValueError("Entry '%s' in feat_type not in %s" %
-                                     (feat.lower(), str(allowed_types)))
-
-        self.feat_type = feat_type
+        if feat_type is None:
+            self.feat_type = {i: 'Numerical' for i in range(np.shape(X)[1])}
+        else:
+            self.feat_type = feat_type

         # TODO: try to guess task type!

@@ -73,9 +72,3 @@ def __init__(
             raise ValueError('X and y must have the same number of '
                              'datapoints, but have %d and %d.' % (X.shape[0],
                                                                   y.shape[0]))
-        if self.feat_type is None:
-            self.feat_type = ['Numerical'] * X.shape[1]
-        if X.shape[1] != len(self.feat_type):
-            raise ValueError('X and feat_type must have the same number of columns, '
-                             'but are %d and %d.' %
-                             (X.shape[1], len(self.feat_type)))
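A minimal, self-contained sketch of the `hasattr(X, 'iloc')` dispatch introduced above (`has_missing` is a hypothetical helper; the data manager stores the flags inline):

```python
import numpy as np
import pandas as pd

def has_missing(X) -> bool:
    # pandas objects report missing values via isnull(); plain numpy
    # arrays are scanned for non-finite entries instead
    if hasattr(X, 'iloc'):
        return bool(X.isnull().values.any())
    return bool(np.any(~np.isfinite(X)))

X_df = pd.DataFrame({'a': [1.0, None], 'b': [3.0, 4.0]})
print(has_missing(X_df))                       # True
print(has_missing(np.asarray([[1.0, 2.0]])))   # False
# default feat_type when the user supplies none, as in the diff above:
print({i: 'Numerical' for i in range(np.shape(X_df)[1])})
```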
2 changes: 2 additions & 0 deletions autosklearn/estimators.py
@@ -565,6 +565,7 @@ def get_configuration_space(
         X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
         y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
         dataset_name: Optional[str] = None,
+        feat_type: Optional[List[str]] = None,
     ):
         """
         Returns the Configuration Space object, from which Auto-Sklearn
@@ -590,6 +591,7 @@ def get_configuration_space(
             X, y,
             X_test=X_test, y_test=y_test,
             dataset_name=dataset_name,
+            feat_type=feat_type,
             only_return_configuration_space=True,
         ) if self.automl_.configuration_space is None else self.automl_.configuration_space
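A hedged usage sketch of the new keyword (the data and estimator settings are illustrative only):

```python
import pandas as pd
from autosklearn.classification import AutoSklearnClassifier

X = pd.DataFrame({'color': pd.Categorical(['red', 'blue', 'red']),
                  'size': [1.0, 2.5, 3.0]})
y = [0, 1, 0]

automl = AutoSklearnClassifier(time_left_for_this_task=60)
# one entry per column, matching the feat_type: Optional[List[str]] signature
cs = automl.get_configuration_space(X, y, feat_type=['categorical', 'numerical'])
```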
20 changes: 5 additions & 15 deletions autosklearn/evaluation/abstract_evaluator.py
@@ -235,21 +235,11 @@ def __init__(
             self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline
             self.predict_function = self._predict_proba

-        categorical_mask = []
-        for feat in self.datamanager.feat_type:
-            if feat.lower() == 'numerical':
-                categorical_mask.append(False)
-            elif feat.lower() == 'categorical':
-                categorical_mask.append(True)
-            else:
-                raise ValueError(feat)
-        if np.sum(categorical_mask) > 0:
-            self._init_params = {
-                'data_preprocessing:categorical_features':
-                    categorical_mask
-            }
-        else:
-            self._init_params = {}
+        self._init_params = {
+            'data_preprocessing:feat_type':
+                self.datamanager.feat_type
+        }

         if init_params is not None:
             self._init_params.update(init_params)
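The evaluator no longer reconstructs a boolean mask; it forwards the data manager's feat_type unchanged. A sketch of the routing convention (values are illustrative; '<step>:<param>' keys are how init_params entries reach individual pipeline steps):

```python
# Illustrative values: feat_type flows into the pipeline's
# data_preprocessing step via an init_params entry keyed by step name.
feat_type = {0: 'categorical', 1: 'numerical'}
init_params = {'data_preprocessing:feat_type': feat_type}
# the evaluator then builds its model roughly as:
#   model = self.model_class(config=..., init_params=init_params)
```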
123 changes: 89 additions & 34 deletions autosklearn/evaluation/train_evaluator.py
@@ -8,6 +8,9 @@
 from ConfigSpace import Configuration

 import numpy as np
+
+import pandas as pd
+
 from smac.tae import TAEAbortException, StatusType

 from sklearn.base import BaseEstimator
@@ -100,7 +103,10 @@ def subsample_indices(
     # required to subsample because otherwise scikit-learn will complain

     if task_type in CLASSIFICATION_TASKS and task_type != MULTILABEL_CLASSIFICATION:
-        stratify = Y_train[train_indices]
+        stratify: Optional[
+            Union[pd.DataFrame, np.ndarray]
+        ] = Y_train.iloc[train_indices] if hasattr(
+            Y_train, 'iloc') else Y_train[train_indices]
     else:
         stratify = None

@@ -134,18 +140,25 @@ def _fit_with_budget(
     ):
         if model.estimator_supports_iterative_fit():
             budget_factor = model.get_max_iter()
-            Xt, fit_params = model.fit_transformer(X_train[train_indices],
-                                                   Y_train[train_indices])
+            Xt, fit_params = model.fit_transformer(
+                X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else X_train[train_indices],
+                Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices],
+            )

             n_iter = int(np.ceil(budget / 100 * budget_factor))
-            model.iterative_fit(Xt, Y_train[train_indices], n_iter=n_iter, refit=True,
-                                **fit_params)
+            model.iterative_fit(
+                Xt,
+                Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices],
+                n_iter=n_iter,
+                refit=True,
+                **fit_params
+            )
         else:
             _fit_and_suppress_warnings(
                 logger,
                 model,
-                X_train[train_indices],
-                Y_train[train_indices],
+                X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else X_train[train_indices],
+                Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices],
             )

     elif (
@@ -322,19 +335,27 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None:

                 if iterations[i] == 1:
                     self.Y_train_targets[train_indices] = \
-                        self.Y_train[train_indices]
+                        self.Y_train.iloc[train_indices] if hasattr(
+                            self.Y_train, 'iloc') else self.Y_train[train_indices]
                     self.Y_targets[i] = self.Y_train[test_indices]

                     Xt, fit_params = model.fit_transformer(
-                        self.X_train[train_indices],
-                        self.Y_train[train_indices])
+                        self.X_train.iloc[train_indices] if hasattr(
+                            self.X_train, 'iloc') else self.X_train[train_indices],
+                        self.Y_train.iloc[train_indices] if hasattr(
+                            self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    )
                     Xt_array[i] = Xt
                     fit_params_array[i] = fit_params
                 n_iter = int(2 ** iterations[i] / 2) if iterations[i] > 1 else 2
                 total_n_iterations[i] = total_n_iterations[i] + n_iter

-                model.iterative_fit(Xt_array[i], self.Y_train[train_indices],
-                                    n_iter=n_iter, **fit_params_array[i])
+                model.iterative_fit(
+                    Xt_array[i],
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    n_iter=n_iter, **fit_params_array[i]
+                )

                 (
                     train_pred,
@@ -356,7 +377,8 @@
                 # Compute train loss of this fold and store it. train_loss could
                 # either be a scalar or a dict of scalars with metrics as keys.
                 train_loss = self._loss(
-                    self.Y_train_targets[train_indices],
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
                     train_pred,
                 )
                 train_losses[i] = train_loss
@@ -738,10 +760,15 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int]
         file_output = True if self.num_cv_folds == 1 else False

         if model.estimator_supports_iterative_fit():
-            Xt, fit_params = model.fit_transformer(self.X_train[train_indices],
-                                                   self.Y_train[train_indices])
+            Xt, fit_params = model.fit_transformer(
+                self.X_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.X_train[train_indices],
+                self.Y_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.Y_train[train_indices],
+            )

-            self.Y_train_targets[train_indices] = self.Y_train[train_indices]
+            self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices]

             iteration = 1
             total_n_iteration = 0
@@ -759,8 +786,12 @@
             ):
                 n_iter = int(2**iteration/2) if iteration > 1 else 2
                 total_n_iteration += n_iter
-                model.iterative_fit(Xt, self.Y_train[train_indices],
-                                    n_iter=n_iter, **fit_params)
+                model.iterative_fit(
+                    Xt,
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    n_iter=n_iter, **fit_params
+                )
                 (
                     Y_train_pred,
                     Y_optimization_pred,
@@ -775,7 +806,11 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int]
                 if add_model_to_self:
                     self.model = model

-                train_loss = self._loss(self.Y_train[train_indices], Y_train_pred)
+                train_loss = self._loss(
+                    self.Y_train.iloc[train_indices] if hasattr(
+                        self.Y_train, 'iloc') else self.Y_train[train_indices],
+                    Y_train_pred
+                )
                 loss = self._loss(self.Y_train[test_indices], Y_optimization_pred)
                 additional_run_info = model.get_additional_run_info()

@@ -814,7 +849,11 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int]
                 additional_run_info
             ) = self._partial_fit_and_predict_standard(fold, train_indices, test_indices,
                                                        add_model_to_self)
-            train_loss = self._loss(self.Y_train[train_indices], Y_train_pred)
+            train_loss = self._loss(
+                self.Y_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.Y_train[train_indices],
+                Y_train_pred
+            )
             loss = self._loss(self.Y_train[test_indices], Y_optimization_pred)
             if self.model.estimator_supports_iterative_fit():
                 model_max_iter = self.model.get_max_iter()
@@ -852,17 +891,21 @@ def _partial_fit_and_predict_standard(
         _fit_and_suppress_warnings(
             self.logger,
             model,
-            self.X_train[train_indices],
-            self.Y_train[train_indices],
+            self.X_train.iloc[train_indices] if hasattr(
+                self.X_train, 'iloc') else self.X_train[train_indices],
+            self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices],
         )

         if add_model_to_self:
             self.model = model
         else:
             self.models[fold] = model

-        self.Y_targets[fold] = self.Y_train[test_indices]
-        self.Y_train_targets[train_indices] = self.Y_train[train_indices]
+        self.Y_targets[fold] = self.Y_train.iloc[test_indices] if hasattr(
+            self.Y_train, 'iloc') else self.Y_train[test_indices]
+        self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(
+            self.Y_train, 'iloc') else self.Y_train[train_indices]

         train_pred, opt_pred, valid_pred, test_pred = self._predict(
             model=model,
@@ -893,7 +936,8 @@ def _partial_fit_and_predict_budget(
         model = self._get_model()
         self.indices[fold] = ((train_indices, test_indices))
         self.Y_targets[fold] = self.Y_train[test_indices]
-        self.Y_train_targets[train_indices] = self.Y_train[train_indices]
+        self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(
+            self.Y_train, 'iloc') else self.Y_train[train_indices],

         _fit_with_budget(
             X_train=self.X_train,
@@ -929,13 +973,21 @@ def _partial_fit_and_predict_budget(
     def _predict(self, model: BaseEstimator, test_indices: List[int],
                  train_indices: List[int]) -> Tuple[np.ndarray, np.ndarray,
                                                     np.ndarray, np.ndarray]:
-        train_pred = self.predict_function(self.X_train[train_indices],
-                                           model, self.task_type,
-                                           self.Y_train[train_indices])
+        train_pred = self.predict_function(
+            self.X_train.iloc[train_indices] if hasattr(
+                self.X_train, 'iloc') else self.X_train[train_indices],
+            model, self.task_type,
+            self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices]
+        )

-        opt_pred = self.predict_function(self.X_train[test_indices],
-                                         model, self.task_type,
-                                         self.Y_train[train_indices])
+        opt_pred = self.predict_function(
+            self.X_train.iloc[test_indices] if hasattr(
+                self.X_train, 'iloc') else self.X_train[test_indices],
+            model, self.task_type,
+            self.Y_train.iloc[train_indices] if hasattr(
+                self.Y_train, 'iloc') else self.Y_train[train_indices]
+        )

         if self.X_valid is not None:
             X_valid = self.X_valid.copy()
@@ -947,9 +999,12 @@ def _predict(self, model: BaseEstimator, test_indices: List[int],

         if self.X_test is not None:
             X_test = self.X_test.copy()
-            test_pred = self.predict_function(X_test, model,
-                                              self.task_type,
-                                              self.Y_train[train_indices])
+            test_pred = self.predict_function(
+                X_test, model,
+                self.task_type,
+                self.Y_train.iloc[train_indices] if hasattr(
+                    self.Y_train, 'iloc') else self.Y_train[train_indices]
+            )
         else:
             test_pred = None
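Every call site above repeats the same conditional. A possible consolidation (not part of this PR) would be a single helper:

```python
import numpy as np
import pandas as pd

def subset(data, indices):
    # Hypothetical helper: row-select from either a pandas object
    # (positional .iloc) or a numpy array (plain indexing).
    if hasattr(data, 'iloc'):
        return data.iloc[indices]
    return data[indices]

y_np = np.array([0, 1, 0, 1])
y_pd = pd.Series(y_np)
# identical rows come back for both container types
assert (subset(y_pd, [0, 2]).to_numpy() == subset(y_np, [0, 2])).all()
```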
4 changes: 3 additions & 1 deletion autosklearn/metalearning/metafeatures/metafeatures.py
@@ -1002,7 +1002,9 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger,
     # sparse matrices because of wrong sparse format)
     sparse = scipy.sparse.issparse(X)
     DPP = DataPreprocessor(
-        categorical_features=categorical, force_sparse_output=True)
+        feat_type={i: 'categorical' if feat else 'numerical'
+                   for i, feat in enumerate(categorical)},
+        force_sparse_output=True)
     X_transformed = DPP.fit_transform(X)
     categorical_transformed = [False] * X_transformed.shape[1]
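DataPreprocessor now takes the dict form, so the metafeature code converts its boolean mask inline. The conversion on its own (standalone illustration):

```python
categorical = [True, False, False]
feat_type = {i: 'categorical' if is_cat else 'numerical'
             for i, is_cat in enumerate(categorical)}
assert feat_type == {0: 'categorical', 1: 'numerical', 2: 'numerical'}
```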