Simplify InputValidator: Allows pandas frame to directly reach the pipeline (#1135)

* [ADD] Move encoder to pipeline
* [Fix] Unit Tests
* [ADD] mypy support for preprocessing
* [FIX] unit test
* Feedback from PR
* [Fix] unit test
* [Fix] pre-commit
* Add better unit testing for anneal
* Fix unit testing
* Fix metalearning script
* Fix feat check
* Feedback from pr
* feat_type in testing
* sparse dataframe
* fix pandas landmarking meta-features
* Feedback from comments
* np.nan columns to category
* [Fix] Mypy

Co-authored-by: Matthias Feurer <feurerm@informatik.uni-freiburg.de>
automl · Jun 25, 2021 · 4a482de · 4a482de
1 parent 01e6e60
commit 4a482de
Show file tree

Hide file tree

Showing 52 changed files with 2,352 additions and 1,499 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,6 +22,10 @@ repos:
         args: [--show-error-codes]
         name: mypy auto-sklearn-evaluation
         files: autosklearn/evaluation
+      - id: mypy
+        args: [--show-error-codes]
+        name: mypy auto-sklearn-datapreprocessing
+        files: autosklearn/pipeline/components/data_preprocessing/
   - repo: https://gitlab.com/pycqa/flake8
     rev: 3.8.3
     hooks:

diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -533,10 +533,8 @@ def fit(
         self._dataset_name = dataset_name
         self._stopwatch.start_task(self._dataset_name)
 
-        if feat_type is None and self.InputValidator.feature_validator.feat_type:
-            self._feat_type = self.InputValidator.feature_validator.feat_type
-        elif feat_type is not None:
-            self._feat_type = feat_type
+        # Take the feature types from the validator
+        self._feat_type = self.InputValidator.feature_validator.feat_type
 
         # Produce debug information to the logfile
         self._logger.debug('Starting to print environment information')

diff --git a/autosklearn/data/abstract_data_manager.py b/autosklearn/data/abstract_data_manager.py
@@ -1,41 +1,12 @@
 import abc
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, Union
 
 import numpy as np
 
 import scipy.sparse
 
 from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \
     import DataPreprocessor
-from autosklearn.util.data import predict_RAM_usage
-
-
-def perform_one_hot_encoding(
-    sparse: bool,
-    categorical: List[bool],
-    data: List
-) -> Tuple[List, bool]:
-    predicted_RAM_usage = float(
-        predict_RAM_usage(data[0], categorical)) / 1024 / 1024
-
-    if predicted_RAM_usage > 1000:
-        sparse = True
-
-    rvals = []
-    if any(categorical):
-        encoder = DataPreprocessor(
-            categorical_features=categorical, force_sparse_output=sparse)
-        rvals.append(encoder.fit_transform(data[0]))
-        for d in data[1:]:
-            rvals.append(encoder.transform(d))
-
-        if not sparse and scipy.sparse.issparse(rvals[0]):
-            for i in range(len(rvals)):
-                rvals[i] = rvals[i].todense()
-    else:
-        rvals = data
-
-    return rvals, sparse
 
 
 class AbstractDataManager():
@@ -60,11 +31,11 @@ def info(self) -> Dict[str, Any]:
         return self._info
 
     @property
-    def feat_type(self) -> List[str]:
+    def feat_type(self) -> Dict[Union[str, int], str]:
         return self._feat_type
 
     @feat_type.setter
-    def feat_type(self, value: List[str]) -> None:
+    def feat_type(self, value: Dict[Union[str, int], str]) -> None:
         self._feat_type = value
 
     @property