automl · mfeurer · Jun 25, 2021 · Apr 23, 2021 · Apr 23, 2021 · May 3, 2021
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,6 +22,10 @@ repos:
         args: [--show-error-codes]
         name: mypy auto-sklearn-evaluation
         files: autosklearn/evaluation
+      - id: mypy
+        args: [--show-error-codes]
+        name: mypy auto-sklearn-datapreprocessing
+        files: autosklearn/pipeline/components/data_preprocessing/
   - repo: https://gitlab.com/pycqa/flake8
     rev: 3.8.3
     hooks:

diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -537,10 +537,8 @@ def fit(
         self._dataset_name = dataset_name
         self._stopwatch.start_task(self._dataset_name)
 
-        if feat_type is None and self.InputValidator.feature_validator.feat_type:
-            self._feat_type = self.InputValidator.feature_validator.feat_type
-        elif feat_type is not None:
-            self._feat_type = feat_type
+        # Always take the feature types from the input validator
+        self._feat_type = self.InputValidator.feature_validator.feat_type
 
         # Produce debug information to the logfile
         self._logger.debug('Starting to print environment information')

diff --git a/autosklearn/data/abstract_data_manager.py b/autosklearn/data/abstract_data_manager.py
@@ -1,41 +1,12 @@
 import abc
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List
 
 import numpy as np
 
 import scipy.sparse
 
 from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \
     import DataPreprocessor
-from autosklearn.util.data import predict_RAM_usage
-
-
-def perform_one_hot_encoding(
-    sparse: bool,
-    categorical: List[bool],
-    data: List
-) -> Tuple[List, bool]:
-    predicted_RAM_usage = float(
-        predict_RAM_usage(data[0], categorical)) / 1024 / 1024
-
-    if predicted_RAM_usage > 1000:
-        sparse = True
-
-    rvals = []
-    if any(categorical):
-        encoder = DataPreprocessor(
-            categorical_features=categorical, force_sparse_output=sparse)
-        rvals.append(encoder.fit_transform(data[0]))
-        for d in data[1:]:
-            rvals.append(encoder.transform(d))
-
-        if not sparse and scipy.sparse.issparse(rvals[0]):
-            for i in range(len(rvals)):
-                rvals[i] = rvals[i].todense()
-    else:
-        rvals = data
-
-    return rvals, sparse
 
 
 class AbstractDataManager():