Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify InputValidator: Allows pandas frame to directly reach the pipeline #1135

Merged
merged 18 commits on
Jun 25, 2021
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ repos:
args: [--show-error-codes]
name: mypy auto-sklearn-evaluation
files: autosklearn/evaluation
- id: mypy
args: [--show-error-codes]
name: mypy auto-sklearn-datapreprocessing
files: autosklearn/pipeline/components/data_preprocessing/
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.3
hooks:
Expand Down
6 changes: 2 additions & 4 deletions autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,10 +537,8 @@ def fit(
self._dataset_name = dataset_name
self._stopwatch.start_task(self._dataset_name)

if feat_type is None and self.InputValidator.feature_validator.feat_type:
self._feat_type = self.InputValidator.feature_validator.feat_type
elif feat_type is not None:
self._feat_type = feat_type
# Take the feature types from the validator
self._feat_type = self.InputValidator.feature_validator.feat_type

# Produce debug information to the logfile
self._logger.debug('Starting to print environment information')
Expand Down
35 changes: 3 additions & 32 deletions autosklearn/data/abstract_data_manager.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,12 @@
import abc
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, Union

import numpy as np

import scipy.sparse

from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \
import DataPreprocessor
from autosklearn.util.data import predict_RAM_usage


def perform_one_hot_encoding(
sparse: bool,
categorical: List[bool],
data: List
) -> Tuple[List, bool]:
predicted_RAM_usage = float(
predict_RAM_usage(data[0], categorical)) / 1024 / 1024

if predicted_RAM_usage > 1000:
sparse = True

rvals = []
if any(categorical):
encoder = DataPreprocessor(
categorical_features=categorical, force_sparse_output=sparse)
rvals.append(encoder.fit_transform(data[0]))
for d in data[1:]:
rvals.append(encoder.transform(d))

if not sparse and scipy.sparse.issparse(rvals[0]):
for i in range(len(rvals)):
rvals[i] = rvals[i].todense()
else:
rvals = data

return rvals, sparse


class AbstractDataManager():
Expand All @@ -60,11 +31,11 @@ def info(self) -> Dict[str, Any]:
return self._info

@property
def feat_type(self) -> List[str]:
def feat_type(self) -> Dict[Union[str, int], str]:
return self._feat_type

@feat_type.setter
def feat_type(self, value: List[str]) -> None:
def feat_type(self, value: Dict[Union[str, int], str]) -> None:
self._feat_type = value

@property
Expand Down
Loading