diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 77e6c5c512..91b5bbdf54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,6 +22,10 @@ repos: args: [--show-error-codes] name: mypy auto-sklearn-evaluation files: autosklearn/evaluation + - id: mypy + args: [--show-error-codes] + name: mypy auto-sklearn-datapreprocessing + files: autosklearn/pipeline/components/data_preprocessing/ - repo: https://gitlab.com/pycqa/flake8 rev: 3.8.3 hooks: diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 636762b6ff..7d7939364a 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -537,10 +537,8 @@ def fit( self._dataset_name = dataset_name self._stopwatch.start_task(self._dataset_name) - if feat_type is None and self.InputValidator.feature_validator.feat_type: - self._feat_type = self.InputValidator.feature_validator.feat_type - elif feat_type is not None: - self._feat_type = feat_type + # Take the feature types from the validator + self._feat_type = self.InputValidator.feature_validator.feat_type # Produce debug information to the logfile self._logger.debug('Starting to print environment information') diff --git a/autosklearn/data/abstract_data_manager.py b/autosklearn/data/abstract_data_manager.py index c167bc95b2..a6a8445afb 100644 --- a/autosklearn/data/abstract_data_manager.py +++ b/autosklearn/data/abstract_data_manager.py @@ -1,5 +1,5 @@ import abc -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, Union import numpy as np @@ -7,35 +7,6 @@ from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \ import DataPreprocessor -from autosklearn.util.data import predict_RAM_usage - - -def perform_one_hot_encoding( - sparse: bool, - categorical: List[bool], - data: List -) -> Tuple[List, bool]: - predicted_RAM_usage = float( - predict_RAM_usage(data[0], categorical)) / 1024 / 1024 - - if predicted_RAM_usage > 1000: - sparse = True - - rvals = [] - if any(categorical): - encoder 
= DataPreprocessor( - categorical_features=categorical, force_sparse_output=sparse) - rvals.append(encoder.fit_transform(data[0])) - for d in data[1:]: - rvals.append(encoder.transform(d)) - - if not sparse and scipy.sparse.issparse(rvals[0]): - for i in range(len(rvals)): - rvals[i] = rvals[i].todense() - else: - rvals = data - - return rvals, sparse class AbstractDataManager(): @@ -60,11 +31,11 @@ def info(self) -> Dict[str, Any]: return self._info @property - def feat_type(self) -> List[str]: + def feat_type(self) -> Dict[Union[str, int], str]: return self._feat_type @feat_type.setter - def feat_type(self, value: List[str]) -> None: + def feat_type(self, value: Dict[Union[str, int], str]) -> None: self._feat_type = value @property diff --git a/autosklearn/data/feature_validator.py b/autosklearn/data/feature_validator.py index 1e291ffada..385e5a2c05 100644 --- a/autosklearn/data/feature_validator.py +++ b/autosklearn/data/feature_validator.py @@ -1,18 +1,14 @@ -import functools import logging import typing import numpy as np import pandas as pd -from pandas.api.types import is_numeric_dtype +from pandas.api.types import is_numeric_dtype, is_sparse import scipy.sparse -import sklearn.utils -from sklearn import preprocessing from sklearn.base import BaseEstimator -from sklearn.compose import make_column_transformer from sklearn.exceptions import NotFittedError from autosklearn.util.logging_ import PickableLoggerAdapter @@ -34,40 +30,50 @@ class FeatureValidator(BaseEstimator): """ - A class to pre-process features. In this regards, the format of the data is checked, - and if applicable, features are encoded + Checks the input data to Auto-Sklearn. + + It also determines what columns are categorical and which ones are numerical, + so that the pre-processing pipeline can process this columns accordingly. 
+ Attributes ---------- feat_type: typing.Optional[typing.List[str]] - In case the data is not a pandas DataFrame, this list indicates - which columns should be treated as categorical + In case the dataset is not a pandas DataFrame: + + If provided, this list indicates which columns should be treated as categorical + it is internally transformed into a dictionary that indicates a mapping from + column index to categorical/numerical + + If not provided, by default all columns are treated as numerical + If the input dataset is of type pandas dataframe, this argument + must be none, as the column type will be inferred from the pandas dtypes. + data_type: Class name of the data type provided during fit. - encoder: typing.Optional[BaseEstimator] - Host a encoder object if the data requires transformation (for example, - if provided a categorical column in a pandas DataFrame) - enc_columns: typing.List[str] - List of columns that where encoded """ def __init__(self, feat_type: typing.Optional[typing.List[str]] = None, logger: typing.Optional[PickableLoggerAdapter] = None, ) -> None: # If a dataframe was provided, we populate - # this attribute with the column types from the dataframe - # That is, this attribute contains whether autosklearn - # should treat a column as categorical or numerical - # During fit, if the user provided feat_types, the user - # constrain is honored. If not, this attribute is used. 
- self.feat_type = feat_type # type: typing.Optional[typing.List[str]] + # this attribute with a mapping from column to {numerical | categorical} + self.feat_type: typing.Optional[ + typing.Dict[typing.Union[str, int], str] + ] = None + if feat_type is not None: + if isinstance(feat_type, dict): + self.feat_type = feat_type + elif not isinstance(feat_type, list): + raise ValueError("Auto-Sklearn expects a list of categorical/" + "numerical feature types, yet a" + " {} was provided".format(type(feat_type))) + else: + + # Convert to a dictionary which will be passed to the ColumnTransformer + # Column Transformer supports strings or integer indexes + self.feat_type = {i: feat for i, feat in enumerate(feat_type)} # Register types to detect unsupported data format changes self.data_type = None # type: typing.Optional[type] - self.dtypes = [] # type: typing.List[str] - self.column_order = [] # type: typing.List[str] - - self.encoder = None # type: typing.Optional[BaseEstimator] - self.enc_columns = [] # type: typing.List[str] + self.dtypes = {} # type: typing.Dict[str, str] self.logger = logger if logger is not None else logging.getLogger(__name__) @@ -79,26 +85,28 @@ def fit( X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, ) -> BaseEstimator: """ - Validates and fit a categorical encoder (if needed) to the features. + Validates input data to Auto-Sklearn. The supported data types are List, numpy arrays and pandas DataFrames. 
CSR sparse data types are also supported Parameters ---------- - X_train: SUPPORTED_FEAT_TYPES - A set of features that are going to be validated (type and dimensionality - checks) and a encoder fitted in the case the data needs encoding - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] - A hold out set of data used for checking + X_train: SUPPORTED_FEAT_TYPES + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + X_test: typing.Optional[SUPPORTED_FEAT_TYPES] + A hold out set of data used for checking """ # If a list was provided, it will be converted to pandas if isinstance(X_train, list): X_train, X_test = self.list_to_dataframe(X_train, X_test) - # Register the user provided feature types - if self.feat_type is not None: - if hasattr(X_train, "iloc"): + self._check_data(X_train) + + # Handle categorical feature identification for the pipeline + if hasattr(X_train, "iloc"): + if self.feat_type is not None: raise ValueError("When providing a DataFrame to Auto-Sklearn, we extract " "the feature types from the DataFrame.dtypes. That is, " "providing the option feat_type to the fit method is not " @@ -108,20 +116,28 @@ def fit( "DataFrame can be seen in " "https://pandas.pydata.org/pandas-docs/stable/reference" "/api/pandas.DataFrame.astype.html") - # Some checks if self.feat_type is provided - if len(self.feat_type) != np.shape(X_train)[1]: - raise ValueError('Array feat_type does not have same number of ' - 'variables as X has features. %d vs %d.' 
% - (len(self.feat_type), np.shape(X_train)[1])) - if not all([isinstance(f, str) for f in self.feat_type]): - raise ValueError('Array feat_type must only contain strings.') - - for ft in self.feat_type: - if ft.lower() not in ['categorical', 'numerical']: - raise ValueError('Only `Categorical` and `Numerical` are ' - 'valid feature types, you passed `%s`' % ft) - - self._check_data(X_train) + else: + self.feat_type = self.get_feat_type_from_columns(X_train) + else: + # Numpy array was provided + if self.feat_type is None: + # Assume numerical columns if a numpy array has no feature types + self.feat_type = {i: 'numerical' for i in range(np.shape(X_train)[1])} + else: + # Check The feat type provided + if len(self.feat_type) != np.shape(X_train)[1]: + raise ValueError('Array feat_type does not have same number of ' + 'variables as X has features. %d vs %d.' % + (len(self.feat_type), np.shape(X_train)[1])) + if not all([isinstance(f, str) for f in self.feat_type.values()]): + raise ValueError("feat_type must only contain strings: {}".format( + list(self.feat_type.values()), + )) + + for ft in self.feat_type.values(): + if ft.lower() not in ['categorical', 'numerical']: + raise ValueError('Only `Categorical` and `Numerical` are ' + 'valid feature types, you passed `%s`' % ft) if X_test is not None: self._check_data(X_test) @@ -133,77 +149,10 @@ def fit( np.shape(X_test)[1] )) - # Fit on the training data - self._fit(X_train) - self._is_fitted = True return self - def _fit( - self, - X: SUPPORTED_FEAT_TYPES, - ) -> BaseEstimator: - """ - In case input data is a pandas DataFrame, this utility encodes the user provided - features (from categorical for example) to a numerical value that further stages - will be able to use - - Parameters - ---------- - X: SUPPORTED_FEAT_TYPES - A set of features that are going to be validated (type and dimensionality - checks) and a encoder fitted in the case the data needs encoding - """ - if hasattr(X, "iloc") and not 
scipy.sparse.issparse(X): - X = typing.cast(pd.DataFrame, X) - # Treat a column with all instances a NaN as numerical - # This will prevent doing encoding to a categorical column made completely - # out of nan values -- which will trigger a fail, as encoding is not supported - # with nan values. - # Columns that are completely made of NaN values are provided to the pipeline - # so that later stages decide how to handle them - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - - self.enc_columns, self.feat_type = self._get_columns_to_encode(X) - - if len(self.enc_columns) > 0: - - self.encoder = make_column_transformer( - (preprocessing.OrdinalEncoder( - handle_unknown='use_encoded_value', - unknown_value=-1, - ), self.enc_columns), - remainder="passthrough" - ) - - # Mypy redefinition - assert self.encoder is not None - self.encoder.fit(X) - - # The column transformer reoders the feature types - we therefore need to change - # it as well - def comparator(cmp1: str, cmp2: str) -> int: - if ( - cmp1 == 'categorical' and cmp2 == 'categorical' - or cmp1 == 'numerical' and cmp2 == 'numerical' - ): - return 0 - elif cmp1 == 'categorical' and cmp2 == 'numerical': - return -1 - elif cmp1 == 'numerical' and cmp2 == 'categorical': - return 1 - else: - raise ValueError((cmp1, cmp2)) - self.feat_type = sorted( - self.feat_type, - key=functools.cmp_to_key(comparator) - ) - return self - def transform( self, X: SUPPORTED_FEAT_TYPES, @@ -230,37 +179,20 @@ def transform( if isinstance(X, list): X, _ = self.list_to_dataframe(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): - X = typing.cast(pd.DataFrame, X) - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - # Check the data here so we catch problems on new test data self._check_data(X) - # Pandas related transformations - if hasattr(X, "iloc") and self.encoder is not None: - if 
np.any(pd.isnull(X)): - # After above check it means that if there is a NaN - # the whole column must be NaN - # Make sure it is numerical and let the pipeline handle it - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - X = self.encoder.transform(X) - # Sparse related transformations # Not all sparse format support index sorting - if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): - X.sort_indices() - - return sklearn.utils.check_array( - X, - force_all_finite=False, - accept_sparse='csr' - ) + if scipy.sparse.issparse(X): + if not isinstance(X, scipy.sparse.csr_matrix): + self.logger.warning(f"Sparse data provided is of type {type(X)} " + "yet Auto-Sklearn only support csr_matrix. Auto-sklearn " + "will convert the provided data to the csr_matrix format.") + X = X.tocsr(copy=False) + if hasattr(X, 'sort_indices'): + X.sort_indices() + return X def _check_data( self, @@ -276,6 +208,12 @@ def _check_data( checks) and a encoder fitted in the case the data needs encoding """ + # We consider columns that are all nan in a pandas frame as category + if hasattr(X, 'columns'): + for column in typing.cast(pd.DataFrame, X).columns: + if X[column].isna().all(): + X[column] = X[column].astype('category') + if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X): raise ValueError("Auto-sklearn only supports Numpy arrays, Pandas DataFrames," " scipy sparse and Python Lists, yet, the provided input is" @@ -285,6 +223,7 @@ def _check_data( if self.data_type is None: self.data_type = type(X) + if self.data_type != type(X): self.logger.warning("Auto-sklearn previously received features of type %s " "yet the current features have type %s. 
Changing the dtype " @@ -310,52 +249,28 @@ def _check_data( # If entered here, we have a pandas dataframe X = typing.cast(pd.DataFrame, X) - # Define the column to be encoded here as the feature validator is fitted once - # per estimator - enc_columns, _ = self._get_columns_to_encode(X) - - if len(enc_columns) > 0: - if np.any(pd.isnull( - X[enc_columns].dropna( # type: ignore[call-overload] - axis='columns', how='all') - )): - # Ignore all NaN columns, and if still a NaN - # Error out - raise ValueError("Categorical features in a dataframe cannot contain " - "missing/NaN values. The OrdinalEncoder used by " - "Auto-sklearn cannot handle this yet (due to a " - "limitation on scikit-learn being addressed via: " - "https://github.com/scikit-learn/scikit-learn/issues/17123)" - ) - column_order = [column for column in X.columns] - if len(self.column_order) > 0: - if self.column_order != column_order: - raise ValueError("Changing the column order of the features after fit() is " - "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format( - self.column_order, - column_order, - )) - else: - self.column_order = column_order - dtypes = [dtype.name for dtype in X.dtypes] + dtypes = {col: X[col].dtype.name.lower() for col in X.columns} if len(self.dtypes) > 0: if self.dtypes != dtypes: - raise ValueError("Changing the dtype of the features after fit() is " - "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format( - self.dtypes, - dtypes, - )) + # To support list, we need to support object inference. + # In extreme cases, the train column might be all integer, + # and the test column might be float. + self.logger.warning("Changing the dtype of the features after fit() is " + "not recommended. 
Fit() method was called with " + "{} whereas the new features have {} as type".format( + self.dtypes, + dtypes, + )) else: self.dtypes = dtypes - def _get_columns_to_encode( + def get_feat_type_from_columns( self, X: pd.DataFrame, - ) -> typing.Tuple[typing.List[str], typing.List[str]]: + ) -> typing.Dict[typing.Union[str, int], str]: """ - Return the columns to be encoded from a pandas dataframe + Returns a dictionary that maps pandas dataframe columns to a feature type. + This feature type can be categorical or numerical Parameters ---------- @@ -364,23 +279,21 @@ def _get_columns_to_encode( checks) and a encoder fitted in the case the data needs encoding Returns ------- - enc_columns: - Columns to encode, if any feat_type: - Type of each column numerical/categorical + dictionary with column to feature type mapping """ - # Register if a column needs encoding - enc_columns = [] # Also, register the feature types for the estimator - feat_type = [] + feat_type = {} # Make sure each column is a valid type for i, column in enumerate(X.columns): - if X[column].dtype.name in ['category', 'bool']: + if is_sparse(X[column]): + raise ValueError("Auto-sklearn does not yet support sparse pandas Series." 
+ f" Please convert {column} to a dense format.") + elif X[column].dtype.name in ['category', 'bool']: - enc_columns.append(column) - feat_type.append('categorical') + feat_type[column] = 'categorical' # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types elif not is_numeric_dtype(X[column]): @@ -419,8 +332,8 @@ def _get_columns_to_encode( ) ) else: - feat_type.append('numerical') - return enc_columns, feat_type + feat_type[column] = 'numerical' + return feat_type def list_to_dataframe( self, @@ -448,7 +361,28 @@ def list_to_dataframe( """ # If a list was provided, it will be converted to pandas - X_train = pd.DataFrame(data=X_train).infer_objects() + X_train = pd.DataFrame(data=X_train).convert_dtypes() + + # Store the dtypes and use in case of re-fit + if len(self.dtypes) == 0: + # Categorical data is inferred as string. Convert to categorical. + # Warn the user about dtypes or request him to use a dataframe + for col in X_train.columns: + if X_train[col].dtype.name == 'string': + X_train[col] = X_train[col].astype('category') + + self.dtypes = {col: X_train[col].dtype.name.lower() for col in X_train.columns} + else: + for col in X_train.columns: + # Try to convert to the original dtype used to fit the validator + # But also be robust to extreme cases (for example, the train data for a + # column was all np.int-like and the test data is np.float-type) + try: + X_train[col] = X_train[col].astype(self.dtypes[col]) + except Exception as e: + self.logger.warning(f"Failed to format column {col} as {self.dtypes[col]}: {e}") + self.dtypes[col] = X_train[col].dtype.name.lower() + self.logger.warning("The provided feature types to autosklearn are of type list." "Features have been interpreted as: {}".format( [(col, t) for col, t in zip(X_train.columns, X_train.dtypes)] @@ -459,5 +393,12 @@ def list_to_dataframe( "is {}. 
X_test will be casted as DataFrame.".format( type(X_test) )) - X_test = pd.DataFrame(data=X_test).infer_objects() + X_test = pd.DataFrame(data=X_test) + for col in X_test.columns: + try: + X_test[col] = X_test[col].astype(self.dtypes[col]) + except Exception as e: + self.logger.warning(f"Failed to format column {col} as {self.dtypes[col]}: {e}") + self.dtypes[col] = X_test[col].dtype.name.lower() + return X_train, X_test diff --git a/autosklearn/data/validation.py b/autosklearn/data/validation.py index f269c5ef8f..d429dec5a5 100644 --- a/autosklearn/data/validation.py +++ b/autosklearn/data/validation.py @@ -23,8 +23,13 @@ class InputValidator(BaseEstimator): Attributes ---------- feat_type: typing.Optional[typing.List[str]] - In case the data is not a pandas DataFrame, this list indicates - which columns should be treated as categorical + In case the dataset is not a pandas DataFrame: + + If provided, this list indicates which columns should be treated as categorical + it is internally transformed into a dictionary that indicates a mapping from + column index to categorical/numerical + + If not provided, by default all columns are treated as numerical + If the input dataset is of type pandas dataframe, this argument + must be none, as the column type will be inferred from the pandas dtypes. 
is_classification: bool For classification task, this flag indicates that the target data should be encoded diff --git a/autosklearn/data/xy_data_manager.py b/autosklearn/data/xy_data_manager.py index 7d83c4e07a..4c539157ee 100644 --- a/autosklearn/data/xy_data_manager.py +++ b/autosklearn/data/xy_data_manager.py @@ -1,8 +1,10 @@ # -*- encoding: utf-8 -*- -from typing import List, Optional +from typing import Dict, Optional, Union, cast import numpy as np +import pandas as pd + from scipy import sparse from autosklearn.constants import ( @@ -13,18 +15,22 @@ REGRESSION, ) from autosklearn.data.abstract_data_manager import AbstractDataManager +from autosklearn.data.validation import ( + SUPPORTED_FEAT_TYPES, + SUPPORTED_TARGET_TYPES, +) class XYDataManager(AbstractDataManager): def __init__( self, - X: np.ndarray, - y: np.ndarray, - X_test: Optional[np.ndarray], - y_test: Optional[np.ndarray], + X: SUPPORTED_FEAT_TYPES, + y: SUPPORTED_TARGET_TYPES, + X_test: Optional[SUPPORTED_FEAT_TYPES], + y_test: Optional[SUPPORTED_TARGET_TYPES], task: int, - feat_type: List[str], + feat_type: Dict[Union[str, int], str], dataset_name: str ): super(XYDataManager, self).__init__(dataset_name) @@ -32,17 +38,20 @@ def __init__( self.info['task'] = task if sparse.issparse(X): self.info['is_sparse'] = 1 - self.info['has_missing'] = np.all(np.isfinite(X.data)) + self.info['has_missing'] = np.all(np.isfinite(cast(sparse.csr_matrix, X).data)) else: self.info['is_sparse'] = 0 - self.info['has_missing'] = np.all(np.isfinite(X)) + if hasattr(X, 'iloc'): + self.info['has_missing'] = cast(pd.DataFrame, X).isnull().values.any() + else: + self.info['has_missing'] = np.all(np.isfinite(X)) label_num = { REGRESSION: 1, BINARY_CLASSIFICATION: 2, - MULTIOUTPUT_REGRESSION: y.shape[-1], + MULTIOUTPUT_REGRESSION: np.shape(y)[-1], MULTICLASS_CLASSIFICATION: len(np.unique(y)), - MULTILABEL_CLASSIFICATION: y.shape[-1] + MULTILABEL_CLASSIFICATION: np.shape(y)[-1] } self.info['label_num'] = label_num[task] @@ 
-54,28 +63,20 @@ def __init__( if y_test is not None: self.data['Y_test'] = y_test - if feat_type is not None: - for feat in feat_type: - allowed_types = ['numerical', 'categorical'] - if feat.lower() not in allowed_types: - raise ValueError("Entry '%s' in feat_type not in %s" % - (feat.lower(), str(allowed_types))) - - self.feat_type = feat_type + if isinstance(feat_type, dict): + self.feat_type = feat_type + else: + raise ValueError("Unsupported feat_type provided. We expect the user to " + "provide a Dict[str, str] mapping from column to categorical/ " + "numerical.") # TODO: try to guess task type! - if len(y.shape) > 2: + if len(np.shape(y)) > 2: raise ValueError('y must not have more than two dimensions, ' - 'but has %d.' % len(y.shape)) + 'but has %d.' % len(np.shape(y))) - if X.shape[0] != y.shape[0]: + if np.shape(X)[0] != np.shape(y)[0]: raise ValueError('X and y must have the same number of ' - 'datapoints, but have %d and %d.' % (X.shape[0], - y.shape[0])) - if self.feat_type is None: - self.feat_type = ['Numerical'] * X.shape[1] - if X.shape[1] != len(self.feat_type): - raise ValueError('X and feat_type must have the same number of columns, ' - 'but are %d and %d.' % - (X.shape[1], len(self.feat_type))) + 'datapoints, but have %d and %d.' 
% (np.shape(X)[0], + np.shape(y)[0])) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index e8530d8210..d675b8611d 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -565,6 +565,7 @@ def get_configuration_space( X_test: Optional[SUPPORTED_FEAT_TYPES] = None, y_test: Optional[SUPPORTED_TARGET_TYPES] = None, dataset_name: Optional[str] = None, + feat_type: Optional[List[str]] = None, ): """ Returns the Configuration Space object, from which Auto-Sklearn @@ -590,6 +591,7 @@ def get_configuration_space( X, y, X_test=X_test, y_test=y_test, dataset_name=dataset_name, + feat_type=feat_type, only_return_configuration_space=True, ) if self.automl_.configuration_space is None else self.automl_.configuration_space diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 50a9cd272e..e6778834c1 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -235,21 +235,11 @@ def __init__( self.model_class = autosklearn.pipeline.classification.SimpleClassificationPipeline self.predict_function = self._predict_proba - categorical_mask = [] - for feat in self.datamanager.feat_type: - if feat.lower() == 'numerical': - categorical_mask.append(False) - elif feat.lower() == 'categorical': - categorical_mask.append(True) - else: - raise ValueError(feat) - if np.sum(categorical_mask) > 0: - self._init_params = { - 'data_preprocessing:categorical_features': - categorical_mask - } - else: - self._init_params = {} + self._init_params = { + 'data_preprocessing:feat_type': + self.datamanager.feat_type + } + if init_params is not None: self._init_params.update(init_params) diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index b253516085..7ddb874e53 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -8,6 +8,7 @@ from ConfigSpace import 
Configuration import numpy as np + from smac.tae import TAEAbortException, StatusType from sklearn.base import BaseEstimator @@ -27,6 +28,11 @@ REGRESSION_TASKS, MULTIOUTPUT_REGRESSION ) +from autosklearn.data.validation import ( + SUPPORTED_FEAT_TYPES, + SUPPORTED_TARGET_TYPES, + ) +from autosklearn.pipeline.base import PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import IterativeComponent from autosklearn.metrics import Scorer from autosklearn.util.backend import Backend @@ -69,7 +75,7 @@ } -def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: +def _get_y_array(y: SUPPORTED_TARGET_TYPES, task_type: int) -> SUPPORTED_TARGET_TYPES: if task_type in CLASSIFICATION_TASKS and task_type != \ MULTILABEL_CLASSIFICATION: return y.ravel() @@ -81,7 +87,7 @@ def subsample_indices( train_indices: List[int], subsample: Optional[float], task_type: int, - Y_train: np.ndarray + Y_train: SUPPORTED_TARGET_TYPES ) -> List[int]: if not isinstance(subsample, float): @@ -100,7 +106,10 @@ def subsample_indices( # required to subsample because otherwise scikit-learn will complain if task_type in CLASSIFICATION_TASKS and task_type != MULTILABEL_CLASSIFICATION: - stratify = Y_train[train_indices] + stratify: Optional[ + SUPPORTED_TARGET_TYPES + ] = Y_train.iloc[train_indices] if hasattr( + Y_train, 'iloc') else Y_train[train_indices] else: stratify = None @@ -119,8 +128,8 @@ def subsample_indices( def _fit_with_budget( - X_train: np.ndarray, - Y_train: np.ndarray, + X_train: SUPPORTED_FEAT_TYPES, + Y_train: SUPPORTED_TARGET_TYPES, budget: float, budget_type: Optional[str], logger: Union[logging.Logger, PicklableClientLogger], @@ -134,18 +143,25 @@ def _fit_with_budget( ): if model.estimator_supports_iterative_fit(): budget_factor = model.get_max_iter() - Xt, fit_params = model.fit_transformer(X_train[train_indices], - Y_train[train_indices]) + Xt, fit_params = model.fit_transformer( + X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else 
X_train[train_indices], + Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices], + ) n_iter = int(np.ceil(budget / 100 * budget_factor)) - model.iterative_fit(Xt, Y_train[train_indices], n_iter=n_iter, refit=True, - **fit_params) + model.iterative_fit( + Xt, + Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices], + n_iter=n_iter, + refit=True, + **fit_params + ) else: _fit_and_suppress_warnings( logger, model, - X_train[train_indices], - Y_train[train_indices], + X_train.iloc[train_indices] if hasattr(X_train, 'iloc') else X_train[train_indices], + Y_train.iloc[train_indices] if hasattr(Y_train, 'iloc') else Y_train[train_indices], ) elif ( @@ -221,7 +237,7 @@ def __init__( ) self.X_train = self.datamanager.data['X_train'] self.Y_train = self.datamanager.data['Y_train'] - self.Y_optimization: Optional[Union[List, np.ndarray]] = None + self.Y_optimization: Optional[SUPPORTED_TARGET_TYPES] = None self.Y_targets = [None] * self.num_cv_folds self.Y_train_targets = np.ones(self.Y_train.shape) * np.NaN self.models = [None] * self.num_cv_folds @@ -322,19 +338,27 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: if iterations[i] == 1: self.Y_train_targets[train_indices] = \ - self.Y_train[train_indices] + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices] self.Y_targets[i] = self.Y_train[test_indices] Xt, fit_params = model.fit_transformer( - self.X_train[train_indices], - self.Y_train[train_indices]) + self.X_train.iloc[train_indices] if hasattr( + self.X_train, 'iloc') else self.X_train[train_indices], + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], + ) Xt_array[i] = Xt fit_params_array[i] = fit_params n_iter = int(2 ** iterations[i] / 2) if iterations[i] > 1 else 2 total_n_iterations[i] = total_n_iterations[i] + n_iter - model.iterative_fit(Xt_array[i], self.Y_train[train_indices], - 
n_iter=n_iter, **fit_params_array[i]) + model.iterative_fit( + Xt_array[i], + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], + n_iter=n_iter, **fit_params_array[i] + ) ( train_pred, @@ -356,7 +380,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # Compute train loss of this fold and store it. train_loss could # either be a scalar or a dict of scalars with metrics as keys. train_loss = self._loss( - self.Y_train_targets[train_indices], + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], train_pred, ) train_losses[i] = train_loss @@ -738,10 +763,15 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] file_output = True if self.num_cv_folds == 1 else False if model.estimator_supports_iterative_fit(): - Xt, fit_params = model.fit_transformer(self.X_train[train_indices], - self.Y_train[train_indices]) + Xt, fit_params = model.fit_transformer( + self.X_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.X_train[train_indices], + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], + ) - self.Y_train_targets[train_indices] = self.Y_train[train_indices] + self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices] iteration = 1 total_n_iteration = 0 @@ -759,8 +789,12 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] ): n_iter = int(2**iteration/2) if iteration > 1 else 2 total_n_iteration += n_iter - model.iterative_fit(Xt, self.Y_train[train_indices], - n_iter=n_iter, **fit_params) + model.iterative_fit( + Xt, + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], + n_iter=n_iter, **fit_params + ) ( Y_train_pred, Y_optimization_pred, @@ -775,7 +809,11 @@ def 
_partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] if add_model_to_self: self.model = model - train_loss = self._loss(self.Y_train[train_indices], Y_train_pred) + train_loss = self._loss( + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], + Y_train_pred + ) loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) additional_run_info = model.get_additional_run_info() @@ -814,7 +852,11 @@ def _partial_fit_and_predict_iterative(self, fold: int, train_indices: List[int] additional_run_info ) = self._partial_fit_and_predict_standard(fold, train_indices, test_indices, add_model_to_self) - train_loss = self._loss(self.Y_train[train_indices], Y_train_pred) + train_loss = self._loss( + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], + Y_train_pred + ) loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) if self.model.estimator_supports_iterative_fit(): model_max_iter = self.model.get_max_iter() @@ -843,8 +885,11 @@ def _partial_fit_and_predict_standard( fold: int, train_indices: List[int], test_indices: List[int], add_model_to_self: bool = False - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, - Dict[str, Union[str, int, float, Dict, List, Tuple]]]: + ) -> Tuple[PIPELINE_DATA_DTYPE, # train_pred + PIPELINE_DATA_DTYPE, # opt_pred + PIPELINE_DATA_DTYPE, # valid_pred + PIPELINE_DATA_DTYPE, # test_pred + TYPE_ADDITIONAL_INFO]: model = self._get_model() self.indices[fold] = ((train_indices, test_indices)) @@ -852,8 +897,10 @@ def _partial_fit_and_predict_standard( _fit_and_suppress_warnings( self.logger, model, - self.X_train[train_indices], - self.Y_train[train_indices], + self.X_train.iloc[train_indices] if hasattr( + self.X_train, 'iloc') else self.X_train[train_indices], + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], ) if add_model_to_self: @@ -861,8 
+908,10 @@ def _partial_fit_and_predict_standard( else: self.models[fold] = model - self.Y_targets[fold] = self.Y_train[test_indices] - self.Y_train_targets[train_indices] = self.Y_train[train_indices] + self.Y_targets[fold] = self.Y_train.iloc[test_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[test_indices] + self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices] train_pred, opt_pred, valid_pred, test_pred = self._predict( model=model, @@ -883,8 +932,11 @@ def _partial_fit_and_predict_budget( fold: int, train_indices: List[int], test_indices: List[int], add_model_to_self: bool = False, - ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, - Dict[str, Union[str, int, float, Dict, List, Tuple]]]: + ) -> Tuple[PIPELINE_DATA_DTYPE, # train_pred + PIPELINE_DATA_DTYPE, # opt_pred + PIPELINE_DATA_DTYPE, # valid_pred + PIPELINE_DATA_DTYPE, # test_pred + TYPE_ADDITIONAL_INFO]: # This function is only called in the event budget is not None # Add this statement for mypy @@ -893,7 +945,8 @@ def _partial_fit_and_predict_budget( model = self._get_model() self.indices[fold] = ((train_indices, test_indices)) self.Y_targets[fold] = self.Y_train[test_indices] - self.Y_train_targets[train_indices] = self.Y_train[train_indices] + self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices], _fit_with_budget( X_train=self.X_train, @@ -927,15 +980,25 @@ def _partial_fit_and_predict_budget( ) def _predict(self, model: BaseEstimator, test_indices: List[int], - train_indices: List[int]) -> Tuple[np.ndarray, np.ndarray, - np.ndarray, np.ndarray]: - train_pred = self.predict_function(self.X_train[train_indices], - model, self.task_type, - self.Y_train[train_indices]) + train_indices: List[int]) -> Tuple[PIPELINE_DATA_DTYPE, + PIPELINE_DATA_DTYPE, + PIPELINE_DATA_DTYPE, + PIPELINE_DATA_DTYPE]: + train_pred 
= self.predict_function( + self.X_train.iloc[train_indices] if hasattr( + self.X_train, 'iloc') else self.X_train[train_indices], + model, self.task_type, + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices] + ) - opt_pred = self.predict_function(self.X_train[test_indices], - model, self.task_type, - self.Y_train[train_indices]) + opt_pred = self.predict_function( + self.X_train.iloc[test_indices] if hasattr( + self.X_train, 'iloc') else self.X_train[test_indices], + model, self.task_type, + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices] + ) if self.X_valid is not None: X_valid = self.X_valid.copy() @@ -947,9 +1010,12 @@ def _predict(self, model: BaseEstimator, test_indices: List[int], if self.X_test is not None: X_test = self.X_test.copy() - test_pred = self.predict_function(X_test, model, - self.task_type, - self.Y_train[train_indices]) + test_pred = self.predict_function( + X_test, model, + self.task_type, + self.Y_train.iloc[train_indices] if hasattr( + self.Y_train, 'iloc') else self.Y_train[train_indices] + ) else: test_pred = None diff --git a/autosklearn/metalearning/metafeatures/metafeatures.py b/autosklearn/metalearning/metafeatures/metafeatures.py index 843bb9ecbb..7ad663f0be 100644 --- a/autosklearn/metalearning/metafeatures/metafeatures.py +++ b/autosklearn/metalearning/metafeatures/metafeatures.py @@ -2,12 +2,17 @@ import copy import numpy as np + +import pandas as pd + import scipy.stats from scipy.linalg import LinAlgError import scipy.sparse + # TODO use balanced accuracy! 
-from sklearn.utils import check_array from sklearn.multiclass import OneVsRestClassifier +from sklearn.utils import check_array +from sklearn.utils.multiclass import type_of_target from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \ import DataPreprocessor @@ -147,6 +152,10 @@ class NumberOfClasses(MetaFeature): does this for each label seperately and returns the mean. """ def _calculate(self, X, y, logger, categorical): + if type_of_target(y) == 'multilabel-indicator': + # We have a label binary indicator array: + # each sample is one row of a 2d array of shape (n_samples, n_classes) + return y.shape[1] if len(y.shape) == 2: return np.mean([len(np.unique(y[:, i])) for i in range(y.shape[1])]) else: @@ -169,7 +178,7 @@ def _calculate(self, X, y, logger, categorical): @helper_functions.define("MissingValues") class MissingValues(HelperFunction): def _calculate(self, X, y, logger, categorical): - missing = ~np.isfinite(X) + missing = pd.isna(X) return missing def _calculate_sparse(self, X, y, logger, categorical): @@ -235,7 +244,10 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("NumberOfMissingValues", dependency="MissingValues") class NumberOfMissingValues(MetaFeature): def _calculate(self, X, y, logger, categorical): - return float(helper_functions.get_value("MissingValues").sum()) + if scipy.sparse.issparse(X): + return float(helper_functions.get_value("MissingValues").sum()) + else: + return float(np.count_nonzero(helper_functions.get_value("MissingValues"))) @metafeatures.define("PercentageOfMissingValues", @@ -250,13 +262,13 @@ def _calculate(self, X, y, logger, categorical): @metafeatures.define("NumberOfNumericFeatures") class NumberOfNumericFeatures(MetaFeature): def _calculate(self, X, y, logger, categorical): - return len(categorical) - np.sum(categorical) + return len(categorical) - np.sum(list(categorical.values())) @metafeatures.define("NumberOfCategoricalFeatures") class 
NumberOfCategoricalFeatures(MetaFeature): def _calculate(self, X, y, logger, categorical): - return np.sum(categorical) + return np.sum(list(categorical.values())) @metafeatures.define("RatioNumericalToNominal") @@ -411,10 +423,12 @@ def _calculate(self, X, y, logger, categorical): class NumSymbols(HelperFunction): def _calculate(self, X, y, logger, categorical): symbols_per_column = [] - for i, column in enumerate(X.T): - if categorical[i]: - unique_values = np.unique(column) - num_unique = np.sum(np.isfinite(unique_values)) + for i in range(X.shape[1]): + if categorical[X.columns[i] if hasattr(X, 'columns') else i]: + column = X.iloc[:, i] if hasattr(X, 'iloc') else X[:, i] + unique_values = column.unique() if hasattr( + column, 'unique') else np.unique(column) + num_unique = np.sum(pd.notna(unique_values)) symbols_per_column.append(num_unique) return symbols_per_column @@ -422,7 +436,7 @@ def _calculate_sparse(self, X, y, logger, categorical): symbols_per_column = [] new_X = X.tocsc() for i in range(new_X.shape[1]): - if categorical[i]: + if categorical[X.columns[i] if hasattr(X, 'columns') else i]: unique_values = np.unique(new_X.getcol(i).data) num_unique = np.sum(np.isfinite(unique_values)) symbols_per_column.append(num_unique) @@ -488,15 +502,17 @@ class Kurtosisses(HelperFunction): def _calculate(self, X, y, logger, categorical): kurts = [] for i in range(X.shape[1]): - if not categorical[i]: - kurts.append(scipy.stats.kurtosis(X[:, i])) + if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: + kurts.append(scipy.stats.kurtosis( + X.iloc[:, i] if hasattr(X, 'iloc') else X[:, i] + )) return kurts def _calculate_sparse(self, X, y, logger, categorical): kurts = [] X_new = X.tocsc() for i in range(X_new.shape[1]): - if not categorical[i]: + if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: start = X_new.indptr[i] stop = X_new.indptr[i+1] kurts.append(scipy.stats.kurtosis(X_new.data[start:stop])) @@ -540,15 +556,17 @@ class 
Skewnesses(HelperFunction): def _calculate(self, X, y, logger, categorical): skews = [] for i in range(X.shape[1]): - if not categorical[i]: - skews.append(scipy.stats.skew(X[:, i])) + if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: + skews.append(scipy.stats.skew( + X.iloc[:, i] if hasattr(X, 'iloc') else X[:, i] + )) return skews def _calculate_sparse(self, X, y, logger, categorical): skews = [] X_new = X.tocsc() for i in range(X_new.shape[1]): - if not categorical[i]: + if not categorical[X.columns[i] if hasattr(X, 'columns') else i]: start = X_new.indptr[i] stop = X_new.indptr[i + 1] skews.append(scipy.stats.skew(X_new.data[start:stop])) @@ -604,13 +622,11 @@ def cancor2(X, y): class ClassEntropy(MetaFeature): def _calculate(self, X, y, logger, categorical): labels = 1 if len(y.shape) == 1 else y.shape[1] - if labels == 1: - y = y.reshape((-1, 1)) entropies = [] for i in range(labels): occurence_dict = defaultdict(float) - for value in y[:, i]: + for value in y if labels == 1 else y[:, i]: occurence_dict[value] += 1 entropies.append(scipy.stats.entropy([occurence_dict[key] for key in occurence_dict], base=2)) @@ -664,13 +680,24 @@ def _calculate(self, X, y, logger, categorical): lda = sklearn.discriminant_analysis.LinearDiscriminantAnalysis() if len(y.shape) == 1 or y.shape[1] == 1: - lda.fit(X[train], y[train]) + lda.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) else: lda = OneVsRestClassifier(lda) - lda.fit(X[train], y[train]) - - predictions = lda.predict(X[test]) - accuracy += sklearn.metrics.accuracy_score(predictions, y[test]) + lda.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) + + predictions = lda.predict( + X.iloc[test] if hasattr(X, 'iloc') else X[test], + ) + accuracy += sklearn.metrics.accuracy_score( + predictions, + y.iloc[test] if hasattr(y, 'iloc') else y[test], + ) return accuracy / 5 except 
scipy.linalg.LinAlgError as e: self.logger.warning("LDA failed: %s Returned 0 instead!" % e) @@ -699,13 +726,24 @@ def _calculate(self, X, y, logger, categorical): nb = sklearn.naive_bayes.GaussianNB() if len(y.shape) == 1 or y.shape[1] == 1: - nb.fit(X[train], y[train]) + nb.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) else: nb = OneVsRestClassifier(nb) - nb.fit(X[train], y[train]) - - predictions = nb.predict(X[test]) - accuracy += sklearn.metrics.accuracy_score(predictions, y[test]) + nb.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) + + predictions = nb.predict( + X.iloc[test] if hasattr(X, 'iloc') else X[test], + ) + accuracy += sklearn.metrics.accuracy_score( + predictions, + y.iloc[test] if hasattr(y, 'iloc') else y[test], + ) return accuracy / 5 def _calculate_sparse(self, X, y, logger, categorical): @@ -729,13 +767,24 @@ def _calculate(self, X, y, logger, categorical): tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state) if len(y.shape) == 1 or y.shape[1] == 1: - tree.fit(X[train], y[train]) + tree.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) else: tree = OneVsRestClassifier(tree) - tree.fit(X[train], y[train]) - - predictions = tree.predict(X[test]) - accuracy += sklearn.metrics.accuracy_score(predictions, y[test]) + tree.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) + + predictions = tree.predict( + X.iloc[test] if hasattr(X, 'iloc') else X[test], + ) + accuracy += sklearn.metrics.accuracy_score( + predictions, + y.iloc[test] if hasattr(y, 'iloc') else y[test], + ) return accuracy / 5 def _calculate_sparse(self, X, y, logger, categorical): @@ -766,12 +815,23 @@ def _calculate(self, X, y, logger, categorical): criterion="entropy", max_depth=1, 
random_state=random_state, min_samples_split=2, min_samples_leaf=1, max_features=None) if len(y.shape) == 1 or y.shape[1] == 1: - node.fit(X[train], y[train]) + node.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) else: node = OneVsRestClassifier(node) - node.fit(X[train], y[train]) - predictions = node.predict(X[test]) - accuracy += sklearn.metrics.accuracy_score(predictions, y[test]) + node.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) + predictions = node.predict( + X.iloc[test] if hasattr(X, 'iloc') else X[test], + ) + accuracy += sklearn.metrics.accuracy_score( + predictions, + y.iloc[test] if hasattr(y, 'iloc') else y[test], + ) return accuracy / 5 def _calculate_sparse(self, X, y, logger, categorical): @@ -794,9 +854,17 @@ def _calculate(self, X, y, logger, categorical): node = sklearn.tree.DecisionTreeClassifier( criterion="entropy", max_depth=1, random_state=random_state, min_samples_split=2, min_samples_leaf=1, max_features=1) - node.fit(X[train], y[train]) - predictions = node.predict(X[test]) - accuracy += sklearn.metrics.accuracy_score(predictions, y[test]) + node.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) + predictions = node.predict( + X.iloc[test] if hasattr(X, 'iloc') else X[test], + ) + accuracy += sklearn.metrics.accuracy_score( + predictions, + y.iloc[test] if hasattr(y, 'iloc') else y[test], + ) return accuracy / 5 def _calculate_sparse(self, X, y, logger, categorical): @@ -844,12 +912,23 @@ def _calculate(self, X, y, logger, categorical): for train, test in kf.split(X, y): kNN = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1) if len(y.shape) == 1 or y.shape[1] == 1: - kNN.fit(X[train], y[train]) + kNN.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) else: kNN = 
OneVsRestClassifier(kNN) - kNN.fit(X[train], y[train]) - predictions = kNN.predict(X[test]) - accuracy += sklearn.metrics.accuracy_score(predictions, y[test]) + kNN.fit( + X.iloc[train] if hasattr(X, 'iloc') else X[train], + y.iloc[train] if hasattr(y, 'iloc') else y[train], + ) + predictions = kNN.predict( + X.iloc[test] if hasattr(X, 'iloc') else X[test], + ) + accuracy += sklearn.metrics.accuracy_score( + predictions, + y.iloc[test] if hasattr(y, 'iloc') else y[test], + ) return accuracy / 5 @@ -872,7 +951,9 @@ def _calculate(self, X, y, logger, categorical): for i in range(10): try: rs.shuffle(indices) - pca.fit(X[indices]) + pca.fit( + X.iloc[indices] if hasattr(X, 'iloc') else X[indices], + ) return pca except LinAlgError: pass @@ -1002,9 +1083,13 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, # sparse matrices because of wrong sparse format) sparse = scipy.sparse.issparse(X) DPP = DataPreprocessor( - categorical_features=categorical, force_sparse_output=True) + # The difference between feat_type and categorical, is that + # categorical has True/False instead of categorical/numerical + feat_type={key: 'categorical' if value else 'numerical' + for key, value in categorical.items()}, + force_sparse_output=True) X_transformed = DPP.fit_transform(X) - categorical_transformed = [False] * X_transformed.shape[1] + categorical_transformed = {i: False for i in range(X_transformed.shape[1])} # Densify the transformed matrix if not sparse and scipy.sparse.issparse(X_transformed): @@ -1052,7 +1137,8 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger, dependency): logger.info("%s: Going to calculate: %s", dataset_name, dependency) - value = helper_functions[dependency](X_, y_, categorical_) + value = helper_functions[dependency]( + X_, y_, categorical=categorical_, logger=logger) helper_functions.set_value(dependency, value) mf_[dependency] = value diff --git a/autosklearn/pipeline/base.py b/autosklearn/pipeline/base.py 
index 3eef6ce7b5..9da737a17d 100644 --- a/autosklearn/pipeline/base.py +++ b/autosklearn/pipeline/base.py @@ -1,13 +1,30 @@ from abc import ABCMeta +from typing import Dict, Union -import numpy as np from ConfigSpace import Configuration + +import numpy as np + +import scipy.sparse + from sklearn.pipeline import Pipeline from sklearn.utils.validation import check_random_state from .components.base import AutoSklearnChoice, AutoSklearnComponent import autosklearn.pipeline.create_searchspace_util +DATASET_PROPERTIES_TYPE = Dict[str, Union[str, int, bool]] +PIPELINE_DATA_DTYPE = Union[ + np.ndarray, + scipy.sparse.bsr_matrix, + scipy.sparse.coo_matrix, + scipy.sparse.csc_matrix, + scipy.sparse.csr_matrix, + scipy.sparse.dia_matrix, + scipy.sparse.dok_matrix, + scipy.sparse.lil_matrix, +] + class BasePipeline(Pipeline): """Base class for all pipeline objects. diff --git a/autosklearn/pipeline/components/base.py b/autosklearn/pipeline/components/base.py index 0e16b2a8bd..dd7e079f85 100644 --- a/autosklearn/pipeline/components/base.py +++ b/autosklearn/pipeline/components/base.py @@ -4,7 +4,7 @@ import pkgutil import sys -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import check_random_state from autosklearn.pipeline.constants import SPARSE @@ -235,7 +235,7 @@ def get_estimator(self): return self.estimator -class AutoSklearnPreprocessingAlgorithm(AutoSklearnComponent): +class AutoSklearnPreprocessingAlgorithm(TransformerMixin, AutoSklearnComponent): """Provide an abstract interface for preprocessing algorithms in auto-sklearn. 
diff --git a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py index 191ab9d44b..fca1187fd3 100644 --- a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py +++ b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py @@ -1,26 +1,35 @@ +from typing import Any, List, Dict, Optional, Tuple, Union + import numpy as np from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from sklearn.base import BaseEstimator + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import \ AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, SIGNED_DATA, INPUT class Balancing(AutoSklearnPreprocessingAlgorithm): - def __init__(self, strategy='none', random_state=None): + def __init__(self, strategy: str = 'none', + random_state: Optional[np.random.RandomState] = None,): self.strategy = strategy self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'Balancing': self.fitted_ = True return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X - def get_weights(self, Y, classifier, preprocessor, init_params, fit_params): + def get_weights(self, Y: PIPELINE_DATA_DTYPE, + classifier: BaseEstimator, preprocessor: BaseEstimator, + init_params: Optional[Dict[str, Any]], fit_params: Optional[Dict[str, Any]], + ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: if init_params is None: init_params = {} @@ -35,7 +44,7 @@ def get_weights(self, Y, classifier, preprocessor, init_params, fit_params): # are used together with warmstarts clf_ = ['adaboost', 'random_forest', 'extra_trees', 'sgd', 
'passive_aggressive', 'gradient_boosting'] - pre_ = [] + pre_: List[str] = [] if classifier in clf_ or preprocessor in pre_: if len(Y.shape) > 1: offsets = [2 ** i for i in range(Y.shape[1])] @@ -88,7 +97,8 @@ def get_weights(self, Y, classifier, preprocessor, init_params, fit_params): return init_params, fit_params @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'Balancing', 'name': 'Balancing Imbalanced Class Distributions', 'handles_missing_values': True, @@ -109,7 +119,8 @@ def get_properties(dataset_properties=None): 'preferred_dtype': None} @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: # TODO add replace by zero! strategy = CategoricalHyperparameter( "strategy", ["none", "weighting"], default_value="none") diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py index 44daf9f88b..330392964c 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/__init__.py @@ -1,10 +1,19 @@ from collections import OrderedDict import os -from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice + +from typing import Any, Dict, Optional + +from ConfigSpace import Configuration from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from sklearn.base import BaseEstimator + +from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ + ThirdPartyComponents, 
AutoSklearnChoice + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE + ohe_directory = os.path.split(__file__)[0] _ohes = find_components(__package__, ohe_directory, @@ -12,23 +21,26 @@ _addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -def add_ohe(ohe): +def add_ohe(ohe: 'OHEChoice') -> None: _addons.add_component(ohe) class OHEChoice(AutoSklearnChoice): @classmethod - def get_components(cls): - components = OrderedDict() + def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: + components: Dict[str, BaseEstimator] = OrderedDict() components.update(_ohes) components.update(_addons.components) return components - def get_hyperparameter_search_space(self, dataset_properties=None, - default=None, - include=None, - exclude=None): + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + default: Optional[str] = None, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() if dataset_properties is None: @@ -67,7 +79,9 @@ def get_hyperparameter_search_space(self, dataset_properties=None, self.dataset_properties = dataset_properties return cs - def set_hyperparameters(self, configuration, init_params=None): + def set_hyperparameters(self, configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'OHEChoice': new_params = {} params = configuration.get_dictionary() @@ -95,5 +109,5 @@ def set_hyperparameters(self, configuration, init_params=None): return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.choice.transform(X) diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py new file mode 100644 index 0000000000..5aa1f7304a --- /dev/null +++ 
b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/encoding.py @@ -0,0 +1,64 @@ +from typing import Dict, Optional, Tuple, Union + +import numpy as np + +from ConfigSpace.configuration_space import ConfigurationSpace + +import scipy.sparse + +from sklearn.preprocessing import OrdinalEncoder + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm +from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT + + +class OrdinalEncoding(AutoSklearnPreprocessingAlgorithm): + def __init__(self, + random_state: Optional[np.random.RandomState] = None, + ): + self.random_state = random_state + + def fit(self, X: PIPELINE_DATA_DTYPE, + y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'OrdinalEncoding': + if not scipy.sparse.issparse(X): + self.preprocessor = OrdinalEncoder( + categories='auto', handle_unknown='use_encoded_value', unknown_value=-1, + ) + self.preprocessor.fit(X, y) + return self + + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: + if scipy.sparse.issparse(X): + # Sparse data should be float dtype, which means we do not need + # to further encode it. + return X + if self.preprocessor is None: + raise NotImplementedError() + # Notice we are shifting the unseen categories during fit to 1 + # from -1, 0, ... 
to 0,..., cat + 1 + # This is done because Category shift requires non negative integers + # Consider removing this if that step is removed + return self.preprocessor.transform(X) + 1 + + @staticmethod + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: + return {'shortname': 'OrdinalEncoder', + 'name': 'Ordinal Encoder', + 'handles_regression': True, + 'handles_classification': True, + 'handles_multiclass': True, + 'handles_multilabel': True, + 'handles_multioutput': True, + # TODO find out of this is right! + 'handles_sparse': True, + 'handles_dense': True, + 'input': (DENSE, SPARSE, UNSIGNED_DATA), + 'output': (INPUT,), } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: + return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py index 261a0e7173..079b25d4db 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/no_encoding.py @@ -1,23 +1,30 @@ +from typing import Dict, Optional, Tuple, Union +import numpy as np + from ConfigSpace.configuration_space import ConfigurationSpace + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import \ AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT class NoEncoding(AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state=None): + def __init__(self, random_state: Optional[np.random.RandomState] = None): pass - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) 
-> 'NoEncoding': self.preprocessor = 'passthrough' self.fitted_ = True return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'no encoding', 'name': 'No categorical variable encoding', 'handles_regression': True, @@ -31,6 +38,7 @@ def get_properties(dataset_properties=None): 'output': (INPUT,)} @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py index 8f5aa67c29..a831dbcc1b 100644 --- a/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py +++ b/autosklearn/pipeline/components/data_preprocessing/categorical_encoding/one_hot_encoding.py @@ -1,19 +1,25 @@ +from typing import Dict, Optional, Tuple, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + import scipy.sparse from sklearn.preprocessing import OneHotEncoder as DenseOneHotEncoder -from ConfigSpace.configuration_space import ConfigurationSpace +import numpy as np +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT class OneHotEncoder(AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state=None): + def 
__init__(self, random_state: Optional[np.random.RandomState] = None): self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> 'OneHotEncoder': if scipy.sparse.issparse(X): self.preprocessor = SparseOneHotEncoder() else: @@ -22,16 +28,14 @@ def fit(self, X, y=None): self.preprocessor.fit(X, y) return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: if self.preprocessor is None: raise NotImplementedError() return self.preprocessor.transform(X) - def fit_transform(self, X, y=None): - return self.fit(X, y).transform(X) - @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': '1Hot', 'name': 'One Hot Encoder', 'handles_regression': True, @@ -46,5 +50,6 @@ def get_properties(dataset_properties=None): 'output': (INPUT,), } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py b/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py index 4eacd62e4f..d2576dd942 100644 --- a/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py +++ b/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py @@ -1,6 +1,11 @@ -import autosklearn.pipeline.implementations.CategoryShift +from typing import Dict, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE +import 
autosklearn.pipeline.implementations.CategoryShift from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT @@ -12,25 +17,24 @@ class CategoryShift(AutoSklearnPreprocessingAlgorithm): is not used, so to provide compatibility with sparse matrices. """ - def __init__(self, random_state=None): + def __init__(self, random_state: Optional[np.random.RandomState] = None): self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> 'CategoryShift': self.preprocessor = autosklearn.pipeline.implementations.CategoryShift\ .CategoryShift() self.preprocessor.fit(X, y) return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: if self.preprocessor is None: raise NotImplementedError() return self.preprocessor.transform(X) - def fit_transform(self, X, y=None): - return self.fit(X, y).transform(X) - @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'CategShift', 'name': 'Category Shift', 'handles_missing_values': True, @@ -52,5 +56,6 @@ def get_properties(dataset_properties=None): 'preferred_dtype': None} @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/data_preprocessing.py b/autosklearn/pipeline/components/data_preprocessing/data_preprocessing.py index 217c2dd361..7a4f96b966 100644 --- a/autosklearn/pipeline/components/data_preprocessing/data_preprocessing.py +++ 
b/autosklearn/pipeline/components/data_preprocessing/data_preprocessing.py @@ -1,4 +1,4 @@ -import numpy as np +from typing import Any, List, Dict, Optional, Tuple, Union import sklearn.compose from scipy import sparse @@ -6,33 +6,49 @@ from ConfigSpace import Configuration from ConfigSpace.configuration_space import ConfigurationSpace -from autosklearn.pipeline.base import BasePipeline +import numpy as np + +from sklearn.base import BaseEstimator, TransformerMixin + +from autosklearn.pipeline.base import ( + BasePipeline, + DATASET_PROPERTIES_TYPE, + PIPELINE_DATA_DTYPE, + ) from autosklearn.pipeline.components.data_preprocessing.data_preprocessing_categorical \ import CategoricalPreprocessingPipeline from autosklearn.pipeline.components.data_preprocessing.data_preprocessing_numerical \ import NumericalPreprocessingPipeline from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT +from autosklearn.data.validation import ( + SUPPORTED_FEAT_TYPES, + SUPPORTED_TARGET_TYPES, +) -class DataPreprocessor(AutoSklearnComponent): +class DataPreprocessor(TransformerMixin, AutoSklearnComponent): """ This component is used to apply distinct transformations to categorical and numerical features of a dataset. It is built on top of sklearn's ColumnTransformer. 
""" - def __init__(self, config=None, pipeline=None, dataset_properties=None, include=None, - exclude=None, random_state=None, init_params=None, - categorical_features=None, force_sparse_output=False, - column_transformer=None): + def __init__( + self, + config: Optional[Configuration] = None, + pipeline: Optional[BasePipeline] = None, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None, + feat_type: Optional[Dict[Union[str, int], str]] = None, + force_sparse_output: bool = False, + column_transformer: Optional[sklearn.compose.ColumnTransformer] = None, + ): if pipeline is not None: raise ValueError("DataPreprocessor's argument 'pipeline' should be None") - if categorical_features is not None: - categorical_features = np.array(categorical_features) - if categorical_features.dtype != 'bool': - raise ValueError('Parameter categorical_features must' - ' only contain booleans.') self.config = config self.pipeline = pipeline self.dataset_properties = dataset_properties @@ -40,7 +56,7 @@ def __init__(self, config=None, pipeline=None, dataset_properties=None, include= self.exclude = exclude self.random_state = random_state self.init_params = init_params - self.categorical_features = categorical_features + self.feat_type = feat_type self.force_sparse_output = force_sparse_output # The pipeline that will be applied to the categorical features (i.e. 
columns) @@ -65,38 +81,65 @@ def __init__(self, config=None, pipeline=None, dataset_properties=None, include= config=None, steps=pipeline, dataset_properties=dataset_properties, include=include, exclude=exclude, random_state=random_state, init_params=init_params) - self._transformers = [ - ["categorical_transformer", self.categ_ppl], - ["numerical_transformer", self.numer_ppl], + self._transformers: List[Tuple[str, AutoSklearnComponent]] = [ + ("categorical_transformer", self.categ_ppl), + ("numerical_transformer", self.numer_ppl), ] if self.config: self.set_hyperparameters(self.config, init_params=init_params) self.column_transformer = column_transformer - def fit(self, X, y=None): + def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = None + ) -> 'DataPreprocessor': n_feats = X.shape[1] - # If categorical_features is none or an array made just of False booleans, then - # only the numerical transformer is used - numerical_features = np.logical_not(self.categorical_features) - if self.categorical_features is None or np.all(numerical_features): - sklearn_transf_spec = [ - ["numerical_transformer", self.numer_ppl, [True] * n_feats] + categorical_features = [] + numerical_features = [] + if self.feat_type is not None: + # Make sure that we are not missing any column! 
+ expected = set(self.feat_type.keys()) + if hasattr(X, 'columns'): + columns = set(X.columns) + else: + columns = set(range(n_feats)) + if expected != columns: + raise ValueError("Train data has columns={} yet the feat_types are feat={}".format( + expected, + columns + )) + categorical_features = [key for key, value in self.feat_type.items() + if value.lower() == 'categorical'] + numerical_features = [key for key, value in self.feat_type.items() + if value.lower() == 'numerical'] + + # If no categorical features, assume we have a numerical only pipeline + if len(categorical_features) == 0: + sklearn_transf_spec: List[Tuple[str, BaseEstimator, List[Union[str, bool, int]]]] = [ + ("numerical_transformer", self.numer_ppl, [True] * n_feats) ] # If all features are categorical, then just the categorical transformer is used - elif np.all(self.categorical_features): + elif len(numerical_features) == 0: sklearn_transf_spec = [ - ["categorical_transformer", self.categ_ppl, [True] * n_feats] + ("categorical_transformer", self.categ_ppl, [True] * n_feats) ] # For the other cases, both transformers are used else: - cat_feats = self.categorical_features - num_feats = np.logical_not(self.categorical_features) sklearn_transf_spec = [ - ["categorical_transformer", self.categ_ppl, cat_feats], - ["numerical_transformer", self.numer_ppl, num_feats] + ("categorical_transformer", self.categ_ppl, categorical_features), + ("numerical_transformer", self.numer_ppl, numerical_features) ] + # And one last check in case feat type is None + # And to make sure the final specification has all the columns + # considered in the column transformer + total_columns = sum([len(features) for name, ppl, features in sklearn_transf_spec]) + if total_columns != n_feats: + raise ValueError("Missing columns in the specification of the data validator" + " for train data={} and spec={}".format( + np.shape(X), + sklearn_transf_spec, + )) + self.sparse_ = sparse.issparse(X) or self.force_sparse_output 
self.column_transformer = sklearn.compose.ColumnTransformer( transformers=sklearn_transf_spec, @@ -105,7 +148,7 @@ def fit(self, X, y=None): self.column_transformer.fit(X, y) return self - def transform(self, X): + def transform(self, X: SUPPORTED_FEAT_TYPES) -> PIPELINE_DATA_DTYPE: if self.column_transformer is None: raise ValueError("Cannot call transform on a Datapreprocessor that has not" "yet been fit. Please check the log files for errors " @@ -113,11 +156,9 @@ def transform(self, X): ) return self.column_transformer.transform(X) - def fit_transform(self, X, y=None): - return self.fit(X, y).transform(X) - @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'FeatTypeSplit', 'name': 'Feature Type Splitter', 'handles_regression': True, @@ -130,9 +171,10 @@ def get_properties(dataset_properties=None): 'input': (DENSE, SPARSE, UNSIGNED_DATA), 'output': (INPUT,), } - def set_hyperparameters(self, configuration, init_params=None): - if init_params is not None and 'categorical_features' in init_params.keys(): - self.categorical_features = init_params['categorical_features'] + def set_hyperparameters(self, configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None) -> 'DataPreprocessor': + if init_params is not None and 'feat_type' in init_params.keys(): + self.feat_type = init_params['feat_type'] self.config = configuration @@ -150,6 +192,7 @@ def set_hyperparameters(self, configuration, init_params=None): sub_configuration = Configuration(sub_configuration_space, values=sub_config_dict) + sub_init_params_dict: Optional[Dict[str, Any]] = None if init_params is not None: sub_init_params_dict = {} for param in init_params: @@ -157,8 +200,6 @@ def set_hyperparameters(self, configuration, init_params=None): value = init_params[param] new_name = param.replace('%s:' % transf_name, '', 1) 
sub_init_params_dict[new_name] = value - else: - sub_init_params_dict = None if isinstance(transf_op, ( AutoSklearnChoice, AutoSklearnComponent, BasePipeline)): @@ -169,7 +210,10 @@ def set_hyperparameters(self, configuration, init_params=None): return self - def get_hyperparameter_search_space(self, dataset_properties=None): + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: self.dataset_properties = dataset_properties cs = ConfigurationSpace() cs = DataPreprocessor._get_hyperparameter_search_space_recursevely( @@ -177,7 +221,11 @@ def get_hyperparameter_search_space(self, dataset_properties=None): return cs @staticmethod - def _get_hyperparameter_search_space_recursevely(dataset_properties, cs, transformer): + def _get_hyperparameter_search_space_recursevely( + dataset_properties: DATASET_PROPERTIES_TYPE, + cs: ConfigurationSpace, + transformer: BaseEstimator, + ) -> ConfigurationSpace: for st_name, st_operation in transformer: if hasattr(st_operation, "get_hyperparameter_search_space"): cs.add_configuration_space( diff --git a/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_categorical.py b/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_categorical.py index f9f05955f3..c76e2d4fd0 100644 --- a/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_categorical.py +++ b/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_categorical.py @@ -1,6 +1,10 @@ +from typing import Any, List, Dict, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + import numpy as np -from ConfigSpace.configuration_space import ConfigurationSpace +from sklearn.base import BaseEstimator from autosklearn.pipeline.components.data_preprocessing.category_shift.\ category_shift import CategoryShift @@ -10,8 +14,13 @@ import CoalescenseChoice from 
autosklearn.pipeline.components.data_preprocessing.categorical_encoding \ import OHEChoice - -from autosklearn.pipeline.base import BasePipeline +from autosklearn.pipeline.components.data_preprocessing.categorical_encoding.encoding import ( + OrdinalEncoding +) +from autosklearn.pipeline.base import ( + BasePipeline, + DATASET_PROPERTIES_TYPE, +) from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT @@ -21,7 +30,7 @@ class CategoricalPreprocessingPipeline(BasePipeline): The steps of this pipeline are: 1 - Category shift: Adds 3 to every category value 2 - Imputation: Assign category 2 to missing values (NaN). - 3 - Minority coalescence: Assign category 1 to all categories whose occurence + 3 - Minority coalescence: Assign category 1 to all categories whose occurrence don't sum-up to a certain minimum fraction 4 - One hot encoding: usual sklearn one hot encoding @@ -36,16 +45,22 @@ class CategoricalPreprocessingPipeline(BasePipeline): If None, the random number generator is the RandomState instance used by `np.random`.""" - def __init__(self, config=None, steps=None, dataset_properties=None, - include=None, exclude=None, random_state=None, - init_params=None): + def __init__(self, + config: Optional[Configuration] = None, + steps: Optional[List[Tuple[str, BaseEstimator]]] = None, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None): self._output_dtype = np.int32 super().__init__( config, steps, dataset_properties, include, exclude, random_state, init_params) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'cat_datapreproc', 'name': 'categorical data preprocessing', 
'handles_missing_values': True, @@ -65,8 +80,12 @@ def get_properties(dataset_properties=None): 'output': (INPUT,), 'preferred_dtype': None} - def _get_hyperparameter_search_space(self, include=None, exclude=None, - dataset_properties=None): + def _get_hyperparameter_search_space( + self, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = None, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: """Create the hyperparameter configuration space. Returns @@ -85,7 +104,9 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, return cs - def _get_pipeline_steps(self, dataset_properties=None): + def _get_pipeline_steps(self, + dataset_properties: Optional[Dict[str, str]] = None, + ) -> List[Tuple[str, BaseEstimator]]: steps = [] default_dataset_properties = {} @@ -93,13 +114,14 @@ def _get_pipeline_steps(self, dataset_properties=None): default_dataset_properties.update(dataset_properties) steps.extend([ - ["category_shift", CategoryShift()], - ["imputation", CategoricalImputation()], - ["category_coalescence", CoalescenseChoice(default_dataset_properties)], - ["categorical_encoding", OHEChoice(default_dataset_properties)], + ("imputation", CategoricalImputation()), + ("encoding", OrdinalEncoding()), + ("category_shift", CategoryShift()), + ("category_coalescence", CoalescenseChoice(default_dataset_properties)), + ("categorical_encoding", OHEChoice(default_dataset_properties)), ]) return steps - def _get_estimator_hyperparameter_name(self): + def _get_estimator_hyperparameter_name(self) -> str: return "categorical data preprocessing" diff --git a/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_numerical.py b/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_numerical.py index ca26b56616..e48c1695d4 100644 --- a/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_numerical.py +++ 
b/autosklearn/pipeline/components/data_preprocessing/data_preprocessing_numerical.py @@ -1,6 +1,10 @@ +from typing import Any, List, Dict, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + import numpy as np -from ConfigSpace.configuration_space import ConfigurationSpace +from sklearn.base import BaseEstimator from autosklearn.pipeline.components.data_preprocessing import rescaling as \ rescaling_components @@ -9,7 +13,10 @@ from autosklearn.pipeline.components.data_preprocessing.variance_threshold\ .variance_threshold import VarianceThreshold -from autosklearn.pipeline.base import BasePipeline +from autosklearn.pipeline.base import ( + BasePipeline, + DATASET_PROPERTIES_TYPE, +) from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT @@ -31,20 +38,24 @@ class NumericalPreprocessingPipeline(BasePipeline): If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance - used by `np.random`. 
- - """ - - def __init__(self, config=None, steps=None, dataset_properties=None, - include=None, exclude=None, random_state=None, - init_params=None): + used by `np.random`.""" + + def __init__(self, + config: Optional[Configuration] = None, + steps: Optional[List[Tuple[str, BaseEstimator]]] = None, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None): self._output_dtype = np.int32 super().__init__( config, steps, dataset_properties, include, exclude, random_state, init_params) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'num_datapreproc', 'name': 'numeric data preprocessing', 'handles_missing_values': True, @@ -64,8 +75,12 @@ def get_properties(dataset_properties=None): 'output': (INPUT,), 'preferred_dtype': None} - def _get_hyperparameter_search_space(self, include=None, exclude=None, - dataset_properties=None): + def _get_hyperparameter_search_space( + self, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = None, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + ) -> ConfigurationSpace: """Create the hyperparameter configuration space. 
Parameters @@ -87,7 +102,9 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None, return cs - def _get_pipeline_steps(self, dataset_properties=None): + def _get_pipeline_steps(self, + dataset_properties: Optional[Dict[str, str]] = None, + ) -> List[Tuple[str, BaseEstimator]]: steps = [] default_dataset_properties = {} @@ -95,12 +112,12 @@ def _get_pipeline_steps(self, dataset_properties=None): default_dataset_properties.update(dataset_properties) steps.extend([ - ["imputation", NumericalImputation()], - ["variance_threshold", VarianceThreshold()], - ["rescaling", rescaling_components.RescalingChoice(default_dataset_properties)], + ("imputation", NumericalImputation()), + ("variance_threshold", VarianceThreshold()), + ("rescaling", rescaling_components.RescalingChoice(default_dataset_properties)), ]) return steps - def _get_estimator_hyperparameter_name(self): + def _get_estimator_hyperparameter_name(self) -> str: return "numerical data preprocessing" diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py index 5510dbb434..4eab4d986b 100644 --- a/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py +++ b/autosklearn/pipeline/components/data_preprocessing/imputation/categorical_imputation.py @@ -1,35 +1,60 @@ +from typing import Dict, Optional, Tuple, Union + from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT class CategoricalImputation(AutoSklearnPreprocessingAlgorithm): """ - Substitute missing values by 2 + Substitute missing values by constant: + When strategy == “constant”, fill_value is used to 
replace all + occurrences of missing_values. + If left to the default, fill_value will be 0 when imputing + numerical data and “missing_value” for strings or object data types. """ - def __init__(self, random_state=None): + def __init__(self, random_state: Optional[np.random.RandomState] = None): self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, + y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation': import sklearn.impute + fill_value = None + if hasattr(X, 'columns'): + kind = X[X.columns[-1]].dtype.kind + else: + # Series, sparse and numpy have dtype + # Only DataFrame does not + kind = X.dtype.kind + if kind in ("i", "u", "f"): + # We do not want to impute a category with the default + # value (0 is the default) in case such default is in the + # train data already! + fill_value = 0 + unique = np.unique(X) + while fill_value in unique: + fill_value -= 1 + self.preprocessor = sklearn.impute.SimpleImputer( - strategy='constant', fill_value=2, copy=False) + strategy='constant', copy=False, fill_value=fill_value) self.preprocessor.fit(X) return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: if self.preprocessor is None: raise NotImplementedError() - X = self.preprocessor.transform(X).astype(int) + X = self.preprocessor.transform(X) return X - def fit_transform(self, X, y=None): - return self.fit(X, y).transform(X) - @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'CategoricalImputation', 'name': 'Categorical Imputation', 'handles_missing_values': True, @@ -51,5 +76,6 @@ def get_properties(dataset_properties=None): 'preferred_dtype': None} @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: 
Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: return ConfigurationSpace() diff --git a/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py b/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py index 17b25c609e..49dddfdfba 100644 --- a/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py +++ b/autosklearn/pipeline/components/data_preprocessing/imputation/numerical_imputation.py @@ -1,17 +1,24 @@ +from typing import Dict, Optional, Tuple, Union + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT class NumericalImputation(AutoSklearnPreprocessingAlgorithm): - def __init__(self, strategy='mean', random_state=None): + def __init__(self, strategy: str = 'mean', + random_state: Optional[np.random.RandomState] = None): self.strategy = strategy self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, + y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'NumericalImputation': import sklearn.impute self.preprocessor = sklearn.impute.SimpleImputer( @@ -19,13 +26,14 @@ def fit(self, X, y=None): self.preprocessor.fit(X) return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: if self.preprocessor is None: raise NotImplementedError() return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'NumericalImputation', 'name': 'Numerical 
Imputation', 'handles_missing_values': True, @@ -47,7 +55,8 @@ def get_properties(dataset_properties=None): 'preferred_dtype': None} @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: # TODO add replace by zero! strategy = CategoricalHyperparameter( "strategy", ["mean", "median", "most_frequent"], default_value="mean") diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py index 0ed2433e45..a3f46b32e0 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/__init__.py @@ -1,33 +1,45 @@ from collections import OrderedDict import os -from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice + +from typing import Any, Dict, Optional + +from ConfigSpace import Configuration from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ + ThirdPartyComponents, AutoSklearnChoice + +from sklearn.base import BaseEstimator + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE + mc_directory = os.path.split(__file__)[0] _mcs = find_components( __package__, mc_directory, AutoSklearnPreprocessingAlgorithm) _addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -def add_mc(mc): +def add_mc(mc: BaseEstimator) -> None: _addons.add_component(mc) class CoalescenseChoice(AutoSklearnChoice): @classmethod - def get_components(cls): - components = OrderedDict() + def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: + components: Dict[str, 
BaseEstimator] = OrderedDict() components.update(_mcs) components.update(_addons.components) return components - def get_hyperparameter_search_space(self, dataset_properties=None, - default=None, - include=None, - exclude=None): + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + default: Optional[str] = None, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() if dataset_properties is None: @@ -64,7 +76,9 @@ def get_hyperparameter_search_space(self, dataset_properties=None, self.dataset_properties = dataset_properties return cs - def set_hyperparameters(self, configuration, init_params=None): + def set_hyperparameters(self, configuration: Configuration, + init_params: Optional[Dict[str, Any]] = None + ) -> 'CoalescenseChoice': new_params = {} params = configuration.get_dictionary() @@ -92,5 +106,5 @@ def set_hyperparameters(self, configuration, init_params=None): return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.choice.transform(X) diff --git a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py index c4446f2345..875a142ff8 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/minority_coalescer.py @@ -1,8 +1,13 @@ -import autosklearn.pipeline.implementations.MinorityCoalescer +from typing import Dict, Optional, Tuple, Union + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter +import numpy as np + +import autosklearn.pipeline.implementations.MinorityCoalescer +from autosklearn.pipeline.base import 
DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT @@ -11,10 +16,12 @@ class MinorityCoalescer(AutoSklearnPreprocessingAlgorithm): """ Group together categories which occurence is less than a specified minimum fraction. """ - def __init__(self, minimum_fraction=0.01, random_state=None): + def __init__(self, minimum_fraction: float = 0.01, + random_state: Optional[np.random.RandomState] = None): self.minimum_fraction = minimum_fraction - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> 'MinorityCoalescer': self.minimum_fraction = float(self.minimum_fraction) self.preprocessor = autosklearn.pipeline.implementations.MinorityCoalescer\ @@ -22,16 +29,14 @@ def fit(self, X, y=None): self.preprocessor.fit(X, y) return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: if self.preprocessor is None: raise NotImplementedError() return self.preprocessor.transform(X) - def fit_transform(self, X, y=None): - return self.fit(X, y).transform(X) - @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'coalescer', 'name': 'Categorical minority coalescer', 'handles_regression': True, @@ -46,7 +51,8 @@ def get_properties(dataset_properties=None): 'output': (INPUT,), } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: cs = ConfigurationSpace() minimum_fraction = UniformFloatHyperparameter( "minimum_fraction", lower=.0001, upper=0.5, default_value=0.01, log=True) diff --git 
a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py index f9c0770041..684fa0d536 100644 --- a/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py +++ b/autosklearn/pipeline/components/data_preprocessing/minority_coalescense/no_coalescense.py @@ -1,25 +1,30 @@ +from typing import Dict, Optional, Tuple, Union + + from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import \ AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT class NoCoalescence(AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state=None): + def __init__(self, random_state: Optional[np.random.RandomState] = None): pass - def fit(self, X, y=None): + def fit(self, X: np.array, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> PIPELINE_DATA_DTYPE: self.preprocessor = 'passthrough' return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X - def fit_transform(self, X, y=None): - return self.fit(X, y).transform(X) - @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'no coalescence', 'name': 'No categorical variable coalescence', 'handles_regression': True, @@ -33,6 +38,7 @@ def get_properties(dataset_properties=None): 'output': (INPUT,)} @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git 
a/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py index 53845b1fa3..298086476a 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/__init__.py @@ -1,10 +1,19 @@ from collections import OrderedDict import os -from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ - ThirdPartyComponents, AutoSklearnChoice + +from typing import Dict, Optional + from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import CategoricalHyperparameter +from sklearn.base import BaseEstimator + +from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \ + ThirdPartyComponents, AutoSklearnChoice +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE +from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling import ( + Rescaling +) rescaling_directory = os.path.split(__file__)[0] _rescalers = find_components(__package__, @@ -13,23 +22,26 @@ _addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm) -def add_rescaler(rescaler): +def add_rescaler(rescaler: Rescaling) -> None: _addons.add_component(rescaler) class RescalingChoice(AutoSklearnChoice): @classmethod - def get_components(cls): - components = OrderedDict() + def get_components(cls: BaseEstimator) -> Dict[str, BaseEstimator]: + components: Dict[str, BaseEstimator] = OrderedDict() components.update(_rescalers) components.update(_addons.components) return components - def get_hyperparameter_search_space(self, dataset_properties=None, - default=None, - include=None, - exclude=None): + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None, + default: Optional[str] = None, + include: Optional[Dict[str, str]] = None, + exclude: Optional[Dict[str, str]] = 
None, + ) -> ConfigurationSpace: cs = ConfigurationSpace() if dataset_properties is None: @@ -67,5 +79,5 @@ def get_hyperparameter_search_space(self, dataset_properties=None, self.dataset_properties = dataset_properties return cs - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return self.choice.transform(X) diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py index 366d8b3654..26d2ef8b41 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/abstract_rescaling.py @@ -1,19 +1,35 @@ +from typing import Optional + from ConfigSpace.configuration_space import ConfigurationSpace +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE +from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm + class Rescaling(object): # Rescaling does not support fit_transform (as of 0.19.1)! 
+ def __init__(self, random_state: Optional[np.random.RandomState] = None): + self.preprocessor: Optional[BaseEstimator] = None - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> 'AutoSklearnPreprocessingAlgorithm': + if self.preprocessor is None: + raise NotFittedError() self.preprocessor.fit(X) return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: if self.preprocessor is None: raise NotImplementedError() return self.preprocessor.transform(X) @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py index 9129e0f5c3..9e144bb957 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/minmax.py @@ -1,3 +1,8 @@ +from typing import Dict, Optional, Tuple, Union + +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, SIGNED_DATA, INPUT from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ import Rescaling @@ -5,12 +10,13 @@ class MinMaxScalerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state): + def __init__(self, random_state: Optional[np.random.RandomState] = None): from sklearn.preprocessing import MinMaxScaler self.preprocessor = MinMaxScaler(copy=False) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return 
{'shortname': 'MinMaxScaler', 'name': 'MinMaxScaler', 'handles_missing_values': False, diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py index 8e68a6b24c..83377e2544 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/none.py @@ -1,3 +1,6 @@ +from typing import Dict, Optional, Tuple, Union + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, INPUT, SPARSE from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ import Rescaling @@ -5,18 +8,18 @@ class NoRescalingComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state=None): - pass - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> 'AutoSklearnPreprocessingAlgorithm': self.preprocessor = 'passthrough' return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: return X @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'NoRescaling', 'name': 'NoRescaling', 'handles_missing_values': False, diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py index 0db3261b37..5ad6a0795e 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/normalize.py @@ -1,3 +1,8 @@ +from typing import Dict, Optional, Tuple, Union + +import numpy as np + +from autosklearn.pipeline.base import 
DATASET_PROPERTIES_TYPE from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, INPUT, SPARSE from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ import Rescaling @@ -5,14 +10,15 @@ class NormalizerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state): + def __init__(self, random_state: Optional[np.random.RandomState] = None): # Use custom implementation because sklearn implementation cannot # handle float32 input matrix from sklearn.preprocessing import Normalizer self.preprocessor = Normalizer(copy=False) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'Normalizer', 'name': 'Normalizer', 'handles_missing_values': False, diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py index fb03067187..3c90ef6736 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/power_transformer.py @@ -1,3 +1,8 @@ +from typing import Dict, Optional, Tuple, Union + +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, INPUT from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ import Rescaling @@ -5,12 +10,13 @@ class PowerTransformerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state): + def __init__(self, random_state: Optional[np.random.RandomState] = None): from sklearn.preprocessing import PowerTransformer self.preprocessor = PowerTransformer(copy=False) @staticmethod - def get_properties(dataset_properties=None): + def 
get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'PowerTransformer', 'name': 'PowerTransformer', 'handles_missing_values': False, diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py index 025a7e6cce..e4066e7722 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/quantile_transformer.py @@ -1,3 +1,8 @@ +from typing import Dict, Optional, Tuple, Union + +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformIntegerHyperparameter, \ CategoricalHyperparameter @@ -10,7 +15,8 @@ class QuantileTransformerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, n_quantiles, output_distribution, random_state): + def __init__(self, n_quantiles: int, output_distribution: str, + random_state: Optional[np.random.RandomState] = None): from sklearn.preprocessing import QuantileTransformer self.n_quantiles = n_quantiles self.output_distribution = output_distribution @@ -21,7 +27,8 @@ def __init__(self, n_quantiles, output_distribution, random_state): ) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'QuantileTransformer', 'name': 'QuantileTransformer', 'handles_regression': True, @@ -37,7 +44,9 @@ def get_properties(dataset_properties=None): 'output': (INPUT, SIGNED_DATA), 'preferred_dtype': None} - def get_hyperparameter_search_space(dataset_properties=None): + @staticmethod + def 
get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: cs = ConfigurationSpace() # TODO parametrize like the Random Forest as n_quantiles = n_features^param n_quantiles = UniformIntegerHyperparameter( diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py index 8bea5a5f91..e43a384d6f 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/robust_scaler.py @@ -1,7 +1,14 @@ +from typing import Dict, Optional, Tuple, Union + +import numpy as np + + from scipy import sparse +from sklearn.exceptions import NotFittedError from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import UniformFloatHyperparameter +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.constants import DENSE, UNSIGNED_DATA, SIGNED_DATA, INPUT, SPARSE from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ import Rescaling @@ -10,7 +17,8 @@ class RobustScalerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, q_min, q_max, random_state): + def __init__(self, q_min: float, q_max: float, + random_state: Optional[np.random.RandomState] = None): from sklearn.preprocessing import RobustScaler self.q_min = q_min self.q_max = q_max @@ -19,7 +27,8 @@ def __init__(self, q_min, q_max, random_state): ) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'RobustScaler', 'name': 'RobustScaler', 'handles_regression': True, @@ -35,7 +44,9 @@ def get_properties(dataset_properties=None): 'output': (INPUT, 
SIGNED_DATA), 'preferred_dtype': None} - def get_hyperparameter_search_space(dataset_properties=None): + @staticmethod + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: cs = ConfigurationSpace() q_min = UniformFloatHyperparameter( 'q_min', 0.001, 0.3, default_value=0.25 @@ -46,7 +57,10 @@ def get_hyperparameter_search_space(dataset_properties=None): cs.add_hyperparameters((q_min, q_max)) return cs - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> 'AutoSklearnPreprocessingAlgorithm': + if self.preprocessor is None: + raise NotFittedError() if sparse.isspmatrix(X): self.preprocessor.set_params(with_centering=False) diff --git a/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py b/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py index da5cbe7303..32d341ae0d 100644 --- a/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py +++ b/autosklearn/pipeline/components/data_preprocessing/rescaling/standardize.py @@ -1,4 +1,12 @@ +from typing import Dict, Optional, Tuple, Union + +import numpy as np + from scipy import sparse + +from sklearn.exceptions import NotFittedError + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT from autosklearn.pipeline.components.data_preprocessing.rescaling.abstract_rescaling \ import Rescaling @@ -7,12 +15,13 @@ class StandardScalerComponent(Rescaling, AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state): + def __init__(self, random_state: Optional[np.random.RandomState] = None): from sklearn.preprocessing import StandardScaler self.preprocessor = StandardScaler(copy=False) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: 
Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return {'shortname': 'StandardScaler', 'name': 'StandardScaler', 'handles_missing_values': False, @@ -33,7 +42,10 @@ def get_properties(dataset_properties=None): 'output': (INPUT,), 'preferred_dtype': None} - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None + ) -> 'AutoSklearnPreprocessingAlgorithm': + if self.preprocessor is None: + raise NotFittedError() if sparse.isspmatrix(X): self.preprocessor.set_params(with_mean=False) diff --git a/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py b/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py index b454e3ed45..bf50f5f0af 100644 --- a/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py +++ b/autosklearn/pipeline/components/data_preprocessing/variance_threshold/variance_threshold.py @@ -1,5 +1,10 @@ +from typing import Dict, Optional, Tuple, Union + from ConfigSpace.configuration_space import ConfigurationSpace +import numpy as np + +from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT @@ -7,24 +12,26 @@ class VarianceThreshold(AutoSklearnPreprocessingAlgorithm): - def __init__(self, random_state=None): + def __init__(self, random_state: Optional[np.random.RandomState] = None): # VarianceThreshold does not support fit_transform (as of 0.19.1)! 
self.random_state = random_state - def fit(self, X, y=None): + def fit(self, X: PIPELINE_DATA_DTYPE, + y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'VarianceThreshold': self.preprocessor = sklearn.feature_selection.VarianceThreshold( threshold=0.0 ) self.preprocessor = self.preprocessor.fit(X) return self - def transform(self, X): + def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE: if self.preprocessor is None: raise NotImplementedError() return self.preprocessor.transform(X) @staticmethod - def get_properties(dataset_properties=None): + def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]: return { 'shortname': 'Variance Threshold', 'name': 'Variance Threshold (constant feature removal)', @@ -41,6 +48,7 @@ def get_properties(dataset_properties=None): } @staticmethod - def get_hyperparameter_search_space(dataset_properties=None): + def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None + ) -> ConfigurationSpace: cs = ConfigurationSpace() return cs diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 2be6eaacab..ed0ed81c0f 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -84,8 +84,9 @@ def _calculate_metafeatures(data_feat_type, data_info_task, basename, # == Calculate metafeatures task_name = 'CalculateMetafeatures' watcher.start_task(task_name) - categorical = [True if feat_type.lower() in ['categorical'] else False - for feat_type in data_feat_type] + + categorical = {col: True if feat_type.lower() == 'categorical' else False + for col, feat_type in data_feat_type.items()} EXCLUDE_META_FEATURES = EXCLUDE_META_FEATURES_CLASSIFICATION \ if data_info_task in CLASSIFICATION_TASKS else EXCLUDE_META_FEATURES_REGRESSION @@ -122,8 +123,8 @@ def _calculate_metafeatures_encoded(data_feat_type, basename, x_train, y_train, task_name = 'CalculateMetafeaturesEncoded' watcher.start_task(task_name) - categorical = 
[True if feat_type.lower() in ['categorical'] else False - for feat_type in data_feat_type] + categorical = {col: True if feat_type.lower() == 'categorical' else False + for col, feat_type in data_feat_type.items()} result = calculate_all_metafeatures_encoded_labels( x_train, y_train, categorical=categorical, diff --git a/scripts/update_metadata_util.py b/scripts/update_metadata_util.py index 019be9e830..153e63c6cf 100644 --- a/scripts/update_metadata_util.py +++ b/scripts/update_metadata_util.py @@ -51,7 +51,7 @@ def load_task(task_id): name = dataset.name.lower() del _ del dataset - cat = ['categorical' if c else 'numerical' for c in cat] + cat = {i: 'categorical' if c else 'numerical' for i, c in enumerate(cat)} if isinstance(task, openml.tasks.OpenMLClassificationTask): task_type = 'classification' diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index 6f105e09b9..6f11374f8b 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -396,7 +396,7 @@ def test_do_dummy_prediction(backend, dask_client, datasets): X_test, Y_test, task=task, dataset_name=name, - feat_type=None, + feat_type={i: 'numerical' for i in range(X_train.shape[1])}, ) auto = autosklearn.automl.AutoML( @@ -439,7 +439,7 @@ def test_fail_if_dummy_prediction_fails(ta_run_mock, backend, dask_client): X_train, Y_train, X_test, Y_test, task=2, - feat_type=['Numerical' for i in range(X_train.shape[1])], + feat_type={i: 'Numerical' for i in range(X_train.shape[1])}, dataset_name='iris', ) @@ -655,7 +655,7 @@ def test_fail_if_feat_type_on_pandas_input(backend, dask_client): automl.fit( X_train, y_train, task=BINARY_CLASSIFICATION, - feat_type=['Categorical', 'Numerical'], + feat_type={1: 'Categorical', 2: 'Numerical'}, ) diff --git a/test/test_automl/test_estimators.py b/test/test_automl/test_estimators.py index e6b9ef3bd0..f789e86fc1 100644 --- a/test/test_automl/test_estimators.py +++ b/test/test_automl/test_estimators.py @@ -127,13 +127,13 @@ 
def test_feat_type_wrong_arguments(): y = np.zeros((100, )) cls = AutoSklearnClassifier(ensemble_size=0) - expected_msg = r".*Array feat_type does not have same number of " + expected_msg = r".*feat_type does not have same number of " "variables as X has features. 1 vs 100.*" with pytest.raises(ValueError, match=expected_msg): cls.fit(X=X, y=y, feat_type=[True]) cls = AutoSklearnClassifier(ensemble_size=0) - expected_msg = r".*Array feat_type must only contain strings.*" + expected_msg = r".*feat_type must only contain strings.*" with pytest.raises(ValueError, match=expected_msg): cls.fit(X=X, y=y, feat_type=[True]*100) @@ -790,3 +790,98 @@ def test_fit_pipeline(dask_client, task_type, resampling_strategy, disable_file_ assert os.path.exists(cv_model_path) elif resampling_strategy == 'holdout': assert not os.path.exists(cv_model_path) + + +@pytest.mark.parametrize("data_type", ['pandas', 'numpy']) +@pytest.mark.parametrize("include_categorical", [True, False]) +def test_pass_categorical_and_numeric_columns_to_pipeline( + dask_client, data_type, include_categorical): + + # Prepare the training data + X, y = sklearn.datasets.make_classification() + feat_type = None + if 'pandas' in data_type: + X = pd.DataFrame(X) + y = pd.DataFrame(y, dtype="category") + if include_categorical: + cat_name = X.shape[1] + X[cat_name] = 'A' + X[cat_name] = X[cat_name].astype('category') + elif 'numpy' in data_type: + if include_categorical: + feat_type = ['numerical' for x in range(np.shape(X)[1])] + feat_type.append('categorical') + temporal = np.zeros((X.shape[0], X.shape[1]+1)) + temporal[:, :-1] = X + X = temporal + else: + pytest.fail() + + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, test_size=0.5, random_state=3 + ) + + seed = 3 + automl = AutoSklearnClassifier( + time_left_for_this_task=120, + # Time left for task plays no role + # only per run time limit + per_run_time_limit=30, + ensemble_size=0, + dask_client=dask_client, + 
include_estimators=['random_forest'], + seed=seed, + ) + config = automl.get_configuration_space(X_train, y_train, + feat_type=feat_type, + X_test=X_test, y_test=y_test, + ).get_default_configuration() + + pipeline, run_info, run_value = automl.fit_pipeline(X=X_train, y=y_train, config=config, + feat_type=feat_type, + X_test=X_test, y_test=y_test) + + # We should produce a decent result + assert run_value.cost < 0.4, f"{run_value}/{run_value.additional_info}" + prediction = pipeline.predict(automl.automl_.InputValidator.feature_validator.transform(X)) + assert np.shape(prediction)[0], np.shape(y)[0] + + if include_categorical: + expected_dict = {i: 'numerical' for i in range(np.shape(X)[1] - 1)} + expected_dict[X.shape[1] - 1] = 'categorical' + else: + expected_dict = {i: 'numerical' for i in range(np.shape(X)[1])} + assert expected_dict == pipeline.named_steps['data_preprocessing'].feat_type + + +@pytest.mark.parametrize("as_frame", [True, False]) +def test_autosklearn_anneal(as_frame): + """ + This test makes sure that anneal dataset can be fitted and scored. 
+ This dataset is quite complex, with NaN, categorical and numerical columns + so is a good testcase for unit-testing + """ + X, y = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=as_frame) + automl = AutoSklearnClassifier(time_left_for_this_task=60, ensemble_size=0, + initial_configurations_via_metalearning=0, + smac_scenario_args={'runcount_limit': 3}, + resampling_strategy='holdout-iterative-fit') + + if as_frame: + # Let autosklearn calculate the feat types + automl_fitted = automl.fit(X, y) + else: + X_, y_ = sklearn.datasets.fetch_openml(data_id=2, return_X_y=True, as_frame=True) + feat_type = ['categorical' if X_[col].dtype.name == 'category' else 'numerical' + for col in X_.columns] + automl_fitted = automl.fit(X, y, feat_type=feat_type) + assert automl is automl_fitted + + automl_ensemble_fitted = automl.fit_ensemble(y, ensemble_size=5) + assert automl is automl_ensemble_fitted + + # We want to make sure we can learn from this data. + # This is a test to make sure the data format (numpy/pandas) + # can be used in a meaningful way -- not meant for generalization, + # hence we use the train dataset + assert automl_fitted.score(X, y) > 0.75 diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 608e000982..07b2670a15 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -1,9 +1,7 @@ -import copy -import random - import numpy as np import pandas as pd +from pandas.api.types import is_numeric_dtype import pytest @@ -202,29 +200,14 @@ def test_featurevalidator_supported_types(input_data_featuretest): transformed_X = validator.transform(input_data_featuretest) if sparse.issparse(input_data_featuretest): assert sparse.issparse(transformed_X) + elif isinstance(input_data_featuretest, list): + assert isinstance(transformed_X, pd.DataFrame) else: - assert isinstance(transformed_X, np.ndarray) + assert isinstance(transformed_X, 
type(input_data_featuretest)) assert np.shape(input_data_featuretest) == np.shape(transformed_X) - assert np.issubdtype(transformed_X.dtype, np.number) assert validator._is_fitted -@pytest.mark.parametrize( - 'input_data_featuretest', - ( - 'list_categoricalonly_nonan', - 'list_categoricalonly_nan', - 'list_mixed_nonan', - 'list_mixed_nan', - ), - indirect=True -) -def test_featurevalidator_unsupported_list(input_data_featuretest): - validator = FeatureValidator() - with pytest.raises(ValueError, match=r".*has invalid type object. Cast it to a valid dtype.*"): - validator.fit(input_data_featuretest) - - @pytest.mark.parametrize( 'input_data_featuretest', ( @@ -239,21 +222,6 @@ def test_featurevalidator_unsupported_numpy(input_data_featuretest): validator.fit(input_data_featuretest) -@pytest.mark.parametrize( - 'input_data_featuretest', - ( - 'pandas_categoricalonly_nan', - 'pandas_mixed_nan', - 'openml_179', # adult workclass has NaN in columns - ), - indirect=True -) -def test_featurevalidator_unsupported_pandas(input_data_featuretest): - validator = FeatureValidator() - with pytest.raises(ValueError, match=r"Categorical features in a dataframe.*missing/NaN"): - validator.fit(input_data_featuretest) - - @pytest.mark.parametrize( 'input_data_featuretest', ( @@ -263,8 +231,6 @@ def test_featurevalidator_unsupported_pandas(input_data_featuretest): 'numpy_mixed_nan', 'pandas_categoricalonly_nonan', 'pandas_mixed_nonan', - 'list_numericalonly_nonan', - 'list_numericalonly_nan', 'sparse_bsr_nonan', 'sparse_bsr_nan', 'sparse_coo_nonan', @@ -285,8 +251,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): """ Check if we can fit in a given type (numpy) yet transform if the user changes the type (pandas then) - - This is problematic only in the case we create an encoder """ validator = FeatureValidator() validator.fit(input_data_featuretest, input_data_featuretest) @@ -302,13 +266,12 @@ def 
test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): raise ValueError(type(input_data_featuretest)) transformed_X = validator.transform(complementary_type) assert np.shape(input_data_featuretest) == np.shape(transformed_X) - assert np.issubdtype(transformed_X.dtype, np.number) assert validator._is_fitted -def test_featurevalidator_get_columns_to_encode(): +def test_featurevalidatorget_feat_type_from_columns(): """ - Makes sure that encoded columns are returned by _get_columns_to_encode + Makes sure that encoded columns are returned by get_feat_type_from_columns whereas numerical columns are not returned """ validator = FeatureValidator() @@ -321,10 +284,12 @@ def test_featurevalidator_get_columns_to_encode(): for col in df.columns: df[col] = df[col].astype(col) - enc_columns, feature_types = validator._get_columns_to_encode(df) + feature_types = validator.get_feat_type_from_columns(df) - assert enc_columns == ['category', 'bool'] - assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] + assert feature_types == {'int': 'numerical', + 'float': 'numerical', + 'category': 'categorical', + 'bool': 'categorical'} def test_features_unsupported_calls_are_raised(): @@ -338,12 +303,9 @@ def test_features_unsupported_calls_are_raised(): validator.fit( pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}) ) - with pytest.raises(ValueError, match="has invalid type object"): - validator.fit( - pd.DataFrame({'string': ['foo']}) - ) with pytest.raises(ValueError, match=r"Auto-sklearn only supports.*yet, the provided input"): validator.fit({'input1': 1, 'input2': 2}) + validator = FeatureValidator() with pytest.raises(ValueError, match=r"has unsupported dtype string"): validator.fit(pd.DataFrame([{'A': 1, 'B': 2}], dtype='string')) with pytest.raises(ValueError, match=r"The feature dimensionality of the train and test"): @@ -352,104 +314,19 @@ def test_features_unsupported_calls_are_raised(): ) with pytest.raises(ValueError, 
match=r"Cannot call transform on a validator that is not fit"): validator.transform(np.array([[1, 2, 3], [4, 5, 6]])) - validator.feat_type = ['Numerical'] + validator = FeatureValidator(feat_type=['Numerical']) with pytest.raises(ValueError, match=r"providing the option feat_type to the fit method is.*"): validator.fit(pd.DataFrame([[1, 2, 3], [4, 5, 6]])) - with pytest.raises(ValueError, match=r"Array feat_type does not have same number of.*"): + with pytest.raises(ValueError, match=r"feat_type does not have same number of.*"): validator.fit(np.array([[1, 2, 3], [4, 5, 6]])) - validator.feat_type = [1, 2, 3] - with pytest.raises(ValueError, match=r"Array feat_type must only contain strings.*"): + validator = FeatureValidator(feat_type=[1, 2, 3]) + with pytest.raises(ValueError, match=r"feat_type must only contain strings.*"): validator.fit(np.array([[1, 2, 3], [4, 5, 6]])) - validator.feat_type = ['1', '2', '3'] + validator = FeatureValidator(feat_type=['1', '2', '3']) with pytest.raises(ValueError, match=r"Only `Categorical` and `Numerical` are.*"): validator.fit(np.array([[1, 2, 3], [4, 5, 6]])) -@pytest.mark.parametrize( - 'input_data_featuretest', - ( - 'numpy_numericalonly_nonan', - 'numpy_numericalonly_nan', - 'pandas_numericalonly_nonan', - 'pandas_numericalonly_nan', - 'list_numericalonly_nonan', - 'list_numericalonly_nan', - # Category in numpy is handled via feat_type - 'numpy_categoricalonly_nonan', - 'numpy_mixed_nonan', - 'numpy_categoricalonly_nan', - 'numpy_mixed_nan', - 'sparse_bsr_nonan', - 'sparse_bsr_nan', - 'sparse_coo_nonan', - 'sparse_coo_nan', - 'sparse_csc_nonan', - 'sparse_csc_nan', - 'sparse_csr_nonan', - 'sparse_csr_nan', - 'sparse_dia_nonan', - 'sparse_dia_nan', - 'sparse_dok_nonan', - 'sparse_dok_nan', - 'sparse_lil_nonan', - 'sparse_lil_nan', - ), - indirect=True -) -def test_no_encoder_created(input_data_featuretest): - """ - Makes sure that for numerical only features, no encoder is created - """ - validator = FeatureValidator() - 
validator.fit(input_data_featuretest) - validator.transform(input_data_featuretest) - assert validator.encoder is None - - -@pytest.mark.parametrize( - 'input_data_featuretest', - ( - 'pandas_categoricalonly_nonan', - 'pandas_mixed_nonan', - ), - indirect=True -) -def test_encoder_created(input_data_featuretest): - """ - This test ensures an encoder is created if categorical data is provided - """ - validator = FeatureValidator() - validator.fit(input_data_featuretest) - transformed_X = validator.transform(input_data_featuretest) - assert validator.encoder is not None - - # Make sure that the encoded features are actually encoded. Categorical columns are at - # the start after transformation. In our fixtures, this is also honored prior encode - enc_columns, feature_types = validator._get_columns_to_encode(input_data_featuretest) - - # At least one categorical - assert 'categorical' in validator.feat_type - - # Numerical if the original data has numerical only columns - if np.any([pd.api.types.is_numeric_dtype(input_data_featuretest[col] - ) for col in input_data_featuretest.columns]): - assert 'numerical' in validator.feat_type - for i, feat_type in enumerate(feature_types): - if 'numerical' in feat_type: - np.testing.assert_array_equal( - transformed_X[:, i], - input_data_featuretest[input_data_featuretest.columns[i]].to_numpy() - ) - elif 'categorical' in feat_type: - np.testing.assert_array_equal( - transformed_X[:, i], - # Expect always 0, 1... 
because we use a ordinal encoder - np.array([0, 1]) - ) - else: - raise ValueError(feat_type) - - def test_no_new_category_after_fit(): """ This test makes sure that we can actually pass new categories to the estimator @@ -463,27 +340,6 @@ def test_no_new_category_after_fit(): validator.transform(x) -def test_unknown_encode_value(): - x = pd.DataFrame([ - {'a': -41, 'b': -3, 'c': 'a', 'd': -987.2}, - {'a': -21, 'b': -3, 'c': 'a', 'd': -9.2}, - {'a': 0, 'b': -4, 'c': 'b', 'd': -97.2}, - {'a': -51, 'b': -3, 'c': 'a', 'd': 987.2}, - {'a': 500, 'b': -3, 'c': 'a', 'd': -92}, - ]) - x['c'] = x['c'].astype('category') - validator = FeatureValidator() - - # Make sure that this value is honored - validator.fit(x) - x['c'].cat.add_categories(['NA'], inplace=True) - x.loc[0, 'c'] = 'NA' # unknown value - x_t = validator.transform(x) - # The first row should have a -1 as we added a new categorical there - expected_row = [-1, -41, -3, -987.2] - assert expected_row == x_t[0].tolist() - - # Actual checks for the features @pytest.mark.parametrize( 'openml_id', @@ -529,20 +385,83 @@ def test_featurevalidator_new_data_after_fit(openml_id, # Basic Checking if sparse.issparse(input_data_featuretest): assert sparse.issparse(transformed_X) + elif isinstance(input_data_featuretest, list): + assert isinstance(transformed_X, pd.DataFrame) else: - assert isinstance(transformed_X, np.ndarray) + assert isinstance(transformed_X, type(X_train)) assert np.shape(X_test) == np.shape(transformed_X) - # And then check proper error messages - if train_data_type == 'pandas': - old_dtypes = copy.deepcopy(validator.dtypes) - validator.dtypes = ['dummy' for dtype in X_train.dtypes] - with pytest.raises(ValueError, match=r"hanging the dtype of the features after fit"): - transformed_X = validator.transform(X_test) - validator.dtypes = old_dtypes - if test_data_type == 'pandas': - columns = X_test.columns.tolist() - random.shuffle(columns) - X_test = X_test[columns] - with pytest.raises(ValueError, 
match=r"Changing the column order of the features"): - transformed_X = validator.transform(X_test) + +@pytest.mark.parametrize( + 'openml_id', + ( + 40981, # Australian + 3, # kr-vs-kp + 1468, # cnae-9 + 40975, # car + 40984, # Segment + 2, # anneal + ), +) +def test_list_to_dataframe(openml_id): + + X_pandas, y_pandas = sklearn.datasets.fetch_openml(data_id=openml_id, + return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X_pandas, y_pandas, random_state=1) + + X_list = X_train.values.tolist() + validator = FeatureValidator() + validator.fit(X_list) + transformed_X = validator.transform(X_list) + for i, col in enumerate(X_pandas.columns): + if is_numeric_dtype(X_pandas[col].dtype): + # convert dtype translates 72.0 to 72. Be robust against this! + assert is_numeric_dtype(transformed_X[i].dtype) + else: + assert X_pandas[col].dtype.name == transformed_X[i].dtype.name, col + + # Also make sure that at testing time + # this work + transformed_X = validator.transform(X_test.values.tolist()) + for i, col in enumerate(X_pandas.columns): + if is_numeric_dtype(X_pandas[col].dtype): + # convert dtype translates 72.0 to 72. Be robust against this! 
+ assert is_numeric_dtype(transformed_X[i].dtype) + else: + assert X_pandas[col].dtype.name == transformed_X[i].dtype.name, col + + +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'sparse_lil_nonan', + 'sparse_lil_nan', + ), + indirect=True +) +def test_sparse_output_is_csr(input_data_featuretest): + validator = FeatureValidator() + validator.fit(input_data_featuretest, input_data_featuretest) + transformed_X = validator.transform(input_data_featuretest) + assert sparse.issparse(transformed_X) + assert isinstance(transformed_X, sparse.csr_matrix) + + +def test_unsupported_dataframe_sparse(): + df = pd.DataFrame({'A': pd.Series(pd.arrays.SparseArray(np.random.randn(10)))}) + validator = FeatureValidator() + with pytest.raises(ValueError, match=r"Auto-sklearn does not yet support sparse pandas"): + validator.fit(df) diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index 47e74a6776..7bc2cb3dc5 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -43,15 +43,10 @@ def test_data_validation_for_classification(openmlid, as_frame): assert np.any(pd.isnull(X_train_t).all(axis=0)) # make sure everything was encoded to number - assert np.issubdtype(X_train_t.dtype, np.number) assert np.issubdtype(y_train_t.dtype, np.number) - # Categorical columns are sorted to the beginning - if as_frame: - validator.feature_validator.feat_type is not None - ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_type)) - if len(ordered_unique_elements) > 1: - assert ordered_unique_elements[0] == 'categorical' + # Make sure we created a feat type + validator.feature_validator.feat_type is not None @pytest.mark.parametrize('openmlid', [505, 
546, 531]) @@ -84,16 +79,7 @@ def test_data_validation_for_regression(openmlid, as_frame): elif not as_frame and np.any(pd.isnull(X_train).all(axis=0)): assert np.any(pd.isnull(X_train_t).all(axis=0)) - # make sure everything was encoded to number - assert np.issubdtype(X_train_t.dtype, np.number) - assert np.issubdtype(y_train_t.dtype, np.number) - - # Categorical columns are sorted to the beginning - if as_frame: - validator.feature_validator.feat_type is not None - ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_type)) - if len(ordered_unique_elements) > 1: - assert ordered_unique_elements[0] == 'categorical' + validator.feature_validator.feat_type is not None def test_sparse_data_validation_for_regression(): diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py index cc5634d013..db48703042 100644 --- a/test/test_evaluation/evaluation_util.py +++ b/test/test_evaluation/evaluation_util.py @@ -123,7 +123,10 @@ def get_multiclass_classification_datamanager(): 'X_test': X_test, 'Y_test': Y_test } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] + D.feat_type = {0: 'numerical', + 1: 'Numerical', + 2: 'numerical', + 3: 'numerical'} return D @@ -131,9 +134,10 @@ def get_abalone_datamanager(): # https://www.openml.org/d/183 dataset_name = 'abalone' data = sklearn.datasets.fetch_openml(data_id=183, as_frame=True) - feat_type = [ - 'Categorical' if x.name == 'category' else 'Numerical' for x in data['data'].dtypes - ] + feat_type = { + i: 'Categorical' if x.name == 'category' else 'Numerical' + for i, x in enumerate(data['data'].dtypes) + } X, y = sklearn.datasets.fetch_openml(data_id=183, return_X_y=True, as_frame=False) y = preprocessing.LabelEncoder().fit_transform(y) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( @@ -186,7 +190,10 @@ def get_multilabel_classification_datamanager(): 'X_test': X_test, 'Y_test': Y_test } - D.feat_type = ['numerical', 
'Numerical', 'numerical', 'numerical'] + D.feat_type = {0: 'numerical', + 1: 'Numerical', + 2: 'numerical', + 3: 'numerical'} return D @@ -225,7 +232,10 @@ def get_binary_classification_datamanager(): 'X_test': X_test, 'Y_test': Y_test.reshape((-1, 1)) } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical'] + D.feat_type = {0: 'numerical', + 1: 'Numerical', + 2: 'numerical', + 3: 'numerical'} return D @@ -256,9 +266,7 @@ def get_regression_datamanager(): 'X_test': X_test, 'Y_test': Y_test.reshape((-1, 1)) } - D.feat_type = ['numerical', 'Numerical', 'numerical', 'numerical', - 'numerical', 'numerical', 'numerical', 'numerical', - 'numerical', 'numerical', 'numerical'] + D.feat_type = {i: 'numerical' for i in range(X_train.shape[1])} return D @@ -290,7 +298,7 @@ def get_500_classes_datamanager(): 'X_valid': X[700:710], 'Y_valid': Y[700:710], 'X_test': X[710:], 'Y_test': Y[710:] } - D.feat_type = ['numerical'] * 20 + D.feat_type = {i: 'numerical' for i in range(20)} return D diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index d923be5eba..d0e7c6066e 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -1063,7 +1063,7 @@ def test_get_splitter(self, te_mock): D = unittest.mock.Mock(spec=AbstractDataManager) D.data = dict(Y_train=np.array([0, 0, 0, 1, 1, 1])) D.info = dict(task=BINARY_CLASSIFICATION) - D.feat_type = [] + D.feat_type = {} # holdout, binary classification evaluator = TrainEvaluator() @@ -1218,7 +1218,7 @@ def test_get_splitter_cv_object(self, te_mock): D = unittest.mock.Mock(spec=AbstractDataManager) D.data = dict(Y_train=np.array([0, 0, 0, 1, 1, 1])) D.info = dict(task=BINARY_CLASSIFICATION) - D.feat_type = [] + D.feat_type = {} # GroupKFold, classification with args D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1]) @@ -2139,7 +2139,7 @@ def test_get_splitter_cv_object(self, te_mock): def test_holdout_split_size(self, 
te_mock): te_mock.return_value = None D = unittest.mock.Mock(spec=AbstractDataManager) - D.feat_type = [] + D.feat_type = {} evaluator = TrainEvaluator() evaluator.resampling_strategy = 'holdout' diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features.py b/test/test_metalearning/pyMetaLearn/test_meta_features.py index 1b8cbb76ea..9ad97f68b9 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_features.py @@ -1,14 +1,16 @@ import logging import os import tempfile -from io import StringIO -from unittest import TestCase import unittest +import pandas as pd + +import pytest + import arff from joblib import Memory import numpy as np -from sklearn.datasets import make_multilabel_classification +from sklearn.datasets import make_multilabel_classification, fetch_openml from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \ import DataPreprocessor @@ -16,475 +18,734 @@ import autosklearn.metalearning.metafeatures.metafeatures as meta_features -class MetaFeaturesTest(TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - self.cwd = os.getcwd() - tests_dir = __file__ - os.chdir(os.path.dirname(tests_dir)) - - decoder = arff.ArffDecoder() - with open(os.path.join("datasets", "dataset.arff")) as fh: - dataset = decoder.decode(fh, encode_nominal=True) - - # -1 because the last attribute is the class - self.attribute_types = [ - 'numeric' if type(type_) != list else 'nominal' - for name, type_ in dataset['attributes'][:-1]] - self.categorical = [True if attribute == 'nominal' else False - for attribute in self.attribute_types] - - data = np.array(dataset['data'], dtype=np.float64) - X = data[:, :-1] - y = data[:, -1].reshape((-1,)) - - DPP = DataPreprocessor(categorical_features=self.categorical) - X_transformed = DPP.fit_transform(X) - - # Transform the array which indicates the categorical metafeatures - number_numerical = np.sum(~np.array(self.categorical)) - 
categorical_transformed = [True] * (X_transformed.shape[1] - - number_numerical) + \ - [False] * number_numerical - self.categorical_transformed = categorical_transformed - - self.X = X - self.X_transformed = X_transformed - self.y = y - self.mf = meta_features.metafeatures - self.helpers = meta_features.helper_functions - - # Create a logger for testing - self.logger = logging.getLogger() - - # Precompute some helper functions - self.helpers.set_value( - "PCA", self.helpers["PCA"](self.X_transformed, self.y, self.logger), - ) - self.helpers.set_value( - "MissingValues", - self.helpers["MissingValues"](self.X, self.y, self.logger, self.categorical), - ) - self.helpers.set_value( - "NumSymbols", - self.helpers["NumSymbols"](self.X, self.y, self.logger, self.categorical), - ) - self.helpers.set_value( - "ClassOccurences", - self.helpers["ClassOccurences"](self.X, self.y, self.logger), - ) - self.helpers.set_value( - "Skewnesses", - self.helpers["Skewnesses"](self.X_transformed, self.y, - self.logger, self.categorical_transformed), - ) - self.helpers.set_value( - "Kurtosisses", - self.helpers["Kurtosisses"](self.X_transformed, self.y, - self.logger, self.categorical_transformed), - ) - - def tearDown(self): - os.chdir(self.cwd) - - def get_multilabel(self): - cache = Memory(location=tempfile.gettempdir()) - cached_func = cache.cache(make_multilabel_classification) - return cached_func( - n_samples=100, - n_features=10, - n_classes=5, - n_labels=5, - return_indicator=True, - random_state=1 +@pytest.fixture( + scope='class', + params=('pandas', 'numpy') +) +def multilabel_train_data(request): + cache = Memory(location=tempfile.gettempdir()) + cached_func = cache.cache(make_multilabel_classification) + X, y = cached_func( + n_samples=100, + n_features=10, + n_classes=5, + n_labels=5, + return_indicator=True, + random_state=1 + ) + if request.param == 'numpy': + return X, y + elif request.param == 'pandas': + return pd.DataFrame(X), y + else: + raise 
ValueError(request.param) + + +@pytest.fixture( + scope='class', + params=('pandas', 'numpy') +) +def meta_train_data(request): + tests_dir = __file__ + os.chdir(os.path.dirname(tests_dir)) + + decoder = arff.ArffDecoder() + with open(os.path.join("datasets", "dataset.arff")) as fh: + dataset = decoder.decode(fh, encode_nominal=True) + + # -1 because the last attribute is the class + attribute_types = [ + 'numeric' if type(type_) != list else 'nominal' + for name, type_ in dataset['attributes'][:-1]] + + categorical = {i: True if attribute == 'nominal' else False + for i, attribute in enumerate(attribute_types)} + + data = np.array(dataset['data'], dtype=np.float64) + X = data[:, :-1] + y = data[:, -1].reshape((-1,)) + + logger = logging.getLogger('Meta') + meta_features.helper_functions.set_value( + "MissingValues", meta_features.helper_functions["MissingValues"](X, y, logger, categorical), + ) + meta_features.helper_functions.set_value( + "NumSymbols", + meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), + ) + meta_features.helper_functions.set_value( + "ClassOccurences", + meta_features.helper_functions["ClassOccurences"](X, y, logger), + ) + if request.param == 'numpy': + return X, y, categorical + elif request.param == 'pandas': + return pd.DataFrame(X), y, categorical + else: + raise ValueError(request.param) + + +@pytest.fixture( + scope='class', + params=('pandas', 'numpy') +) +def meta_train_data_transformed(request): + tests_dir = __file__ + os.chdir(os.path.dirname(tests_dir)) + + decoder = arff.ArffDecoder() + with open(os.path.join("datasets", "dataset.arff")) as fh: + dataset = decoder.decode(fh, encode_nominal=True) + + # -1 because the last attribute is the class + attribute_types = [ + 'numeric' if type(type_) != list else 'nominal' + for name, type_ in dataset['attributes'][:-1]] + categorical = {i: True if attribute == 'nominal' else False + for i, attribute in enumerate(attribute_types)} + + data = np.array(dataset['data'], 
dtype=np.float64) + X = data[:, :-1] + y = data[:, -1].reshape((-1,)) + + logger = logging.getLogger('Meta') + meta_features.helper_functions.set_value( + "MissingValues", meta_features.helper_functions["MissingValues"](X, y, logger, categorical), ) + meta_features.helper_functions.set_value( + "NumSymbols", + meta_features.helper_functions["NumSymbols"](X, y, logger, categorical), + ) + meta_features.helper_functions.set_value( + "ClassOccurences", + meta_features.helper_functions["ClassOccurences"](X, y, logger), + ) + + DPP = DataPreprocessor(feat_type={ + col: 'categorical' if category else 'numerical' for col, category in categorical.items() + }) + X_transformed = DPP.fit_transform(X) + + number_numerical = np.sum(~np.array(list(categorical.values()))) + categorical_transformed = {i: True if i < (X_transformed.shape[1] - number_numerical) else False + for i in range(X_transformed.shape[1])} + + # pre-compute values for transformed inputs + meta_features.helper_functions.set_value( + "PCA", meta_features.helper_functions["PCA"](X_transformed, y, logger), + ) + meta_features.helper_functions.set_value( + "Skewnesses", meta_features.helper_functions["Skewnesses"]( + X_transformed, y, logger, categorical_transformed), + ) + meta_features.helper_functions.set_value( + "Kurtosisses", meta_features.helper_functions["Kurtosisses"]( + X_transformed, y, logger, categorical_transformed) + ) + + if request.param == 'numpy': + return X_transformed, y, categorical_transformed + elif request.param == 'pandas': + return pd.DataFrame(X_transformed), y, categorical_transformed + else: + raise ValueError(request.param) + + +def test_number_of_instance(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["NumberOfInstances"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 898 + assert isinstance(mf, MetaFeatureValue) + + +def test_number_of_classes(meta_train_data): + X, y, categorical = meta_train_data + mf = 
meta_features.metafeatures["NumberOfClasses"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 5 + assert isinstance(mf, MetaFeatureValue) + + +def test_number_of_features(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["NumberOfFeatures"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 38 + assert isinstance(mf, MetaFeatureValue) + + +def test_missing_values(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.helper_functions["MissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert isinstance(mf.value, pd.DataFrame if hasattr(X, 'iloc') else np.ndarray) + assert mf.value.shape == X.shape + assert 22175 == np.count_nonzero(mf.value) + + +def test_number_of_Instances_with_missing_values(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 898 + assert isinstance(mf, MetaFeatureValue) + + +def test_percentage_of_Instances_with_missing_values(meta_train_data): + X, y, categorical = meta_train_data + meta_features.metafeatures.set_value( + "NumberOfInstancesWithMissingValues", + meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical), + ) + mf = meta_features.metafeatures["PercentageOfInstancesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == 1.0 + assert isinstance(mf, MetaFeatureValue) + + +def test_number_of_features_with_missing_values(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 29 + assert isinstance(mf, MetaFeatureValue) + + +def test_percentage_of_features_with_missing_values(meta_train_data): + X, y, categorical = meta_train_data + 
meta_features.metafeatures.set_value( + "NumberOfFeaturesWithMissingValues", + meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical)) + mf = meta_features.metafeatures["PercentageOfFeaturesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == float(29)/float(38) + assert isinstance(mf, MetaFeatureValue) + + +def test_number_of_missing_values(meta_train_data): + X, y, categorical = meta_train_data + np.save('/tmp/debug', X) + mf = meta_features.metafeatures["NumberOfMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 22175 + assert isinstance(mf, MetaFeatureValue) + + +def test_percentage_missing_values(meta_train_data): + X, y, categorical = meta_train_data + meta_features.metafeatures.set_value( + "NumberOfMissingValues", meta_features.metafeatures["NumberOfMissingValues"]( + X, y, logging.getLogger('Meta'), categorical)) + mf = meta_features.metafeatures["PercentageOfMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == (float(22175)/float(38*898)) + assert isinstance(mf, MetaFeatureValue) + + +def test_number_of_numeric_features(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["NumberOfNumericFeatures"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 6 + assert isinstance(mf, MetaFeatureValue) + + +def test_number_of_categorical_features(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["NumberOfCategoricalFeatures"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 32 + assert isinstance(mf, MetaFeatureValue) + + +def test_ratio_numerical_to_categorical(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["RatioNumericalToNominal"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == 
(float(6)/float(32)) + assert isinstance(mf, MetaFeatureValue) + + +def test_ratio_categorical_to_numerical(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["RatioNominalToNumerical"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == (float(32)/float(6)) + assert isinstance(mf, MetaFeatureValue) + + +def test_dataset_ratio(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["DatasetRatio"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == (float(38)/float(898)) + assert isinstance(mf, MetaFeatureValue) + + +def test_inverse_dataset_ratio(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["InverseDatasetRatio"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == (float(898)/float(38)) + assert isinstance(mf, MetaFeatureValue) + + +def test_class_occurences(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == {0.0: 8.0, 1.0: 99.0, 2.0: 684.0, 4.0: 67.0, 5.0: 40.0} + + +def test_class_probability_min(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["ClassProbabilityMin"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == (float(8)/float(898)) + assert isinstance(mf, MetaFeatureValue) + + +def test_class_probability_max(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["ClassProbabilityMax"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(mf.value) == (float(684)/float(898)) + assert isinstance(mf, MetaFeatureValue) + + +def test_class_probability_mean(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["ClassProbabilityMean"]( + X, y, 
logging.getLogger('Meta'), categorical) + classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) + prob_mean = (classes / float(898)).mean() + assert pytest.approx(mf.value) == prob_mean + assert isinstance(mf, MetaFeatureValue) + + +def test_class_probability_std(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["ClassProbabilitySTD"]( + X, y, logging.getLogger('Meta'), categorical) + classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) + prob_std = (classes / float(898)).std() + assert pytest.approx(mf.value) == prob_std + assert isinstance(mf, MetaFeatureValue) + + +def test_num_symbols(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.helper_functions["NumSymbols"]( + X, y, logging.getLogger('Meta'), categorical) + symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, 0, + 1, 1, 1, 0, 1, 1, 0, 3, 1, 0, 0, 0, 2, 2, 3, 2] + assert mf.value == symbol_frequency + + +def test_symbols_min(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["SymbolsMin"](X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 1 + + +def test_symbols_max(meta_train_data): + X, y, categorical = meta_train_data + # this is attribute steel + mf = meta_features.metafeatures["SymbolsMax"](X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 7 + + +def test_symbols_mean(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["SymbolsMean"]( + X, y, logging.getLogger('Meta'), categorical) + # Empty looking spaces denote empty attributes + symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, # + 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2] + assert pytest.approx(mf.value) == np.mean(symbol_frequency) + + +def test_symbols_std(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["SymbolsSTD"](X, y, logging.getLogger('Meta'), categorical) + symbol_frequency = [2, 1, 7, 1, 2, 4, 
1, 1, 4, 2, 1, 1, 1, 2, 1, # + 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2] + assert pytest.approx(mf.value) == np.std(symbol_frequency) + + +def test_symbols_sum(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["SymbolsSum"](X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 49 + + +def test_class_entropy(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.metafeatures["ClassEntropy"]( + X, y, logging.getLogger('Meta'), categorical) + classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) + classes = classes / sum(classes) + entropy = -np.sum([c * np.log2(c) for c in classes]) + + assert pytest.approx(mf.value) == entropy + + +def test_calculate_all_metafeatures(meta_train_data): + X, y, categorical = meta_train_data + mf = meta_features.calculate_all_metafeatures( + X, y, categorical, "2", logger=logging.getLogger('Meta')) + assert 52 == len(mf.metafeature_values) + assert mf.metafeature_values['NumberOfCategoricalFeatures'].value == 32 + + +def test_kurtosisses(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + mf = meta_features.helper_functions["Kurtosisses"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + assert 6 == len(mf.value) + + +def test_kurtosis_min(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["KurtosisMin"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_kurtosis_max(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? 
+ meta_features.metafeatures["KurtosisMax"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_kurtosis_mean(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["KurtosisMean"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_kurtosis_std(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["KurtosisSTD"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_skewnesses(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + mf = meta_features.helper_functions["Skewnesses"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + assert 6 == len(mf.value) + + +def test_skewness_min(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["SkewnessMin"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_skewness_max(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["SkewnessMax"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_skewness_mean(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? 
+ meta_features.metafeatures["SkewnessMean"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_skewness_std(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["SkewnessSTD"]( + X_transformed, y, logging.getLogger('Meta'), categorical_transformed) + + +def test_landmark_lda(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["LandmarkLDA"](X_transformed, y, logging.getLogger('Meta')) + + +def test_landmark_naive_bayes(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["LandmarkNaiveBayes"]( + X_transformed, y, logging.getLogger('Meta')) + + +def test_landmark_decision_tree(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["LandmarkDecisionTree"]( + X_transformed, y, logging.getLogger('Meta')) + + +def test_decision_node(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["LandmarkDecisionNodeLearner"]( + X_transformed, y, logging.getLogger('Meta')) + + +def test_random_node(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? 
+ meta_features.metafeatures["LandmarkRandomNodeLearner"]( + X_transformed, y, logging.getLogger('Meta')) + + +@unittest.skip("Currently not implemented!") +def test_worst_node(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["LandmarkWorstNodeLearner"]( + X_transformed, y, logging.getLogger('Meta')) + + +def test_1NN(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + # TODO: somehow compute the expected output? + meta_features.metafeatures["Landmark1NN"](X_transformed, y, logging.getLogger('Meta')) + + +def test_pca(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + meta_features.helper_functions["PCA"](X_transformed, y, logging.getLogger('Meta')) + + +def test_pca_95percent(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + mf = meta_features.metafeatures["PCAFractionOfComponentsFor95PercentVariance"]( + X_transformed, y, logging.getLogger('Meta')) + assert pytest.approx(0.2716049382716049) == mf.value + + +def test_pca_kurtosis_first_pc(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + mf = meta_features.metafeatures["PCAKurtosisFirstPC"]( + X_transformed, y, logging.getLogger('Meta')) + assert pytest.approx(-0.702850) != mf.value + + +def test_pca_skewness_first_pc(meta_train_data_transformed): + X_transformed, y, categorical_transformed = meta_train_data_transformed + mf = meta_features.metafeatures["PCASkewnessFirstPC"]( + X_transformed, y, logging.getLogger('Meta')) + assert pytest.approx(0.051210) != mf.value + + +def test_class_occurences_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.helper_functions["ClassOccurences"](X, y, logging.getLogger('Meta')) + assert mf.value 
== [{0: 16.0, 1: 84.0}, + {0: 8.0, 1: 92.0}, + {0: 68.0, 1: 32.0}, + {0: 15.0, 1: 85.0}, + {0: 28.0, 1: 72.0}] + + +def test_class_probability_min_multilabel(multilabel_train_data): + X, y = multilabel_train_data + meta_features.helper_functions.set_value( + "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger('Meta'))) + mf = meta_features.metafeatures["ClassProbabilityMin"](X, y, logging.getLogger('Meta')) + assert pytest.approx(mf.value) == (float(8) / float(100)) + assert isinstance(mf, MetaFeatureValue) + + +def test_class_probability_max_multilabel(multilabel_train_data): + X, y = multilabel_train_data + meta_features.helper_functions.set_value( + "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger('Meta'))) + mf = meta_features.metafeatures["ClassProbabilityMax"](X, y, logging.getLogger('Meta')) + assert pytest.approx(mf.value) == (float(92) / float(100)) + assert isinstance(mf, MetaFeatureValue) + - def test_number_of_instance(self): - mf = self.mf["NumberOfInstances"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 898) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_number_of_classes(self): - mf = self.mf["NumberOfClasses"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 5) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_number_of_classes_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["NumberOfClasses"](X, y, self.logger) - self.assertEqual(mf.value, 2) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_number_of_features(self): - mf = self.mf["NumberOfFeatures"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 38) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_missing_values(self): - mf = self.helpers["MissingValues"](self.X, self.y, self.logger, self.categorical) - self.assertIsInstance(mf.value, np.ndarray) - 
self.assertEqual(mf.value.shape, self.X.shape) - self.assertEqual(22175, np.sum(mf.value)) - - def test_number_of_Instances_with_missing_values(self): - mf = self.mf["NumberOfInstancesWithMissingValues"](self.X, self.y, self.logger, - self.categorical) - self.assertEqual(mf.value, 898) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_percentage_of_Instances_with_missing_values(self): - self.mf.set_value( - "NumberOfInstancesWithMissingValues", - self.mf["NumberOfInstancesWithMissingValues"]( - self.X, self.y, self.logger, self.categorical), - ) - mf = self.mf["PercentageOfInstancesWithMissingValues"](self.X, self.y, - self.logger, self.categorical) - self.assertAlmostEqual(mf.value, 1.0) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_number_of_features_with_missing_values(self): - mf = self.mf["NumberOfFeaturesWithMissingValues"](self.X, self.y, - self.logger, self.categorical) - self.assertEqual(mf.value, 29) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_percentage_of_features_with_missing_values(self): - self.mf.set_value( - "NumberOfFeaturesWithMissingValues", - self.mf["NumberOfFeaturesWithMissingValues"](self.X, self.y, - self.logger, self.categorical)) - mf = self.mf["PercentageOfFeaturesWithMissingValues"](self.X, self.y, - self.logger, self.categorical) - self.assertAlmostEqual(mf.value, float(29)/float(38)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_number_of_missing_values(self): - mf = self.mf["NumberOfMissingValues"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 22175) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_percentage_missing_values(self): - self.mf.set_value("NumberOfMissingValues", - self.mf["NumberOfMissingValues"](self.X, self.y, - self.logger, self.categorical)) - mf = self.mf["PercentageOfMissingValues"](self.X, self.y, self.logger, self.categorical) - self.assertAlmostEqual(mf.value, float(22175)/float((38*898))) - self.assertIsInstance(mf, 
MetaFeatureValue) - - def test_number_of_numeric_features(self): - mf = self.mf["NumberOfNumericFeatures"](self.X, self.y, self.logger, - self.categorical) - self.assertEqual(mf.value, 6) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_number_of_categorical_features(self): - mf = self.mf["NumberOfCategoricalFeatures"](self.X, self.y, self.logger, - self.categorical) - self.assertEqual(mf.value, 32) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_ratio_numerical_to_categorical(self): - mf = self.mf["RatioNumericalToNominal"](self.X, self.y, self.logger, - self.categorical) - self.assertAlmostEqual(mf.value, float(6)/float(32)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_ratio_categorical_to_numerical(self): - mf = self.mf["RatioNominalToNumerical"](self.X, self.y, self.logger, - self.categorical) - self.assertAlmostEqual(mf.value, float(32)/float(6)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_dataset_ratio(self): - mf = self.mf["DatasetRatio"](self.X, self.y, self.logger, self.categorical) - self.assertAlmostEqual(mf.value, float(38)/float(898)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_inverse_dataset_ratio(self): - mf = self.mf["InverseDatasetRatio"](self.X, self.y, self.logger, self.categorical) - self.assertAlmostEqual(mf.value, float(898)/float(38)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_occurences(self): - mf = self.helpers["ClassOccurences"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, - {0.0: 8.0, 1.0: 99.0, 2.0: 684.0, 4.0: 67.0, 5.0: 40.0}) - - def test_class_occurences_multilabel(self): - X, y = self.get_multilabel() - mf = self.helpers["ClassOccurences"](X, y, self.logger) - self.assertEqual(mf.value, - [{0: 16.0, 1: 84.0}, - {0: 8.0, 1: 92.0}, - {0: 68.0, 1: 32.0}, - {0: 15.0, 1: 85.0}, - {0: 28.0, 1: 72.0}]) - - def test_class_probability_min(self): - mf = self.mf["ClassProbabilityMin"](self.X, self.y, self.logger, 
self.categorical) - self.assertAlmostEqual(mf.value, float(8)/float(898)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_probability_min_multilabel(self): - X, y = self.get_multilabel() - self.helpers.set_value("ClassOccurences", - self.helpers["ClassOccurences"](X, y, self.logger)) - mf = self.mf["ClassProbabilityMin"](X, y, self.logger) - self.assertAlmostEqual(mf.value, float(8) / float(100)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_probability_max(self): - mf = self.mf["ClassProbabilityMax"](self.X, self.y, self.logger, self.categorical) - self.assertAlmostEqual(mf.value, float(684)/float(898)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_probability_max_multilabel(self): - X, y = self.get_multilabel() - self.helpers.set_value("ClassOccurences", - self.helpers["ClassOccurences"](X, y, self.logger)) - mf = self.mf["ClassProbabilityMax"](X, y, self.logger) - self.assertAlmostEqual(mf.value, float(92) / float(100)) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_probability_mean(self): - mf = self.mf["ClassProbabilityMean"](self.X, self.y, self.logger, self.categorical) - classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) - prob_mean = (classes / float(898)).mean() - self.assertAlmostEqual(mf.value, prob_mean) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_probability_mean_multilabel(self): - X, y = self.get_multilabel() - self.helpers.set_value("ClassOccurences", - self.helpers["ClassOccurences"](X, y, self.logger)) - mf = self.mf["ClassProbabilityMean"](X, y, self.logger) - classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] - probas = np.mean([np.mean(np.array(cls_)) / 100 for cls_ in classes]) - self.assertAlmostEqual(mf.value, probas) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_probability_std(self): - mf = self.mf["ClassProbabilitySTD"](self.X, self.y, self.logger, self.categorical) - classes = np.array((8, 99, 684, 67, 40), 
dtype=np.float64) - prob_std = (classes / float(898)).std() - self.assertAlmostEqual(mf.value, prob_std) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_class_probability_std_multilabel(self): - X, y = self.get_multilabel() - self.helpers.set_value("ClassOccurences", - self.helpers["ClassOccurences"](X, y, self.logger)) - mf = self.mf["ClassProbabilitySTD"](X, y, self.logger) - classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] - probas = np.mean([np.std(np.array(cls_) / 100.) for cls_ in classes]) - self.assertAlmostEqual(mf.value, probas) - self.assertIsInstance(mf, MetaFeatureValue) - - def test_num_symbols(self): - mf = self.helpers["NumSymbols"](self.X, self.y, self.logger, self.categorical) - symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, 0, - 1, 1, 1, 0, 1, 1, 0, 3, 1, 0, 0, 0, 2, 2, 3, 2] - self.assertEqual(mf.value, symbol_frequency) - - def test_symbols_min(self): - mf = self.mf["SymbolsMin"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 1) - - def test_symbols_max(self): - # this is attribute steel - mf = self.mf["SymbolsMax"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 7) - - def test_symbols_mean(self): - mf = self.mf["SymbolsMean"](self.X, self.y, self.logger, self.categorical) - # Empty looking spaces denote empty attributes - symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, # - 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2] - self.assertAlmostEqual(mf.value, np.mean(symbol_frequency)) - - def test_symbols_std(self): - mf = self.mf["SymbolsSTD"](self.X, self.y, self.logger, self.categorical) - symbol_frequency = [2, 1, 7, 1, 2, 4, 1, 1, 4, 2, 1, 1, 1, 2, 1, # - 1, 1, 1, 1, 1, 3, 1, 2, 2, 3, 2] - self.assertAlmostEqual(mf.value, np.std(symbol_frequency)) - - def test_symbols_sum(self): - mf = self.mf["SymbolsSum"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 49) - - def test_kurtosisses(self): - mf = 
self.helpers["Kurtosisses"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - self.assertEqual(6, len(mf.value)) - - def test_kurtosis_min(self): - # TODO: somehow compute the expected output? - self.mf["KurtosisMin"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_kurtosis_max(self): - # TODO: somehow compute the expected output? - self.mf["KurtosisMax"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_kurtosis_mean(self): - # TODO: somehow compute the expected output? - self.mf["KurtosisMean"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_kurtosis_std(self): - # TODO: somehow compute the expected output? - self.mf["KurtosisSTD"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_skewnesses(self): - mf = self.helpers["Skewnesses"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - self.assertEqual(6, len(mf.value)) - - def test_skewness_min(self): - # TODO: somehow compute the expected output? - self.mf["SkewnessMin"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_skewness_max(self): - # TODO: somehow compute the expected output? - self.mf["SkewnessMax"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_skewness_mean(self): - # TODO: somehow compute the expected output? - self.mf["SkewnessMean"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_skewness_std(self): - # TODO: somehow compute the expected output? 
- self.mf["SkewnessSTD"](self.X_transformed, self.y, self.logger, - self.categorical_transformed) - - def test_class_entropy(self): - mf = self.mf["ClassEntropy"](self.X, self.y, self.logger, self.categorical) - classes = np.array((8, 99, 684, 67, 40), dtype=np.float64) - classes = classes / sum(classes) - entropy = -np.sum([c * np.log2(c) for c in classes]) - - self.assertAlmostEqual(mf.value, entropy) - - def test_class_entropy_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["ClassEntropy"](X, y, self.logger) - - classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] - entropies = [] - for cls in classes: - cls = np.array(cls, dtype=np.float32) - cls = cls / sum(cls) - entropy = -np.sum([c * np.log2(c) for c in cls]) - entropies.append(entropy) - - self.assertAlmostEqual(mf.value, np.mean(entropies)) - - def test_landmark_lda(self): - # TODO: somehow compute the expected output? - self.mf["LandmarkLDA"](self.X_transformed, self.y, self.logger) - - def test_landmark_lda_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["LandmarkLDA"](X, y, self.logger) - self.assertTrue(np.isfinite(mf.value)) - - def test_landmark_naive_bayes(self): - # TODO: somehow compute the expected output? - self.mf["LandmarkNaiveBayes"](self.X_transformed, self.y, self.logger) - - def test_landmark_naive_bayes_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["LandmarkNaiveBayes"](X, y, self.logger) - self.assertTrue(np.isfinite(mf.value)) - - def test_landmark_decision_tree(self): - # TODO: somehow compute the expected output? - self.mf["LandmarkDecisionTree"](self.X_transformed, self.y, self.logger) - - def test_landmark_decision_tree_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["LandmarkDecisionTree"](X, y, self.logger) - self.assertTrue(np.isfinite(mf.value)) - - def test_decision_node(self): - # TODO: somehow compute the expected output? 
- self.mf["LandmarkDecisionNodeLearner"](self.X_transformed, self.y, self.logger) - - def test_landmark_decision_node_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["LandmarkDecisionNodeLearner"](X, y, self.logger) - self.assertTrue(np.isfinite(mf.value)) - - def test_random_node(self): - # TODO: somehow compute the expected output? - self.mf["LandmarkRandomNodeLearner"](self.X_transformed, self.y, self.logger) - - def test_landmark_random_node_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["LandmarkRandomNodeLearner"](X, y, self.logger) - self.assertTrue(np.isfinite(mf.value)) - - @unittest.skip("Currently not implemented!") - def test_worst_node(self): - # TODO: somehow compute the expected output? - self.mf["LandmarkWorstNodeLearner"](self.X_transformed, self.y, self.logger) - - def test_1NN(self): - # TODO: somehow compute the expected output? - self.mf["Landmark1NN"](self.X_transformed, self.y, self.logger) - - def test_1NN_multilabel(self): - X, y = self.get_multilabel() - mf = self.mf["Landmark1NN"](X, y, self.logger) - self.assertTrue(np.isfinite(mf.value)) - - def test_pca(self): - self.helpers["PCA"](self.X_transformed, self.y, self.logger) - - def test_pca_95percent(self): - mf = self.mf["PCAFractionOfComponentsFor95PercentVariance"]( - self.X_transformed, self.y, self.logger) - self.assertAlmostEqual(0.2716049382716049, mf.value) - - def test_pca_kurtosis_first_pc(self): - mf = self.mf["PCAKurtosisFirstPC"](self.X_transformed, self.y, self.logger) - self.assertNotAlmostEqual(-0.702850, mf.value) - - def test_pca_skewness_first_pc(self): - mf = self.mf["PCASkewnessFirstPC"](self.X_transformed, self.y, self.logger) - self.assertNotAlmostEqual(0.051210, mf.value) - - def test_calculate_all_metafeatures(self): - mf = meta_features.calculate_all_metafeatures( - self.X, self.y, self.categorical, "2", logger=self.logger) - self.assertEqual(52, len(mf.metafeature_values)) - self.assertEqual(mf.metafeature_values[ - 
'NumberOfCategoricalFeatures'].value, 32) - sio = StringIO() - mf.dump(sio) - - def test_calculate_all_metafeatures_multilabel(self): - self.helpers.clear() - X, y = self.get_multilabel() - categorical = [False] * 10 - mf = meta_features.calculate_all_metafeatures( - X, y, categorical, "Generated", logger=self.logger) - self.assertEqual(52, len(mf.metafeature_values)) - sio = StringIO() - mf.dump(sio) - - -if __name__ == "__main__": - # suite = unittest.TestLoader().loadTestsFromTestCase(TestMetaFeatures) - # unittest.TextTestRunner(verbosity=2).run(suite) - t = unittest.TestLoader().loadTestsFromName( - "pyMetaLearn.metafeatures.test_meta_features.TestMetaFeatures" - ".test_calculate_all_metafeatures") - unittest.TextTestRunner(verbosity=2).run(t) +def test_class_probability_mean_multilabel(multilabel_train_data): + X, y = multilabel_train_data + meta_features.helper_functions.set_value( + "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger('Meta'))) + mf = meta_features.metafeatures["ClassProbabilityMean"](X, y, logging.getLogger('Meta')) + classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] + probas = np.mean([np.mean(np.array(cls_)) / 100 for cls_ in classes]) + assert mf.value == pytest.approx(probas) + assert isinstance(mf, MetaFeatureValue) + + +def test_number_of_classes_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["NumberOfClasses"](X, y, logging.getLogger('Meta')) + assert mf.value == 5 + assert isinstance(mf, MetaFeatureValue) + + +def test_class_probability_std_multilabel(multilabel_train_data): + X, y = multilabel_train_data + meta_features.helper_functions.set_value( + "ClassOccurences", meta_features.helper_functions["ClassOccurences"]( + X, y, logging.getLogger('Meta'))) + mf = meta_features.metafeatures["ClassProbabilitySTD"](X, y, logging.getLogger('Meta')) + classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] + probas = 
np.mean([np.std(np.array(cls_) / 100.) for cls_ in classes]) + assert pytest.approx(mf.value) == probas + assert isinstance(mf, MetaFeatureValue) + + +def test_class_entropy_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["ClassEntropy"](X, y, logging.getLogger('Meta')) + + classes = [(16, 84), (8, 92), (68, 32), (15, 85), (28, 72)] + entropies = [] + for cls in classes: + cls = np.array(cls, dtype=np.float32) + cls = cls / sum(cls) + entropy = -np.sum([c * np.log2(c) for c in cls]) + entropies.append(entropy) + + assert pytest.approx(mf.value) == np.mean(entropies) + + +def test_landmark_lda_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["LandmarkLDA"](X, y, logging.getLogger('Meta')) + assert np.isfinite(mf.value) + + +def test_landmark_naive_bayes_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["LandmarkNaiveBayes"](X, y, logging.getLogger('Meta')) + assert np.isfinite(mf.value) + + +def test_landmark_decision_tree_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["LandmarkDecisionTree"](X, y, logging.getLogger('Meta')) + assert np.isfinite(mf.value) + + +def test_landmark_decision_node_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["LandmarkDecisionNodeLearner"]( + X, y, logging.getLogger('Meta')) + assert np.isfinite(mf.value) + + +def test_landmark_random_node_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["LandmarkRandomNodeLearner"]( + X, y, logging.getLogger('Meta')) + assert np.isfinite(mf.value) + + +def test_1NN_multilabel(multilabel_train_data): + X, y = multilabel_train_data + mf = meta_features.metafeatures["Landmark1NN"](X, y, logging.getLogger('TestMeta')) + assert np.isfinite(mf.value) + + +def 
test_calculate_all_metafeatures_multilabel(multilabel_train_data): + meta_features.helper_functions.clear() + X, y = multilabel_train_data + categorical = {i: False for i in range(10)} + mf = meta_features.calculate_all_metafeatures( + X, y, categorical, "Generated", logger=logging.getLogger('TestMeta')) + assert 52 == len(mf.metafeature_values) + + +def test_calculate_all_metafeatures_same_results_across_datatypes(): + """ + This test makes sure that numpy and pandas produce the same metafeatures. + This also is an excuse to fully test anneal dataset, and make sure + all metafeatures work in this complex dataset + """ + X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=True) + categorical = {col: True if X[col].dtype.name == 'category' else False + for col in X.columns} + mf = meta_features.calculate_all_metafeatures( + X, y, categorical, "2", logger=logging.getLogger('Meta')) + assert 52 == len(mf.metafeature_values) + expected = { + 'PCASkewnessFirstPC': 0.41897660337677867, + 'PCAKurtosisFirstPC': -0.677692541156901, + 'PCAFractionOfComponentsFor95PercentVariance': 0.2716049382716049, + 'ClassEntropy': 1.1898338562043977, + 'SkewnessSTD': 7.540418815675546, + 'SkewnessMean': 1.47397188548894, + 'SkewnessMax': 29.916569235579203, + 'SkewnessMin': -29.916569235579203, + 'KurtosisSTD': 153.0563504598898, + 'KurtosisMean': 56.998860939761165, + 'KurtosisMax': 893.0011148272025, + 'KurtosisMin': -3.0, + 'SymbolsSum': 49, + 'SymbolsSTD': 1.3679553264445183, + 'SymbolsMean': 1.8846153846153846, + 'SymbolsMax': 7, + 'SymbolsMin': 1, + 'ClassProbabilitySTD': 0.28282850691819206, + 'ClassProbabilityMean': 0.2, + 'ClassProbabilityMax': 0.7616926503340757, + 'ClassProbabilityMin': 0.008908685968819599, + 'InverseDatasetRatio': 23.63157894736842, + 'DatasetRatio': 0.042316258351893093, + 'RatioNominalToNumerical': 5.333333333333333, + 'RatioNumericalToNominal': 0.1875, + 'NumberOfCategoricalFeatures': 32, + 'NumberOfNumericFeatures': 6, + 'NumberOfMissingValues': 
22175.0, + 'NumberOfFeaturesWithMissingValues': 29.0, + 'NumberOfInstancesWithMissingValues': 898.0, + 'NumberOfFeatures': 38.0, + 'NumberOfClasses': 5.0, + 'NumberOfInstances': 898.0, + 'LogInverseDatasetRatio': 3.162583908575814, + 'LogDatasetRatio': -3.162583908575814, + 'PercentageOfMissingValues': 0.6498358926268901, + 'PercentageOfFeaturesWithMissingValues': 0.7631578947368421, + 'PercentageOfInstancesWithMissingValues': 1.0, + 'LogNumberOfFeatures': 3.6375861597263857, + 'LogNumberOfInstances': 6.8001700683022, + } + assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected) + + expected_landmarks = { + 'Landmark1NN': 0.9721601489757914, + 'LandmarkRandomNodeLearner': 0.7616945996275606, + 'LandmarkDecisionNodeLearner': 0.7827932960893855, + 'LandmarkDecisionTree': 0.9899875853507139, + 'LandmarkNaiveBayes': 0.9287150837988827, + 'LandmarkLDA': 0.9610242085661079, + } + assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx( + expected_landmarks, rel=1e-5) + + # Then do numpy! + X, y = fetch_openml(data_id=2, return_X_y=True, as_frame=False) + categorical = {i: True if category else False + for i, category in enumerate(categorical.values())} + mf = meta_features.calculate_all_metafeatures( + X, y, categorical, "2", logger=logging.getLogger('Meta')) + assert {k: mf[k].value for k in expected.keys()} == pytest.approx(expected) + + # The column-reorder of pandas and numpy array are different after + # the data preprocessing. 
So we cannot directly compare, and landmarking is + # sensible to column order + expected_landmarks['LandmarkDecisionTree'] = 0.9922098075729361 + assert {k: mf[k].value for k in expected_landmarks.keys()} == pytest.approx( + expected_landmarks, rel=1e-5) diff --git a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py index 9cf58910c6..18b795cb45 100644 --- a/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py +++ b/test/test_metalearning/pyMetaLearn/test_meta_features_sparse.py @@ -1,216 +1,309 @@ -from io import StringIO import logging import os -import sys -import unittest import arff + import numpy as np + +import pytest + from scipy import sparse + from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler + +import autosklearn.metalearning.metafeatures.metafeatures as meta_features from autosklearn.pipeline.components.data_preprocessing.data_preprocessing \ import DataPreprocessor -import autosklearn.metalearning.metafeatures.metafeatures as meta_features -# Make the super class importable -sys.path.append(os.path.dirname(__file__)) -import test_meta_features # noqa: E402 - - -class SparseMetaFeaturesTest(test_meta_features.MetaFeaturesTest, - unittest.TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - self.cwd = os.getcwd() - tests_dir = __file__ - os.chdir(os.path.dirname(tests_dir)) - - decoder = arff.ArffDecoder() - with open(os.path.join("datasets", "dataset.arff")) as fh: - dataset = decoder.decode(fh, encode_nominal=True) - - # -1 because the last attribute is the class - self.attribute_types = [ - 'numeric' if type(type_) != list else 'nominal' - for name, type_ in dataset['attributes'][:-1]] - self.categorical = [True if attribute == 'nominal' else False - for attribute in self.attribute_types] - - data = np.array(dataset['data'], dtype=np.float64) - X = data[:, :-1] - y = data[:, -1].reshape((-1,)) - - # First, 
swap NaNs and zeros, because when converting an encoded - # dense matrix to sparse, the values which are encoded to zero are lost - X_sparse = X.copy() - NaNs = ~np.isfinite(X_sparse) - X_sparse[NaNs] = 0 - X_sparse = sparse.csr_matrix(X_sparse) - - ohe = DataPreprocessor(categorical_features=self.categorical) - X_transformed = X_sparse.copy() - X_transformed = ohe.fit_transform(X_transformed) - imp = SimpleImputer(copy=False) - X_transformed = imp.fit_transform(X_transformed) - standard_scaler = StandardScaler(with_mean=False) - X_transformed = standard_scaler.fit_transform(X_transformed) - - # Transform the array which indicates the categorical metafeatures - number_numerical = np.sum(~np.array(self.categorical)) - categorical_transformed = [True] * (X_transformed.shape[1] - - number_numerical) + \ - [False] * number_numerical - self.categorical_transformed = categorical_transformed - - self.X = X_sparse - self.X_transformed = X_transformed - self.y = y - self.mf = meta_features.metafeatures - self.helpers = meta_features.helper_functions - self.logger = logging.getLogger() - - # Precompute some helper functions - self.helpers.set_value( - "PCA", - self.helpers["PCA"](self.X_transformed, self.y, self.logger), - ) - self.helpers.set_value( - "MissingValues", - self.helpers["MissingValues"](self.X, self.y, self.logger, self.categorical), - ) - self.mf.set_value( - "NumberOfMissingValues", - self.mf["NumberOfMissingValues"](self.X, self.y, self.logger, self.categorical), - ) - self.helpers.set_value( - "NumSymbols", - self.helpers["NumSymbols"](self.X, self.y, self.logger, self.categorical), - ) - self.helpers.set_value( - "ClassOccurences", - self.helpers["ClassOccurences"](self.X, self.y, self.logger), - ) - self.helpers.set_value( - "Skewnesses", - self.helpers["Skewnesses"](self.X_transformed, self.y, self.logger, - self.categorical_transformed), - ) - self.helpers.set_value( - "Kurtosisses", - self.helpers["Kurtosisses"](self.X_transformed, self.y, self.logger, 
- self.categorical_transformed), - ) - - def test_missing_values(self): - mf = self.helpers["MissingValues"](self.X, self.y, self.logger, self.categorical) - self.assertTrue(sparse.issparse(mf.value)) - self.assertEqual(mf.value.shape, self.X.shape) - self.assertEqual(mf.value.dtype, np.bool) - self.assertEqual(0, np.sum(mf.value.data)) - - def test_number_of_missing_values(self): - mf = self.mf["NumberOfMissingValues"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(0, mf.value) - - def test_percentage_missing_values(self): - mf = self.mf["PercentageOfMissingValues"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(0, mf.value) - - def test_number_of_Instances_with_missing_values(self): - mf = self.mf["NumberOfInstancesWithMissingValues"]( - self.X, self.y, self.logger, self.categorical) - self.assertEqual(0, mf.value) - - def test_percentage_of_Instances_with_missing_values(self): - self.mf.set_value("NumberOfInstancesWithMissingValues", - self.mf["NumberOfInstancesWithMissingValues"]( - self.X, self.y, self.logger, self.categorical)) - mf = self.mf["PercentageOfInstancesWithMissingValues"](self.X, self.y, self.logger, - self.categorical) - self.assertAlmostEqual(0, mf.value) - - def test_number_of_features_with_missing_values(self): - mf = self.mf["NumberOfFeaturesWithMissingValues"](self.X, self.y, self.logger, - self.categorical) - self.assertEqual(0, mf.value) - - def test_percentage_of_features_with_missing_values(self): - self.mf.set_value("NumberOfFeaturesWithMissingValues", - self.mf["NumberOfFeaturesWithMissingValues"]( - self.X, self.y, self.logger, self.categorical)) - mf = self.mf["PercentageOfFeaturesWithMissingValues"](self.X, self.y, self.logger, - self.categorical) - self.assertAlmostEqual(0, mf.value) - - def test_num_symbols(self): - mf = self.helpers["NumSymbols"](self.X, self.y, self.logger, self.categorical) - - symbol_frequency = [2, 0, 6, 0, 1, 3, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 2, 0, 
0, 0, 0, 1, 1, 2, 2] - self.assertEqual(mf.value, symbol_frequency) - - def test_symbols_max(self): - # this is attribute steel - mf = self.mf["SymbolsMax"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 6) - - def test_symbols_mean(self): - mf = self.mf["SymbolsMean"](self.X, self.y, self.logger, self.categorical) - # Empty looking spaces denote empty attributes - symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2] - self.assertAlmostEqual(mf.value, np.mean(symbol_frequency)) - - def test_symbols_std(self): - mf = self.mf["SymbolsSTD"](self.X, self.y, self.logger, self.categorical) - symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2] - self.assertAlmostEqual(mf.value, np.std(symbol_frequency)) - - def test_symbols_sum(self): - mf = self.mf["SymbolsSum"](self.X, self.y, self.logger, self.categorical) - self.assertEqual(mf.value, 25) - - def test_skewnesses(self): - fixture = [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 1.0, 0.0, -1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, - -0.6969708499033568, 0.626346013011263, - 0.3809987596624038, 1.4762248835141034, - 0.07687661087633726, 0.36889797830360116] - mf = self.helpers["Skewnesses"](self.X_transformed, self.y, self.logger) - print(mf.value) - print(fixture) - np.testing.assert_allclose(mf.value, fixture) - - def test_kurtosisses(self): - fixture = [-3.0, -3.0, -2.0, -2.0, -3.0, -3.0, -3.0, -3.0, - -3.0, -2.0, -3.0, -2.0, -3.0, -3.0, -2.0, -3.0, - -3.0, -3.0, -3.0, -3.0, -3.0, -2.0, -3.0, - -3.0, -3.0, -1.1005836114255765, - -1.1786325509475712, -1.2387998382327912, - 1.393438264413704, -0.9768209837948336, - -1.7937072296512782] - mf = self.helpers["Kurtosisses"](self.X_transformed, self.y, self.logger) - np.testing.assert_allclose(mf.value, fixture) - - def test_pca_95percent(self): - mf = self.mf["PCAFractionOfComponentsFor95PercentVariance"]( - self.X_transformed, self.y, self.logger) - self.assertAlmostEqual(0.7741935483870968, mf.value) - - 
def test_pca_kurtosis_first_pc(self): - mf = self.mf["PCAKurtosisFirstPC"](self.X_transformed, self.y, self.logger) - self.assertAlmostEqual(-0.15444516166802469, mf.value) - - def test_pca_skewness_first_pc(self): - mf = self.mf["PCASkewnessFirstPC"](self.X_transformed, self.y, self.logger) - self.assertAlmostEqual(0.026514792083623905, mf.value) - - def test_calculate_all_metafeatures(self): - mf = meta_features.calculate_all_metafeatures( - self.X, self.y, self.categorical, "2", logger=self.logger) - self.assertEqual(52, len(mf.metafeature_values)) - sio = StringIO() - mf.dump(sio) + +@pytest.fixture +def sparse_data(): + tests_dir = __file__ + os.chdir(os.path.dirname(tests_dir)) + + decoder = arff.ArffDecoder() + with open(os.path.join("datasets", "dataset.arff")) as fh: + dataset = decoder.decode(fh, encode_nominal=True) + + # -1 because the last attribute is the class + attribute_types = [ + 'numeric' if type(type_) != list else 'nominal' + for name, type_ in dataset['attributes'][:-1]] + categorical = {i: True if attribute == 'nominal' else False + for i, attribute in enumerate(attribute_types)} + + data = np.array(dataset['data'], dtype=np.float64) + X = data[:, :-1] + y = data[:, -1].reshape((-1,)) + + # First, swap NaNs and zeros, because when converting an encoded + # dense matrix to sparse, the values which are encoded to zero are lost + X_sparse = X.copy() + NaNs = ~np.isfinite(X_sparse) + X_sparse[NaNs] = 0 + X_sparse = sparse.csr_matrix(X_sparse) + + X = X_sparse + y = y + mf = meta_features.metafeatures + helpers = meta_features.helper_functions + logger = logging.getLogger() + # Precompute some helper functions + helpers.set_value( + "MissingValues", + helpers["MissingValues"](X, y, logger, categorical), + ) + mf.set_value( + "NumberOfMissingValues", + mf["NumberOfMissingValues"](X, y, logger, categorical), + ) + helpers.set_value( + "NumSymbols", + helpers["NumSymbols"](X, y, logger, categorical), + ) + helpers.set_value( + "ClassOccurences", + 
helpers["ClassOccurences"](X, y, logger), + ) + return X, y, categorical + + +@pytest.fixture +def sparse_data_transformed(): + tests_dir = __file__ + os.chdir(os.path.dirname(tests_dir)) + + decoder = arff.ArffDecoder() + with open(os.path.join("datasets", "dataset.arff")) as fh: + dataset = decoder.decode(fh, encode_nominal=True) + + # -1 because the last attribute is the class + attribute_types = [ + 'numeric' if type(type_) != list else 'nominal' + for name, type_ in dataset['attributes'][:-1]] + categorical = {i: True if attribute == 'nominal' else False + for i, attribute in enumerate(attribute_types)} + + data = np.array(dataset['data'], dtype=np.float64) + X = data[:, :-1] + y = data[:, -1].reshape((-1,)) + + # First, swap NaNs and zeros, because when converting an encoded + # dense matrix to sparse, the values which are encoded to zero are lost + X_sparse = X.copy() + NaNs = ~np.isfinite(X_sparse) + X_sparse[NaNs] = 0 + X_sparse = sparse.csr_matrix(X_sparse) + + ohe = DataPreprocessor(feat_type={ + col: 'categorical' if category else 'numerical' + for col, category in categorical.items() + }) + X_transformed = X_sparse.copy() + X_transformed = ohe.fit_transform(X_transformed) + imp = SimpleImputer(copy=False) + X_transformed = imp.fit_transform(X_transformed) + standard_scaler = StandardScaler(with_mean=False) + X_transformed = standard_scaler.fit_transform(X_transformed) + + # Transform the array which indicates the categorical metafeatures + number_numerical = np.sum(~np.array(list(categorical.values()))) + categorical_transformed = {i: True if i < (X_transformed.shape[1] - number_numerical) else False + for i in range(X_transformed.shape[1])} + + X = X_sparse + X_transformed = X_transformed + y = y + mf = meta_features.metafeatures + helpers = meta_features.helper_functions + logger = logging.getLogger() + + # Precompute some helper functions + helpers.set_value( + "PCA", + helpers["PCA"](X_transformed, y, logger), + ) + helpers.set_value( + 
"MissingValues", + helpers["MissingValues"](X, y, logger, categorical), + ) + mf.set_value( + "NumberOfMissingValues", + mf["NumberOfMissingValues"](X, y, logger, categorical), + ) + helpers.set_value( + "NumSymbols", + helpers["NumSymbols"](X, y, logger, categorical), + ) + helpers.set_value( + "ClassOccurences", + helpers["ClassOccurences"](X, y, logger), + ) + helpers.set_value( + "Skewnesses", + helpers["Skewnesses"](X_transformed, y, logger, + categorical_transformed), + ) + helpers.set_value( + "Kurtosisses", + helpers["Kurtosisses"](X_transformed, y, logger, categorical_transformed), + ) + return X_transformed, y, categorical_transformed + + +def test_missing_values(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.helper_functions["MissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert sparse.issparse(mf.value) + assert mf.value.shape == X.shape + assert mf.value.dtype == np.bool + assert 0 == np.sum(mf.value.data) + + +def test_number_of_missing_values(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.metafeatures["NumberOfMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert 0 == mf.value + + +def test_percentage_missing_values(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.metafeatures["PercentageOfMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert 0 == mf.value + + +def test_number_of_Instances_with_missing_values(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert 0 == mf.value + + +def test_percentage_of_Instances_with_missing_values(sparse_data): + X, y, categorical = sparse_data + meta_features.metafeatures.set_value( + "NumberOfInstancesWithMissingValues", + meta_features.metafeatures["NumberOfInstancesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical)) + mf = 
meta_features.metafeatures["PercentageOfInstancesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(0) == mf.value + + +def test_number_of_features_with_missing_values(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert 0 == mf.value + + +def test_percentage_of_features_with_missing_values(sparse_data): + X, y, categorical = sparse_data + meta_features.metafeatures.set_value( + "NumberOfFeaturesWithMissingValues", + meta_features.metafeatures["NumberOfFeaturesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical)) + mf = meta_features.metafeatures["PercentageOfFeaturesWithMissingValues"]( + X, y, logging.getLogger('Meta'), categorical) + assert pytest.approx(0) == mf.value + + +def test_num_symbols(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.helper_functions["NumSymbols"]( + X, y, logging.getLogger('Meta'), categorical) + + symbol_frequency = [2, 0, 6, 0, 1, 3, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 2, 2] + assert mf.value == symbol_frequency + + +def test_symbols_max(sparse_data): + X, y, categorical = sparse_data + # this is attribute steel + mf = meta_features.metafeatures["SymbolsMax"](X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 6 + + +def test_symbols_mean(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.metafeatures["SymbolsMean"]( + X, y, logging.getLogger('Meta'), categorical) + # Empty looking spaces denote empty attributes + symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2] + assert pytest.approx(mf.value) == np.mean(symbol_frequency) + + +def test_symbols_std(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.metafeatures["SymbolsSTD"]( + X, y, logging.getLogger('Meta'), categorical) + symbol_frequency = [2, 6, 1, 3, 3, 1, 1, 2, 1, 1, 2, 2] + assert
pytest.approx(mf.value) == np.std(symbol_frequency) + + +def test_symbols_sum(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.metafeatures["SymbolsSum"]( + X, y, logging.getLogger('Meta'), categorical) + assert mf.value == 25 + + +def test_skewnesses(sparse_data_transformed): + X_transformed, y, categorical_transformed = sparse_data_transformed + fixture = [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 1.0, 0.0, -1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, + -0.6969708499033568, 0.626346013011263, + 0.3809987596624038, 1.4762248835141034, + 0.07687661087633726, 0.36889797830360116] + mf = meta_features.helper_functions["Skewnesses"](X_transformed, y, logging.getLogger('Meta')) + print(mf.value) + print(fixture) + np.testing.assert_allclose(mf.value, fixture) + + +def test_kurtosisses(sparse_data_transformed): + fixture = [-3.0, -3.0, -2.0, -2.0, -3.0, -3.0, -3.0, -3.0, + -3.0, -2.0, -3.0, -2.0, -3.0, -3.0, -2.0, -3.0, + -3.0, -3.0, -3.0, -3.0, -3.0, -2.0, -3.0, + -3.0, -3.0, -1.1005836114255765, + -1.1786325509475712, -1.2387998382327912, + 1.393438264413704, -0.9768209837948336, + -1.7937072296512782] + X_transformed, y, categorical_transformed = sparse_data_transformed + mf = meta_features.helper_functions["Kurtosisses"](X_transformed, y, logging.getLogger('Meta')) + np.testing.assert_allclose(mf.value, fixture) + + +def test_pca_95percent(sparse_data_transformed): + X_transformed, y, categorical_transformed = sparse_data_transformed + mf = meta_features.metafeatures["PCAFractionOfComponentsFor95PercentVariance"]( + X_transformed, y, logging.getLogger('Meta')) + assert pytest.approx(0.7741935483870968) == mf.value + + +def test_pca_kurtosis_first_pc(sparse_data_transformed): + X_transformed, y, categorical_transformed = sparse_data_transformed + mf = meta_features.metafeatures["PCAKurtosisFirstPC"]( + X_transformed, y, logging.getLogger('Meta')) + assert pytest.approx(-0.15444516166802469) == mf.value + + +def 
test_pca_skewness_first_pc(sparse_data_transformed): + X_transformed, y, categorical_transformed = sparse_data_transformed + mf = meta_features.metafeatures["PCASkewnessFirstPC"]( + X_transformed, y, logging.getLogger('Meta')) + assert pytest.approx(0.026514792083623905) == mf.value + + +def test_calculate_all_metafeatures(sparse_data): + X, y, categorical = sparse_data + mf = meta_features.calculate_all_metafeatures( + X, y, categorical, "2", logger=logging.getLogger('Meta')) + assert 52 == len(mf.metafeature_values) diff --git a/test/test_metalearning/test_metalearning.py b/test/test_metalearning/test_metalearning.py index a21745c4bb..cb9cea9afa 100644 --- a/test/test_metalearning/test_metalearning.py +++ b/test/test_metalearning/test_metalearning.py @@ -81,7 +81,7 @@ def test_metalearning(self): include_preprocessors=['no_preprocessing']) X_train, Y_train, X_test, Y_test = get_dataset(dataset_name) - categorical = [False] * X_train.shape[1] + categorical = {i: False for i in range(X_train.shape[1])} meta_features_label = _calculate_metafeatures( X_train, Y_train, categorical, dataset_name, task) diff --git a/test/test_optimizer/test_smbo.py b/test/test_optimizer/test_smbo.py index a15302b8d7..2408e1ce71 100644 --- a/test/test_optimizer/test_smbo.py +++ b/test/test_optimizer/test_smbo.py @@ -53,7 +53,7 @@ def test_smbo_metalearning_configurations(backend, context, dask_client): X_test, Y_test, task=BINARY_CLASSIFICATION, dataset_name='iris', - feat_type=None, + feat_type={i: 'numerical' for i in range(X_train.shape[1])}, ) backend.save_datamanager(datamanager) smbo.task = BINARY_CLASSIFICATION diff --git a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py index 817dc093f8..dffa763397 100644 --- a/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py +++ 
b/test/test_pipeline/components/data_preprocessing/test_categorical_imputation.py @@ -1,29 +1,76 @@ import numpy as np from scipy import sparse +import pandas as pd +import pytest + from autosklearn.pipeline.components.data_preprocessing.imputation.categorical_imputation\ import CategoricalImputation -from autosklearn.pipeline.util import PreprocessingTestCase -class CategoricalImputationTest(PreprocessingTestCase): - def _get_dataset(self): - size = (50, 20) - X = np.array(np.random.randint(3, 10, size=size), dtype=float) - mask = np.logical_not(np.random.randint(0, 5, size=size), dtype=bool) +@pytest.fixture +def input_data_imputation(request): + size = (50, 20) + X = np.array(np.random.randint(3, 10, size=size), dtype=float) + mask = np.logical_not(np.random.randint(0, 5, size=size), dtype=bool) + X[mask] = np.nan + if request.param == 'numpy': + pass + elif request.param == 'pandas': + X = pd.DataFrame(X) + return X, mask + + +@pytest.mark.parametrize('input_data_imputation', ('numpy', 'pandas'), indirect=True) +@pytest.mark.parametrize('categorical', (True, False)) +def test_default_imputation(input_data_imputation, categorical): + """ + Makes sure that imputation works for both numerical and categorical data. + This also has to be guaranteed for numpy and pandas like objects. 
+ """ + X, mask = input_data_imputation + if categorical: + imputation_value = 'missing_value' + X = X.astype('str').astype('object') X[mask] = np.nan - return X, mask - - def test_default(self): - X, mask = self._get_dataset() - Y = CategoricalImputation().fit_transform(X) - self.assertTrue((np.argwhere(Y == 2) == np.argwhere(mask)).all()) - self.assertTrue((np.argwhere(Y != 2) == np.argwhere(np.logical_not(mask))).all()) - - def test_default_sparse(self): - X, mask = self._get_dataset() - X = sparse.csc_matrix(X) - Y = CategoricalImputation().fit_transform(X) - Y = Y.todense() - self.assertTrue((np.argwhere(Y == 2) == np.argwhere(mask)).all()) - self.assertTrue((np.argwhere(Y != 2) == np.argwhere(np.logical_not(mask))).all()) + else: + imputation_value = 0 + Y = CategoricalImputation().fit_transform(X.copy()) + assert ((np.argwhere(Y == imputation_value) == np.argwhere(mask)).all()) + assert ((np.argwhere(Y != imputation_value) == np.argwhere(np.logical_not(mask))).all()) + + +@pytest.mark.parametrize('format_type', ('numpy', 'pandas')) +def test_nonzero_numerical_imputation(format_type): + + # First try with an array with 0 as only valid category. 
The imputation should + # happen with -1 + X = np.full(fill_value=np.nan, shape=(10, 10)) + X[0, :] = 0 + if 'pandas' in format_type: + X = pd.DataFrame(X) + elif 'numpy' in format_type: + pass + else: + pytest.fail(format_type) + Y = CategoricalImputation().fit_transform(X.copy()) + np.testing.assert_equal(np.nan_to_num(X, nan=-1, copy=True), Y) + + # Then if there is also a -1 in the category, we expect -2 as imputation + X = np.full(fill_value=np.nan, shape=(10, 10)) + X[0, :] = 0 + X[1, :] = -1 + if 'pandas' in format_type: + X = pd.DataFrame(X) + Y = CategoricalImputation().fit_transform(X.copy()) + np.testing.assert_equal(np.nan_to_num(X, nan=-2, copy=True), Y) + + +@pytest.mark.parametrize('input_data_imputation', ('numpy',), indirect=True) +def test_default_sparse(input_data_imputation): + X, mask = input_data_imputation + X = sparse.csc_matrix(X) + Y = CategoricalImputation().fit_transform(X) + Y = Y.todense() + assert (np.argwhere(Y == 0) == np.argwhere(mask)).all() + assert (np.argwhere(Y != 0) == np.argwhere(np.logical_not(mask))).all() diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py index 4f95d278a9..11771bf4fc 100644 --- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py +++ b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing.py @@ -62,19 +62,24 @@ def do_a_fit_transform(self, sparse_input): # Combine datasets and shuffle columns: n_feats = len(categ_feat) random_order = np.random.choice(np.arange(n_feats), size=n_feats, replace=False) - # Shuffle cat_feat according to random_order - categ_feat = np.array(categ_feat)[random_order] # Shuffle X according to random_order X = np.array(X)[random_order] X_comb = np.hstack(X) # Shuffle Y according to random_order and reorder it as the PreprocessingPipeline # does (i.e. categorical features come first in Y).
- num_feat = np.logical_not(categ_feat) - y_order = random_order[np.argsort(num_feat)] - Y = [Y[n] for n in y_order] + + categ_feat = {i: 'categorical' if categ_feat[order] else 'numerical' + for i, order in enumerate(random_order)} + cat_to_left_order = [index for col, index in sorted( + [(col_type, i) for i, col_type in categ_feat.items()] + )] + # Sort so that Y Matches the random ordering + Y = [Y[n] for n in random_order] + # Then move the categorical columns to the left + Y = [Y[n] for n in cat_to_left_order] Y_comb = np.hstack(Y) # Data preprocessing - DPP = DataPreprocessor(categorical_features=categ_feat) + DPP = DataPreprocessor(feat_type=categ_feat) X_comb = sparse.csc_matrix(X_comb) if sparse_input else X_comb Y_comb_out_1 = DPP.fit_transform(X_comb) # Check if Y_comb_out is what we expect it to be: @@ -107,10 +112,12 @@ def test_string_categories(self): ['white', 'tall', np.nan]]) # Combined dataset with shuffled columns: X_comb = np.hstack((X_num, X_cat)) - categ_feat = np.array([False] * 3 + [True] * 3) + categ_feat = [False] * 3 + [True] * 3 random_order = np.random.choice(np.arange(6), size=6, replace=False) X_comb = X_comb[:, random_order] - categ_feat = categ_feat[random_order] + categ_feat = [categ_feat[order] for order in random_order] # Strings are not allowed, therefore: with self.assertRaises(ValueError): - DataPreprocessor(categorical_features=categ_feat).fit_transform(X_comb) + categ_feat = {i: 'categorical' if feat else 'numerical' + for i, feat in enumerate(categ_feat)} + DataPreprocessor(feat_type=categ_feat).fit_transform(X_comb) diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py index 750dc41c8c..7c11bd36e0 100644 --- a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py +++ 
b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_categorical.py @@ -28,7 +28,9 @@ def test_fit_transform(self): [0, 0, 1, 1, 0, 0, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]]) # dense input - Yt = CategoricalPreprocessingPipeline().fit_transform(X) + # Notice the X.copy() here as the imputation + # is in place to save resources + Yt = CategoricalPreprocessingPipeline().fit_transform(X.copy()) np.testing.assert_array_equal(Yt, Y) # sparse input X_sparse = sparse.csc_matrix(X) diff --git a/test/test_pipeline/test_classification.py b/test/test_pipeline/test_classification.py index 78b497feec..ddd276453d 100644 --- a/test/test_pipeline/test_classification.py +++ b/test/test_pipeline/test_classification.py @@ -244,6 +244,8 @@ def test_configurations_categorical_data(self): True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, True, True, True] + categorical = {i: 'categorical' if bool_cat else 'numerical' + for i, bool_cat in enumerate(categorical)} this_directory = os.path.dirname(__file__) X = np.loadtxt(os.path.join(this_directory, "components", "data_preprocessing", "dataset.pkl")) @@ -255,7 +257,7 @@ def test_configurations_categorical_data(self): 'X_test': X_test, 'Y_test': Y_test} init_params = { - 'data_preprocessing:categorical_features': + 'data_preprocessing:feat_type': categorical } @@ -271,21 +273,25 @@ def test_categorical_passed_to_one_hot_encoder(self, ohe_mock): with unittest.mock.patch('autosklearn.pipeline.classification.SimpleClassificationPipeline' '._check_init_params_honored'): cls = SimpleClassificationPipeline( - init_params={'data_preprocessing:categorical_features': [True, False]} + init_params={'data_preprocessing:feat_type': {0: 'categorical', + 1: 'numerical'}} ) self.assertEqual( ohe_mock.call_args[1]['init_params'], - {'categorical_features': [True, False]} + {'feat_type': {0: 'categorical', 1: 'numerical'}} ) default = 
cls.get_hyperparameter_search_space().get_default_configuration() cls.set_hyperparameters( configuration=default, - init_params={'data_preprocessing:categorical_features': [True, True, False]}, + init_params={'data_preprocessing:feat_type': {0: 'categorical', + 1: 'categorical', + 2: 'numerical'}}, ) self.assertEqual( ohe_mock.call_args[1]['init_params'], - {'categorical_features': [True, True, False]} + {'feat_type': {0: 'categorical', 1: 'categorical', + 2: 'numerical'}} ) def _test_configurations(self, configurations_space, make_sparse=False,