Ensemble bayesian learning clean #3

Open
wants to merge 16 commits into base: reg_cocktails
Changes from all commits
1,156 changes: 854 additions & 302 deletions autoPyTorch/api/base_task.py

Large diffs are not rendered by default.

69 changes: 67 additions & 2 deletions autoPyTorch/api/tabular_classification.py
@@ -1,9 +1,13 @@
from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union

import dask.distributed

import numpy as np

import pandas as pd

from smac.optimizer.smbo import SMBO

from autoPyTorch.api.base_task import BaseTask
from autoPyTorch.automl_common.common.utils.backend import Backend
from autoPyTorch.constants import (
@@ -22,6 +26,7 @@
)
from autoPyTorch.datasets.tabular_dataset import TabularDataset
from autoPyTorch.evaluation.utils import DisableFileOutputParameters
from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

@@ -87,6 +92,8 @@ def __init__(
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
ensemble_method: int = EnsembleSelectionTypes.ensemble_selection,
num_stacking_layers: int = 1,
max_models_on_disc: int = 50,
temporary_directory: Optional[str] = None,
output_directory: Optional[str] = None,
@@ -96,6 +103,7 @@
exclude_components: Optional[Dict[str, Any]] = None,
resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
feat_type: Optional[List[str]] = None,
backend: Optional[Backend] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
):
@@ -106,6 +114,7 @@
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
ensemble_method=ensemble_method,
max_models_on_disc=max_models_on_disc,
temporary_directory=temporary_directory,
output_directory=output_directory,
@@ -116,8 +125,10 @@
backend=backend,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
feat_type=feat_type,
search_space_updates=search_space_updates,
task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
num_stacking_layers=num_stacking_layers
)

def build_pipeline(
@@ -164,6 +175,7 @@ def _get_dataset_input_validator(
y_train: Union[List, pd.DataFrame, np.ndarray],
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
feat_type: Optional[List] = None,
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
@@ -205,13 +217,14 @@ def _get_dataset_input_validator(
resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_type = feat_type if feat_type is not None else self.feat_type
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=True,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_type=feat_type
)

# Fit a input validator to check the provided data
@@ -230,6 +243,51 @@

return dataset, input_validator

def run_autogluon_stacking(
self,
optimize_metric: str,
X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
max_budget: int = 50,
budget_type: str = 'epochs',
total_walltime_limit: int = 100,
func_eval_time_limit_secs: Optional[int] = None,
memory_limit: Optional[int] = 4096,
dataset_compression: Union[Mapping[str, Any], bool] = False,
all_supported_metrics: bool = True,
precision: int = 32,
disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
dask_client: Optional[dask.distributed.Client] = None
):
self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression)

self.dataset, self.input_validator = self._get_dataset_input_validator(
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
resampling_strategy=self.resampling_strategy,
resampling_strategy_args=self.resampling_strategy_args,
dataset_name=dataset_name,
dataset_compression=self._dataset_compression)

return self._run_autogluon_stacking(
optimize_metric=optimize_metric,
dataset=self.dataset,
max_budget=max_budget,
budget_type=budget_type,
total_walltime_limit=total_walltime_limit,
func_eval_time_limit_secs=func_eval_time_limit_secs,
memory_limit=memory_limit,
all_supported_metrics=all_supported_metrics,
precision=precision,
disable_file_output=disable_file_output,
dask_client=dask_client,
)

def search(
self,
optimize_metric: str,
@@ -253,6 +311,9 @@ def search(
load_models: bool = True,
portfolio_selection: Optional[str] = None,
dataset_compression: Union[Mapping[str, Any], bool] = False,
smbo_class: Optional[SMBO] = None,
use_ensemble_opt_loss: bool = False,
posthoc_ensemble_fit_stacking_ensemble_optimization: bool = False
) -> 'BaseTask':
"""
Search for the best pipeline configuration for the given dataset.
@@ -452,6 +513,9 @@
disable_file_output=disable_file_output,
load_models=load_models,
portfolio_selection=portfolio_selection,
smbo_class=smbo_class,
use_ensemble_opt_loss=use_ensemble_opt_loss,
posthoc_ensemble_fit_stacking_ensemble_optimization=posthoc_ensemble_fit_stacking_ensemble_optimization
)

def predict(
Expand Down Expand Up @@ -495,3 +559,4 @@ def predict_proba(self,
"the estimator search() method.")
X_test = self.input_validator.feature_validator.transform(X_test)
return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
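
For orientation, here is a hedged usage sketch of the API surface these changes add to the classification task. The argument names (`ensemble_method`, `num_stacking_layers`, `feat_type`, `use_ensemble_opt_loss`, `posthoc_ensemble_fit_stacking_ensemble_optimization`, `run_autogluon_stacking`) come from the diff above; the dataset, metric, time budget, and the choice of enum member are illustrative assumptions, since the diff only shows `EnsembleSelectionTypes.ensemble_selection` as the default and does not list the other members.

```python
# Illustrative sketch only, not part of the PR.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.ensemble.utils import EnsembleSelectionTypes

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

api = TabularClassificationTask(
    ensemble_method=EnsembleSelectionTypes.ensemble_selection,  # default shown in the diff
    num_stacking_layers=1,                   # new argument introduced by this PR
    feat_type=["numerical"] * X.shape[1],    # new argument: per-column feature types
)

# Either run the regular search; the new keyword arguments are forwarded to the base task ...
api.search(
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
    optimize_metric="accuracy",
    total_walltime_limit=300,
    use_ensemble_opt_loss=False,
    posthoc_ensemble_fit_stacking_ensemble_optimization=False,
)

# ... or call the new AutoGluon-style stacking entry point added in this file.
# api.run_autogluon_stacking(
#     optimize_metric="accuracy",
#     X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
#     total_walltime_limit=300,
# )
```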

13 changes: 11 additions & 2 deletions autoPyTorch/api/tabular_regression.py
@@ -22,6 +22,7 @@
)
from autoPyTorch.datasets.tabular_dataset import TabularDataset
from autoPyTorch.evaluation.utils import DisableFileOutputParameters
from autoPyTorch.ensemble.utils import EnsembleSelectionTypes
from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

@@ -87,6 +88,8 @@ def __init__(
logging_config: Optional[Dict] = None,
ensemble_size: int = 50,
ensemble_nbest: int = 50,
ensemble_method: int = EnsembleSelectionTypes.ensemble_selection,
num_stacking_layers: int = 1,
max_models_on_disc: int = 50,
temporary_directory: Optional[str] = None,
output_directory: Optional[str] = None,
@@ -96,6 +99,7 @@
exclude_components: Optional[Dict[str, Any]] = None,
resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
feat_type: Optional[List[str]] = None,
backend: Optional[Backend] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
):
@@ -106,6 +110,8 @@
logging_config=logging_config,
ensemble_size=ensemble_size,
ensemble_nbest=ensemble_nbest,
ensemble_method=ensemble_method,
num_stacking_layers=num_stacking_layers,
max_models_on_disc=max_models_on_disc,
temporary_directory=temporary_directory,
output_directory=output_directory,
@@ -116,6 +122,7 @@
backend=backend,
resampling_strategy=resampling_strategy,
resampling_strategy_args=resampling_strategy_args,
feat_type=feat_type,
search_space_updates=search_space_updates,
task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION],
)
@@ -164,6 +171,7 @@ def _get_dataset_input_validator(
y_train: Union[List, pd.DataFrame, np.ndarray],
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
feat_type: Optional[List[str]] = None,
resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
dataset_name: Optional[str] = None,
@@ -204,13 +212,14 @@ def _get_dataset_input_validator(
resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy
resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \
self.resampling_strategy_args

feat_type = feat_type if feat_type is not None else self.feat_type
# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
input_validator = TabularInputValidator(
is_classification=False,
logger_port=self._logger_port,
dataset_compression=dataset_compression
dataset_compression=dataset_compression,
feat_type=feat_type
)

# Fit a input validator to check the provided data
139 changes: 139 additions & 0 deletions autoPyTorch/api/utils.py
@@ -0,0 +1,139 @@
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
from smac.runhistory.runhistory import RunHistory

def get_autogluon_default_nn_config(feat_type):
has_numerical_features = "numerical" in feat_type
has_cat_features = "categorical" in feat_type
search_space_updates = HyperparameterSearchSpaceUpdates()


# architecture head
search_space_updates.append(
node_name='network_head',
hyperparameter='__choice__',
value_range=['no_head'],
default_value='no_head',
)
search_space_updates.append(
node_name='network_head',
hyperparameter='no_head:activation',
value_range=['relu', 'elu'],
default_value='relu',
)

# backbone architecture
search_space_updates.append(
node_name='network_backbone',
hyperparameter='__choice__',
value_range=['MLPBackbone'],
default_value='MLPBackbone',
)
search_space_updates.append(
node_name='network_backbone',
hyperparameter='MLPBackbone:num_groups',
value_range=(2, 4),
default_value=4,
)
search_space_updates.append(
node_name='network_backbone',
hyperparameter='MLPBackbone:num_units',
value_range=[128, 512],
default_value=128,
log=True
)
search_space_updates.append(
node_name='network_backbone',
hyperparameter='MLPBackbone:dropout',
value_range=(0.1, 0.5),
default_value=0.1,
)
search_space_updates.append(
node_name='network_backbone',
hyperparameter='MLPBackbone:activation',
value_range=['relu', 'elu'],
default_value='relu',
)

# training updates
search_space_updates.append(
node_name='lr_scheduler',
hyperparameter='__choice__',
value_range=['NoScheduler'],
default_value='NoScheduler',
)
search_space_updates.append(
node_name='optimizer',
hyperparameter='__choice__',
value_range=['AdamOptimizer', 'SGDOptimizer'],
default_value='AdamOptimizer',
)
search_space_updates.append(
node_name='optimizer',
hyperparameter='AdamOptimizer:lr',
value_range=[1e-4, 3e-2],
default_value=3e-4,
)
search_space_updates.append(
node_name='optimizer',
hyperparameter='AdamOptimizer:weight_decay',
value_range=(1E-12, 0.1),
default_value=1e-6,
)
search_space_updates.append(
node_name='data_loader',
hyperparameter='max_batch_size',
value_range=[512],
default_value=512,
)

# preprocessing
search_space_updates.append(
node_name='feature_preprocessor',
hyperparameter='__choice__',
value_range=['NoFeaturePreprocessor'],
default_value='NoFeaturePreprocessor',
)

if has_numerical_features:
search_space_updates.append(
node_name='imputer',
hyperparameter='numerical_strategy',
value_range=['median', 'mean', 'most_frequent'],
default_value='median',
)
search_space_updates.append(
node_name='scaler',
hyperparameter='__choice__',
value_range=['StandardScaler'],
default_value='StandardScaler',
)
# preprocessing
search_space_updates.append(
node_name='skew_transformer',
hyperparameter='__choice__',
value_range=['QuantileTransformer'],
default_value='QuantileTransformer',
)

if has_cat_features:
search_space_updates.append(
node_name='encoder',
hyperparameter='__choice__',
value_range=['OneHotEncoder', 'NoEncoder'],
default_value='OneHotEncoder',
)
search_space_updates.append(
node_name="network_embedding",
hyperparameter="__choice__",
value_range=('NoEmbedding', 'LearnedEntityEmbedding'),
default_value='LearnedEntityEmbedding'
)

return search_space_updates


def get_config_from_run_history(run_history: RunHistory, num_run: int):
for _, run_value in run_history.data.items():
if run_value.additional_info.get('num_run', -1) == num_run: # to ensure that unsuccessful configs are not returned
return run_value.additional_info['configuration']
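
The two helpers in this new module are easiest to read with a usage sketch. The following assumes the task's `run_history` attribute and the `search_space_updates` constructor argument behave as in the existing autoPyTorch API; the `feat_type` list and the `num_run` value are purely illustrative.

```python
# Hedged sketch only, not part of the PR.
from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.api.utils import (
    get_autogluon_default_nn_config,
    get_config_from_run_history,
)

feat_type = ["numerical", "numerical", "categorical"]  # one entry per column

# Restrict the search space to the AutoGluon-style MLP configuration defined above.
updates = get_autogluon_default_nn_config(feat_type=feat_type)
api = TabularClassificationTask(feat_type=feat_type, search_space_updates=updates)

# ... after api.search(...) has run, look up the configuration that produced a
# particular num_run; the helper returns None if that run is absent or failed.
config = get_config_from_run_history(api.run_history, num_run=5)
```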

2 changes: 1 addition & 1 deletion autoPyTorch/data/base_feature_validator.py
@@ -46,7 +46,7 @@ def __init__(

# Required for dataset properties
self.num_features: Optional[int] = None
self.categories: List[List[int]] = []
self.num_categories_per_col: List[List[int]] = []
self.categorical_columns: List[int] = []
self.numerical_columns: List[int] = []
