init checkin to add LassoCV and RERF to optimizers (#263)
* init checkin to add LassoCV and RERF to optimizers

* fixes to pylint catches + ...

* addressing more random config failures

* continued to clean up the RERF hyperparam config space and restricted some values in unit tests for BayesianOptimizer

* fixes to pylint catches

* cleaned up random model_config unit test failures in LassoCV and RERF models

* force LassoCV cv parameter < num_samples

* cleaned up comments, restricted sklearnRF max_samples range

* correcting incorrect DEFAULT point in SKLearnRF model_config

* cleaned up hypergrid adapters to solve failing random model_config optimizer tests

* cleaned up pylint issues

* fixed additional RERF random config failures and tried to accelerate tests

* decreased the number of random configs tested from 100 back to the previous 10

* fixes to allow gRPC random optimizer config unit tests to succeed

* additional fixes to allow gRPC random optimizer config unit tests to succeed

* reduced unit test duration by reducing train/test sizes in new optimizers

* addressing review feedback

Co-authored-by: Ed Thayer <edthaye@microsoft.com>
edcthayer and Ed Thayer authored Sep 30, 2021
1 parent bb9d09d commit 791d670
Showing 15 changed files with 468 additions and 94 deletions.
43 changes: 36 additions & 7 deletions source/Mlos.Python/mlos/Optimizers/BayesianOptimizer.py
@@ -14,6 +14,8 @@
from mlos.Optimizers.RegressionModels.GoodnessOfFitMetrics import DataSetType
from mlos.Optimizers.RegressionModels.HomogeneousRandomForestRegressionModel import HomogeneousRandomForestRegressionModel
from mlos.Optimizers.RegressionModels.MultiObjectiveHomogeneousRandomForest import MultiObjectiveHomogeneousRandomForest
from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated
from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest
from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionModel import MultiObjectiveRegressionModel
from mlos.Optimizers.RegressionModels.Prediction import Prediction
from mlos.Tracer import trace
@@ -59,20 +61,47 @@ def __init__(

# Now let's put together the surrogate model.
#
self.logger.info(f'self.optimizer_config.surrogate_model_implementation: {self.optimizer_config.surrogate_model_implementation}')
assert self.optimizer_config.surrogate_model_implementation in (
HomogeneousRandomForestRegressionModel.__name__,
MultiObjectiveHomogeneousRandomForest.__name__
MultiObjectiveHomogeneousRandomForest.__name__,
MultiObjectiveLassoCrossValidated.__name__,
MultiObjectiveRegressionEnhancedRandomForest.__name__
)

# Note that even if the user requested a HomogeneousRandomForestRegressionModel, we still create a MultiObjectiveRegressionModel
# with just a single RandomForest inside it. This means we have to maintain only a single interface.
#
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest(
model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
if self.optimizer_config.surrogate_model_implementation == HomogeneousRandomForestRegressionModel.__name__:
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest(
model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveHomogeneousRandomForest.__name__:
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest(
model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveLassoCrossValidated.__name__:
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveLassoCrossValidated(
model_config=self.optimizer_config.lasso_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveRegressionEnhancedRandomForest.__name__:
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveRegressionEnhancedRandomForest(
model_config=self.optimizer_config.regression_enhanced_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
else:
raise RuntimeError(f"Unrecognized surrogate_model_implementation {self.optimizer_config.surrogate_model_implementation}")

# Now let's put together the experiment designer that will suggest parameters for each experiment.
#
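For orientation, a minimal sketch (not part of this commit) of how one of the new surrogate models is selected through the optimizer config, using the class and dimension names from the diffs in this commit. It assumes bayesian_optimizer_config_store exposes a default Point and that Points support attribute assignment:

    from mlos.Optimizers.BayesianOptimizerConfigStore import bayesian_optimizer_config_store
    from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated

    # Assumed accessor: the store's default configuration Point.
    optimizer_config = bayesian_optimizer_config_store.default
    # Route the if/elif dispatch above to the new LassoCV-backed surrogate model.
    optimizer_config.surrogate_model_implementation = MultiObjectiveLassoCrossValidated.__name__
    # BayesianOptimizer then configures that model from
    # optimizer_config.lasso_regression_model_config, the subgrid joined in the config store below.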
24 changes: 21 additions & 3 deletions source/Mlos.Python/mlos/Optimizers/BayesianOptimizerConfigStore.py
@@ -4,20 +4,24 @@
#
from mlos.Spaces import SimpleHypergrid, DiscreteDimension, CategoricalDimension, Point
from mlos.Spaces.Configs.ComponentConfigStore import ComponentConfigStore

from mlos.Optimizers.ExperimentDesigner.ExperimentDesigner import ExperimentDesigner, experiment_designer_config_store
from mlos.Optimizers.RegressionModels.HomogeneousRandomForestConfigStore import homogeneous_random_forest_config_store
from mlos.Optimizers.RegressionModels.HomogeneousRandomForestRegressionModel import HomogeneousRandomForestRegressionModel
from mlos.Optimizers.RegressionModels.MultiObjectiveHomogeneousRandomForest import MultiObjectiveHomogeneousRandomForest

from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store
from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated
from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestConfigStore import regression_enhanced_random_forest_config_store
from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest

bayesian_optimizer_config_store = ComponentConfigStore(
parameter_space=SimpleHypergrid(
name="bayesian_optimizer_config",
dimensions=[
CategoricalDimension(name="surrogate_model_implementation", values=[
HomogeneousRandomForestRegressionModel.__name__,
MultiObjectiveHomogeneousRandomForest.__name__
MultiObjectiveHomogeneousRandomForest.__name__,
MultiObjectiveLassoCrossValidated.__name__,
MultiObjectiveRegressionEnhancedRandomForest.__name__
]),
CategoricalDimension(name="experiment_designer_implementation", values=[ExperimentDesigner.__name__]),
DiscreteDimension(name="min_samples_required_for_guided_design_of_experiments", min=2, max=100)
@@ -30,6 +34,20 @@
HomogeneousRandomForestRegressionModel.__name__,
MultiObjectiveHomogeneousRandomForest.__name__
])
).join(
subgrid=lasso_cross_validated_config_store.parameter_space,
on_external_dimension=CategoricalDimension(
name="surrogate_model_implementation",
values=[
MultiObjectiveLassoCrossValidated.__name__
])
).join(
subgrid=regression_enhanced_random_forest_config_store.parameter_space,
on_external_dimension=CategoricalDimension(
name="surrogate_model_implementation",
values=[
MultiObjectiveRegressionEnhancedRandomForest.__name__
])
).join(
subgrid=experiment_designer_config_store.parameter_space,
on_external_dimension=CategoricalDimension(name="experiment_designer_implementation", values=[ExperimentDesigner.__name__])
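The chained join calls above mean that each surrogate model's hyperparameter subgrid is only present on a configuration point when surrogate_model_implementation takes the matching value. A minimal sketch of that behavior, assuming the hypergrid exposes the random() sampler used by the unit tests:

    from mlos.Optimizers.BayesianOptimizerConfigStore import bayesian_optimizer_config_store
    from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store
    from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated

    space = bayesian_optimizer_config_store.parameter_space
    config = space.random()    # assumed sampler; yields a Point belonging to the space
    assert config in space
    if config.surrogate_model_implementation == MultiObjectiveLassoCrossValidated.__name__:
        # Only for this choice does the point carry the joined lasso subgrid,
        # which BayesianOptimizer reads as optimizer_config.lasso_regression_model_config.
        assert config.lasso_regression_model_config in lasso_cross_validated_config_store.parameter_space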
@@ -24,21 +24,19 @@ class Selection(Enum):
parameter_space=SimpleHypergrid(
name="lasso_regression_model_config",
dimensions=[
ContinuousDimension(name="eps", min=0, max=2 ** 16),
DiscreteDimension(name="num_alphas", min=0, max=10 ** 5),
ContinuousDimension(name="eps", min=0, max=10.0 ** -3),
DiscreteDimension(name="num_alphas", min=0, max=200),
CategoricalDimension(name="fit_intercept", values=[False, True]),
CategoricalDimension(name="normalize", values=[False, True]),
CategoricalDimension(name="precompute", values=[False, True]),
DiscreteDimension(name="max_iter", min=0, max=10 ** 5),
ContinuousDimension(name="tol", min=0, max=2 ** 10),
DiscreteDimension(name="max_iter", min=100, max=5 * 10 ** 3),
ContinuousDimension(name="tol", min=0, max=1.0),
CategoricalDimension(name="copy_x", values=[False, True]),
ContinuousDimension(name="num_cross_validations", min=1, max=10),
DiscreteDimension(name="num_cross_validations", min=2, max=10),
CategoricalDimension(name="verbose", values=[False, True]),
DiscreteDimension(name="num_jobs", min=1, max=4),
DiscreteDimension(name="num_jobs", min=1, max=2),
CategoricalDimension(name="positive", values=[False, True]),
CategoricalDimension(name="selection", values=[selection.value for selection in Selection]),
DiscreteDimension(name="min_num_samples_per_input_dimension_to_fit", min=1, max=32),
DiscreteDimension(name="num_new_samples_per_input_dimension_before_refit", min=1, max=32)
CategoricalDimension(name="selection", values=[selection.value for selection in Selection])
]
),
default=Point(
@@ -55,9 +53,7 @@ class Selection(Enum):
verbose=False,
num_jobs=1,
positive=False,
selection=Selection.CYCLIC.value,
min_num_samples_per_input_dimension_to_fit=10,
num_new_samples_per_input_dimension_before_refit=5
selection=Selection.CYCLIC.value
),
description="Wrapper for sklearn.linear_model.Lasso model."
"This wrapper includes optional CV grid search to tune Lasso hyper parameters within each fit."
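The dimensions above line up with sklearn's LassoCV constructor arguments. The actual translation lives in LassoCrossValidatedRegressionModel and is not shown in this hunk; the mapping below is hypothetical (lasso_cv_kwargs is not a function in the repo) and only illustrates the correspondence:

    from sklearn.linear_model import LassoCV

    def lasso_cv_kwargs(model_config):
        # Hypothetical mapping from the hypergrid dimensions above to LassoCV arguments.
        return dict(
            eps=model_config.eps,
            n_alphas=model_config.num_alphas,
            fit_intercept=model_config.fit_intercept,
            normalize=model_config.normalize,
            precompute=model_config.precompute,
            max_iter=model_config.max_iter,
            tol=model_config.tol,
            copy_X=model_config.copy_x,
            cv=model_config.num_cross_validations,
            verbose=model_config.verbose,
            n_jobs=model_config.num_jobs,
            positive=model_config.positive,
            selection=model_config.selection,
        )

    # e.g. regressor = LassoCV(**lasso_cv_kwargs(lasso_cross_validated_config_store.default))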
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
import logging
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LassoCV
@@ -32,7 +33,7 @@ def __init__(
model_config: Point,
input_space: Hypergrid,
output_space: Hypergrid,
logger=None
logger: logging.Logger = None
):
if logger is None:
logger = create_logger("LassoRegressionModel")
@@ -54,12 +55,13 @@ def __init__(
merge_all_categorical_dimensions=True,
drop='first'
)
self.input_dimension_names = self.input_space.dimension_names

self.input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions]
self._projected_input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions]
self.continuous_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.target.dimensions
if isinstance(dimension, ContinuousDimension)]
self.target_dimension_names = [dimension.name for dimension in self.output_space.dimensions]
self.logger.debug(f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.")
self.logger.debug(f"Input dimensions: {str(self._projected_input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.")

assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support single target per Lasso model."

@@ -89,6 +91,15 @@ def __init__(
self.partial_hat_matrix_ = 0
self.regressor_standard_error_ = 0

# When LassoCV is used as part of RERF, it cannot reasonably compute the upper and lower bounds on its input space dimensions,
# as they are a polynomial combination of inputs to RERF. Thus, it approximates them with the empirical min and max.
# These approximations are biased: the lower bound is too large, the upper bound is too small.
# Consequently, during scoring, LassoCV is likely to see input outside of these bounds, but we still want
# LassoCV to produce predictions for those points. So we introduce a little hack: whenever LassoCV is instantiated as part of RERF,
# it should skip input filtering on predict. This field controls this behavior.
self.skip_input_filtering_on_predict = False


@property
def trained(self):
return self._trained
@@ -120,7 +131,7 @@ def should_fit(self, num_samples):
:param num_samples:
:return:
"""
num_input_dims = len(self.input_dimension_names)
num_input_dims = len(self._projected_input_dimension_names)
model_config = self.model_config
if not self.trained:
return num_samples > model_config.min_num_samples_per_input_dimension_to_fit * num_input_dims
@@ -135,19 +146,28 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration
x_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False)
y = target_values_pandas_frame[self.target_dimension_names].to_numpy()
design_matrix = self._transform_x(x_df)

# ensure num_cross_validations < num_samples; and reinstantiate LassoCV regressor
if design_matrix.shape[0] < self.model_config.num_cross_validations:
self.lasso_model_kwargs['cv'] = design_matrix.shape[0] - 1
self._regressor = LassoCV(**self.lasso_model_kwargs)

self._regressor.fit(design_matrix, y)
self._trained = True
self.last_refit_iteration_number = iteration_number

# retain inverse(x.T * x) to use for confidence intervals on predicted values
condition_number = np.linalg.cond(design_matrix)
if condition_number > 10.0 ** 10:
self.logger.info(
f'LassoCV: design_matrix condition number: {condition_number}'
)
if condition_number > 10.0 ** 4:
# add small noise to x to remove singularity,
# expect prediction confidence to be reduced (wider intervals) by doing this
self.logger.info(
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^10."
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10**4."
)
design_matrix += np.random.normal(0, 10.0**-4, size=design_matrix.shape)
design_matrix += np.random.normal(0, 10.0**-2, size=design_matrix.shape)
condition_number = np.linalg.cond(design_matrix)
self.logger.info(
f"Resulting condition number {condition_number}."
@@ -175,6 +195,11 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
valid_rows_index = None
features_df = None
if self.trained:
if not self.skip_input_filtering_on_predict:
feature_values_pandas_frame = self.input_space.filter_out_invalid_rows(
original_dataframe=feature_values_pandas_frame,
exclude_extra_columns=False
)
features_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False)
valid_rows_index = features_df.index

@@ -213,7 +238,7 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
def _transform_x(self, x_df: DataFrame):
# confirm feature_values_pandas_frame contains all expected columns
# if any are missing, impute NaN values
missing_column_names = set.difference(set(self.input_dimension_names), set(x_df.columns.values))
missing_column_names = set.difference(set(self._projected_input_dimension_names), set(x_df.columns.values))
for missing_column_name in missing_column_names:
x_df[missing_column_name] = np.NaN

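Two of the changes in this file are easiest to see in isolation. The condition-number guard added to fit() injects small Gaussian jitter into the design matrix before inverse(x.T * x) is computed for the confidence intervals; a standalone sketch of that pattern (thresholds taken from the diff, the helper itself is illustrative):

    import numpy as np

    def stabilize_design_matrix(design_matrix, threshold=10.0 ** 4, noise_std=10.0 ** -2):
        # If the matrix is near-singular, add small Gaussian noise so the
        # hat-matrix inverse stays computable, at the cost of wider confidence intervals.
        condition_number = np.linalg.cond(design_matrix)
        if condition_number > threshold:
            design_matrix = design_matrix + np.random.normal(0, noise_std, size=design_matrix.shape)
        return design_matrix

Separately, the new skip_input_filtering_on_predict flag defaults to False; per the comment in __init__, the RERF model is expected to set it to True on its internal LassoCV sub-model so that predict() keeps the polynomial-feature rows that fall outside the declared input bounds instead of filtering them out.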
@@ -0,0 +1,48 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
import logging
from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store
from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel
from mlos.Optimizers.RegressionModels.NaiveMultiObjectiveRegressionModel import NaiveMultiObjectiveRegressionModel
from mlos.Spaces import Hypergrid, Point, SimpleHypergrid


class MultiObjectiveLassoCrossValidated(NaiveMultiObjectiveRegressionModel):
"""Maintains multiple LassoCrossValidatedRegressionModels each predicting a different objective.
All single-objective models are configured according to model_config.
"""
def __init__(
self,
model_config: Point,
input_space: Hypergrid,
output_space: Hypergrid,
logger: logging.Logger = None
):
NaiveMultiObjectiveRegressionModel.__init__(
self,
model_type=LassoCrossValidatedRegressionModel,
model_config=model_config,
input_space=input_space,
output_space=output_space,
logger=logger
)


# We just need to assert that the model config belongs in lasso_cross_validated_config_store.parameter_space.
# A more elaborate solution might be needed down the road, but for now this simple solution should suffice.
#
assert model_config in lasso_cross_validated_config_store.parameter_space

for output_dimension in output_space.dimensions:
print(f'output_dimension.name: {output_dimension.name}')
lasso_model = LassoCrossValidatedRegressionModel(
model_config=model_config,
input_space=input_space,
output_space=SimpleHypergrid(name=f"{output_dimension.name}_objective", dimensions=[output_dimension]),
logger=self.logger
)
self._regressors_by_objective_name[output_dimension.name] = lasso_model
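A minimal usage sketch for the new wrapper (not part of this commit). The space construction and constructor call use only APIs visible in this diff; the config store's default accessor is an assumption, flagged inline:

    from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store
    from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated
    from mlos.Spaces import ContinuousDimension, SimpleHypergrid

    input_space = SimpleHypergrid(
        name="params",
        dimensions=[ContinuousDimension(name="x", min=0, max=1)]
    )
    output_space = SimpleHypergrid(
        name="objectives",
        dimensions=[
            ContinuousDimension(name="throughput", min=0, max=10 ** 6),
            ContinuousDimension(name="latency", min=0, max=10 ** 3)
        ]
    )
    multi_objective_lasso = MultiObjectiveLassoCrossValidated(
        model_config=lasso_cross_validated_config_store.default,   # assumed accessor for the default Point
        input_space=input_space,
        output_space=output_space
    )
    # One LassoCrossValidatedRegressionModel is created per objective
    # ("throughput" and "latency"), each with its own single-dimension output space.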