Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

init checkin to add LassoCV and RERF to optimizers #263

Merged
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 36 additions & 7 deletions source/Mlos.Python/mlos/Optimizers/BayesianOptimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from mlos.Optimizers.RegressionModels.GoodnessOfFitMetrics import DataSetType
from mlos.Optimizers.RegressionModels.HomogeneousRandomForestRegressionModel import HomogeneousRandomForestRegressionModel
from mlos.Optimizers.RegressionModels.MultiObjectiveHomogeneousRandomForest import MultiObjectiveHomogeneousRandomForest
from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated
from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest
from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionModel import MultiObjectiveRegressionModel
from mlos.Optimizers.RegressionModels.Prediction import Prediction
from mlos.Tracer import trace
Expand Down Expand Up @@ -59,20 +61,47 @@ def __init__(

# Now let's put together the surrogate model.
#
print(f'self.optimizer_config.surrogate_model_implementation: {self.optimizer_config.surrogate_model_implementation}')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
print(f'self.optimizer_config.surrogate_model_implementation: {self.optimizer_config.surrogate_model_implementation}')
self.logger.info(f'self.optimizer_config.surrogate_model_implementation: {self.optimizer_config.surrogate_model_implementation}')

assert self.optimizer_config.surrogate_model_implementation in (
HomogeneousRandomForestRegressionModel.__name__,
MultiObjectiveHomogeneousRandomForest.__name__
MultiObjectiveHomogeneousRandomForest.__name__,
MultiObjectiveLassoCrossValidated.__name__,
MultiObjectiveRegressionEnhancedRandomForest.__name__
)

# Note that even if the user requested a HomogeneousRandomForestRegressionModel, we still create a MultiObjectiveRegressionModel
# with just a single RandomForest inside it. This means we have to maintain only a single interface.
#
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest(
model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
if self.optimizer_config.surrogate_model_implementation == HomogeneousRandomForestRegressionModel.__name__:
byte-sculptor marked this conversation as resolved.
Show resolved Hide resolved
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest(
model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveHomogeneousRandomForest.__name__:
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest(
model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveLassoCrossValidated.__name__:
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveLassoCrossValidated(
model_config=self.optimizer_config.lasso_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveRegressionEnhancedRandomForest.__name__:
self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveRegressionEnhancedRandomForest(
model_config=self.optimizer_config.regression_enhanced_random_forest_regression_model_config,
input_space=self.optimization_problem.feature_space,
output_space=self.surrogate_model_output_space,
logger=self.logger
)
else:
raise RuntimeError(f"Unrecognized surrogate_model_implementation {self.optimizer_config.surrogate_model_implementation}")

# Now let's put together the experiment designer that will suggest parameters for each experiment.
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,24 @@
#
from mlos.Spaces import SimpleHypergrid, DiscreteDimension, CategoricalDimension, Point
from mlos.Spaces.Configs.ComponentConfigStore import ComponentConfigStore

from mlos.Optimizers.ExperimentDesigner.ExperimentDesigner import ExperimentDesigner, experiment_designer_config_store
from mlos.Optimizers.RegressionModels.HomogeneousRandomForestConfigStore import homogeneous_random_forest_config_store
from mlos.Optimizers.RegressionModels.HomogeneousRandomForestRegressionModel import HomogeneousRandomForestRegressionModel
from mlos.Optimizers.RegressionModels.MultiObjectiveHomogeneousRandomForest import MultiObjectiveHomogeneousRandomForest

from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store
from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated
from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestConfigStore import regression_enhanced_random_forest_config_store
from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest

bayesian_optimizer_config_store = ComponentConfigStore(
parameter_space=SimpleHypergrid(
name="bayesian_optimizer_config",
dimensions=[
CategoricalDimension(name="surrogate_model_implementation", values=[
HomogeneousRandomForestRegressionModel.__name__,
MultiObjectiveHomogeneousRandomForest.__name__
MultiObjectiveHomogeneousRandomForest.__name__,
MultiObjectiveLassoCrossValidated.__name__,
MultiObjectiveRegressionEnhancedRandomForest.__name__
]),
CategoricalDimension(name="experiment_designer_implementation", values=[ExperimentDesigner.__name__]),
DiscreteDimension(name="min_samples_required_for_guided_design_of_experiments", min=2, max=100)
Expand All @@ -30,6 +34,20 @@
HomogeneousRandomForestRegressionModel.__name__,
MultiObjectiveHomogeneousRandomForest.__name__
])
).join(
subgrid=lasso_cross_validated_config_store.parameter_space,
on_external_dimension=CategoricalDimension(
name="surrogate_model_implementation",
values=[
MultiObjectiveLassoCrossValidated.__name__
])
).join(
subgrid=regression_enhanced_random_forest_config_store.parameter_space,
on_external_dimension=CategoricalDimension(
name="surrogate_model_implementation",
values=[
MultiObjectiveRegressionEnhancedRandomForest.__name__
])
).join(
subgrid=experiment_designer_config_store.parameter_space,
on_external_dimension=CategoricalDimension(name="experiment_designer_implementation", values=[ExperimentDesigner.__name__])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,19 @@ class Selection(Enum):
parameter_space=SimpleHypergrid(
name="lasso_regression_model_config",
dimensions=[
ContinuousDimension(name="eps", min=0, max=2 ** 16),
DiscreteDimension(name="num_alphas", min=0, max=10 ** 5),
ContinuousDimension(name="eps", min=0, max=10.0 ** -3),
DiscreteDimension(name="num_alphas", min=0, max=200),
CategoricalDimension(name="fit_intercept", values=[False, True]),
CategoricalDimension(name="normalize", values=[False, True]),
CategoricalDimension(name="precompute", values=[False, True]),
DiscreteDimension(name="max_iter", min=0, max=10 ** 5),
ContinuousDimension(name="tol", min=0, max=2 ** 10),
DiscreteDimension(name="max_iter", min=100, max=5 * 10 **3),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
DiscreteDimension(name="max_iter", min=100, max=5 * 10 **3),
DiscreteDimension(name="max_iter", min=100, max=5 * (10 ** 3)),

ContinuousDimension(name="tol", min=0, max=1.0),
CategoricalDimension(name="copy_x", values=[False, True]),
ContinuousDimension(name="num_cross_validations", min=1, max=10),
DiscreteDimension(name="num_cross_validations", min=2, max=10),
CategoricalDimension(name="verbose", values=[False, True]),
DiscreteDimension(name="num_jobs", min=1, max=4),
DiscreteDimension(name="num_jobs", min=1, max=2),
CategoricalDimension(name="positive", values=[False, True]),
CategoricalDimension(name="selection", values=[selection.value for selection in Selection]),
DiscreteDimension(name="min_num_samples_per_input_dimension_to_fit", min=1, max=32),
DiscreteDimension(name="num_new_samples_per_input_dimension_before_refit", min=1, max=32)
CategoricalDimension(name="selection", values=[selection.value for selection in Selection])
]
),
default=Point(
Expand All @@ -55,9 +53,7 @@ class Selection(Enum):
verbose=False,
num_jobs=1,
positive=False,
selection=Selection.CYCLIC.value,
min_num_samples_per_input_dimension_to_fit=10,
num_new_samples_per_input_dimension_before_refit=5
selection=Selection.CYCLIC.value
),
description="Wrapper for sklearn.linear_model.Lasso model."
"This wrapper includes optional CV grid search to tune Lasso hyper parameters within each fit."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
import logging
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LassoCV
Expand Down Expand Up @@ -32,7 +33,7 @@ def __init__(
model_config: Point,
input_space: Hypergrid,
output_space: Hypergrid,
logger=None
logger: logging.Logger = None
):
if logger is None:
logger = create_logger("LassoRegressionModel")
Expand All @@ -54,12 +55,13 @@ def __init__(
merge_all_categorical_dimensions=True,
drop='first'
)
self.input_dimension_names = self.input_space.dimension_names

self.input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions]
self._projected_input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions]
self.continuous_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.target.dimensions
if isinstance(dimension, ContinuousDimension)]
self.target_dimension_names = [dimension.name for dimension in self.output_space.dimensions]
self.logger.debug(f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.")
self.logger.debug(f"Input dimensions: {str(self._projected_input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.")

assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support single target per Lasso model."

Expand Down Expand Up @@ -89,6 +91,10 @@ def __init__(
self.partial_hat_matrix_ = 0
self.regressor_standard_error_ = 0

# THE HACK
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may need to explain a little more here. If I remember right:

When LassoCV is used as part of RERF, it cannot reasonably compute the upper and lower bounds on its input space dimensions, as they are a polynomial combination of inputs to RERF. Thus, it approximates them with the empirical min and max. These approximations are biased: the lower bound is too large, the upper bound is too small. Consequently, during scoring, LassoCV is likely to see inputs outside of these bounds, but we still want LassoCV to produce predictions for those points. So we introduce a little hack: whenever LassoCV is instantiated as part of RERF, it should skip input filtering on predict. This field controls this behavior.

Feel free to just copy-paste that in, or polish it to your liking!

self.skip_input_filtering_on_predict = False


@property
def trained(self):
return self._trained
Expand Down Expand Up @@ -120,7 +126,7 @@ def should_fit(self, num_samples):
:param num_samples:
:return:
"""
num_input_dims = len(self.input_dimension_names)
num_input_dims = len(self._projected_input_dimension_names)
model_config = self.model_config
if not self.trained:
return num_samples > model_config.min_num_samples_per_input_dimension_to_fit * num_input_dims
Expand All @@ -135,19 +141,28 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration
x_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False)
y = target_values_pandas_frame[self.target_dimension_names].to_numpy()
design_matrix = self._transform_x(x_df)

# ensure num_cross_validations < num_samples; and reinstantiate LassoCV regressor
if design_matrix.shape[0] < self.model_config.num_cross_validations:
self.lasso_model_kwargs['cv'] = design_matrix.shape[0] - 1
self._regressor = LassoCV(**self.lasso_model_kwargs)

self._regressor.fit(design_matrix, y)
self._trained = True
self.last_refit_iteration_number = iteration_number

# retain inverse(x.T * x) to use for confidence intervals on predicted values
condition_number = np.linalg.cond(design_matrix)
if condition_number > 10.0 ** 10:
self.logger.info(
f'LassoCV: design_matrix condition number: {condition_number}'
)
if condition_number > 10.0 ** 4:
# add small noise to x to remove singularity,
# expect prediction confidence to be reduced (wider intervals) by doing this
self.logger.info(
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^10."
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^4."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

10**4

Suggested change
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^4."
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10**4."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's clear what you mean... but my CDO strongly suggests that we should stick to the Python exponentiation operator :)

)
design_matrix += np.random.normal(0, 10.0**-4, size=design_matrix.shape)
design_matrix += np.random.normal(0, 10.0**-2, size=design_matrix.shape)
condition_number = np.linalg.cond(design_matrix)
self.logger.info(
f"Resulting condition number {condition_number}."
Expand Down Expand Up @@ -175,6 +190,11 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
valid_rows_index = None
features_df = None
if self.trained:
if not self.skip_input_filtering_on_predict:
feature_values_pandas_frame = self.input_space.filter_out_invalid_rows(
original_dataframe=feature_values_pandas_frame,
exclude_extra_columns=False
)
features_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False)
valid_rows_index = features_df.index

Expand Down Expand Up @@ -213,7 +233,7 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
def _transform_x(self, x_df: DataFrame):
# confirm feature_values_pandas_frame contains all expected columns
# if any are missing, impute NaN values
missing_column_names = set.difference(set(self.input_dimension_names), set(x_df.columns.values))
missing_column_names = set.difference(set(self._projected_input_dimension_names), set(x_df.columns.values))
for missing_column_name in missing_column_names:
x_df[missing_column_name] = np.NaN

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
import logging
from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store
from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel
from mlos.Optimizers.RegressionModels.NaiveMultiObjectiveRegressionModel import NaiveMultiObjectiveRegressionModel
from mlos.Spaces import Hypergrid, Point, SimpleHypergrid


class MultiObjectiveLassoCrossValidated(NaiveMultiObjectiveRegressionModel):
"""Maintains multiple HomogeneousRandomForestRegressionModels each predicting a different objective.
Copy link
Contributor

@byte-sculptor byte-sculptor Sep 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"""Maintains multiple HomogeneousRandomForestRegressionModels each predicting a different objective.
"""Maintains multiple LassoCrossValidatedRegressionModels each predicting a different objective.


All single-objective models are configured according to model_config.

"""
def __init__(
self,
model_config: Point,
input_space: Hypergrid,
output_space: Hypergrid,
logger: logging.Logger = None
):
NaiveMultiObjectiveRegressionModel.__init__(
self,
model_type=LassoCrossValidatedRegressionModel,
model_config=model_config,
input_space=input_space,
output_space=output_space,
logger=logger
)


# We just need to assert that the model config belongs in homogeneous_random_forest_config_store.parameter_space.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# We just need to assert that the model config belongs in homogeneous_random_forest_config_store.parameter_space.
# We just need to assert that the model config belongs in lasso_cross_validated_config_store.parameter_space.

# A more elaborate solution might be needed down the road, but for now this simple solution should suffice.
#
assert model_config in lasso_cross_validated_config_store.parameter_space

for output_dimension in output_space.dimensions:
print(f'output_dimension.name: {output_dimension.name}')
lasso_model = LassoCrossValidatedRegressionModel(
model_config=model_config,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You copy the model_config in multi-objective RERF, but not here. Why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Values in the model config are altered by the random forest GridSearchCV for the RERF. When these configs were assigned to different objectives, they stomped all over each other. I'll track down the lines in the RERF model that alter the model_config and explain this in the MultiObjectiveRERF code where you've spotted this difference.

input_space=input_space,
output_space=SimpleHypergrid(name=f"{output_dimension.name}_objective", dimensions=[output_dimension]),
logger=self.logger
)
self._regressors_by_objective_name[output_dimension.name] = lasso_model
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
import logging
from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestConfigStore import regression_enhanced_random_forest_config_store
from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestModel import RegressionEnhancedRandomForestRegressionModel
from mlos.Optimizers.RegressionModels.NaiveMultiObjectiveRegressionModel import NaiveMultiObjectiveRegressionModel
from mlos.Spaces import Hypergrid, Point, SimpleHypergrid


class MultiObjectiveRegressionEnhancedRandomForest(NaiveMultiObjectiveRegressionModel):
"""Maintains multiple HomogeneousRandomForestRegressionModels each predicting a different objective.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"""Maintains multiple HomogeneousRandomForestRegressionModels each predicting a different objective.
"""Maintains multiple RegressionEnhancedRandomForestRegressionModel each predicting a different objective.


All single-objective models are configured according to model_config.

"""
def __init__(
self,
model_config: Point,
input_space: Hypergrid,
output_space: Hypergrid,
logger: logging.Logger = None
):
NaiveMultiObjectiveRegressionModel.__init__(
self,
model_type=RegressionEnhancedRandomForestRegressionModel,
model_config=model_config,
input_space=input_space,
output_space=output_space,
logger=logger
)


# We just need to assert that the model config belongs in homogeneous_random_forest_config_store.parameter_space.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# We just need to assert that the model config belongs in homogeneous_random_forest_config_store.parameter_space.
# We just need to assert that the model config belongs in regression_enhanced_random_forest_config_store.parameter_space.

# A more elaborate solution might be needed down the road, but for now this simple solution should suffice.
#
assert model_config in regression_enhanced_random_forest_config_store.parameter_space

for output_dimension in output_space.dimensions:
rerf_model = RegressionEnhancedRandomForestRegressionModel(
model_config=model_config.copy(),
input_space=input_space,
output_space=SimpleHypergrid(name=f"{output_dimension.name}_objective", dimensions=[output_dimension]),
logger=self.logger
)
self._regressors_by_objective_name[output_dimension.name] = rerf_model
Loading