-
Notifications
You must be signed in to change notification settings - Fork 65
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
init checkin to add LassoCV and RERF to optimizers #263
Changes from 16 commits
34db416
107dd85
a575acc
53c9640
710d803
0b48501
d8414c9
e383d35
42c76b0
5662a44
bbda9d6
81c71bb
c6a3ea6
368b333
a8874a2
8810557
9efbcb0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -24,21 +24,19 @@ class Selection(Enum): | |||||||
parameter_space=SimpleHypergrid( | ||||||||
name="lasso_regression_model_config", | ||||||||
dimensions=[ | ||||||||
ContinuousDimension(name="eps", min=0, max=2 ** 16), | ||||||||
DiscreteDimension(name="num_alphas", min=0, max=10 ** 5), | ||||||||
ContinuousDimension(name="eps", min=0, max=10.0 ** -3), | ||||||||
DiscreteDimension(name="num_alphas", min=0, max=200), | ||||||||
CategoricalDimension(name="fit_intercept", values=[False, True]), | ||||||||
CategoricalDimension(name="normalize", values=[False, True]), | ||||||||
CategoricalDimension(name="precompute", values=[False, True]), | ||||||||
DiscreteDimension(name="max_iter", min=0, max=10 ** 5), | ||||||||
ContinuousDimension(name="tol", min=0, max=2 ** 10), | ||||||||
DiscreteDimension(name="max_iter", min=100, max=5 * 10 **3), | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
ContinuousDimension(name="tol", min=0, max=1.0), | ||||||||
CategoricalDimension(name="copy_x", values=[False, True]), | ||||||||
ContinuousDimension(name="num_cross_validations", min=1, max=10), | ||||||||
DiscreteDimension(name="num_cross_validations", min=2, max=10), | ||||||||
CategoricalDimension(name="verbose", values=[False, True]), | ||||||||
DiscreteDimension(name="num_jobs", min=1, max=4), | ||||||||
DiscreteDimension(name="num_jobs", min=1, max=2), | ||||||||
CategoricalDimension(name="positive", values=[False, True]), | ||||||||
CategoricalDimension(name="selection", values=[selection.value for selection in Selection]), | ||||||||
DiscreteDimension(name="min_num_samples_per_input_dimension_to_fit", min=1, max=32), | ||||||||
DiscreteDimension(name="num_new_samples_per_input_dimension_before_refit", min=1, max=32) | ||||||||
CategoricalDimension(name="selection", values=[selection.value for selection in Selection]) | ||||||||
] | ||||||||
), | ||||||||
default=Point( | ||||||||
|
@@ -55,9 +53,7 @@ class Selection(Enum): | |||||||
verbose=False, | ||||||||
num_jobs=1, | ||||||||
positive=False, | ||||||||
selection=Selection.CYCLIC.value, | ||||||||
min_num_samples_per_input_dimension_to_fit=10, | ||||||||
num_new_samples_per_input_dimension_before_refit=5 | ||||||||
selection=Selection.CYCLIC.value | ||||||||
), | ||||||||
description="Wrapper for sklearn.linear_model.Lasso model." | ||||||||
"This wrapper includes optional CV grid search to tune Lasso hyper parameters within each fit." | ||||||||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -2,6 +2,7 @@ | |||||||
# Copyright (c) Microsoft Corporation. | ||||||||
# Licensed under the MIT License. | ||||||||
# | ||||||||
import logging | ||||||||
import numpy as np | ||||||||
from pandas import DataFrame | ||||||||
from sklearn.linear_model import LassoCV | ||||||||
|
@@ -32,7 +33,7 @@ def __init__( | |||||||
model_config: Point, | ||||||||
input_space: Hypergrid, | ||||||||
output_space: Hypergrid, | ||||||||
logger=None | ||||||||
logger: logging.Logger = None | ||||||||
): | ||||||||
if logger is None: | ||||||||
logger = create_logger("LassoRegressionModel") | ||||||||
|
@@ -54,12 +55,13 @@ def __init__( | |||||||
merge_all_categorical_dimensions=True, | ||||||||
drop='first' | ||||||||
) | ||||||||
self.input_dimension_names = self.input_space.dimension_names | ||||||||
|
||||||||
self.input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions] | ||||||||
self._projected_input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions] | ||||||||
self.continuous_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.target.dimensions | ||||||||
if isinstance(dimension, ContinuousDimension)] | ||||||||
self.target_dimension_names = [dimension.name for dimension in self.output_space.dimensions] | ||||||||
self.logger.debug(f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.") | ||||||||
self.logger.debug(f"Input dimensions: {str(self._projected_input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.") | ||||||||
|
||||||||
assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support single target per Lasso model." | ||||||||
|
||||||||
|
@@ -89,6 +91,10 @@ def __init__( | |||||||
self.partial_hat_matrix_ = 0 | ||||||||
self.regressor_standard_error_ = 0 | ||||||||
|
||||||||
# THE HACK | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We may need to explain a little more here. If I remember right: When LassoCV is used as part of RERF, it cannot reasonably compute the upper and lower bounds on its input space dimensions, as they are a polynomial combination of inputs to RERF. Thus, it approximates them with the empirical min and max. These approximations are biased: the lower bound is too large, the upper bound is too small. Consequently, during scoring, LassoCV is likely to see input outside of these bounds, but we still want LassoCV to produce predictions for those points. So we introduce a little hack: whenever LassoCV is instantiated as part of RERF, it should skip input filtering on predict. This field controls this behavior. Feel free to just copy-paste that in, or polish it to your liking! |
||||||||
self.skip_input_filtering_on_predict = False | ||||||||
|
||||||||
|
||||||||
@property | ||||||||
def trained(self): | ||||||||
return self._trained | ||||||||
|
@@ -120,7 +126,7 @@ def should_fit(self, num_samples): | |||||||
:param num_samples: | ||||||||
:return: | ||||||||
""" | ||||||||
num_input_dims = len(self.input_dimension_names) | ||||||||
num_input_dims = len(self._projected_input_dimension_names) | ||||||||
model_config = self.model_config | ||||||||
if not self.trained: | ||||||||
return num_samples > model_config.min_num_samples_per_input_dimension_to_fit * num_input_dims | ||||||||
|
@@ -135,19 +141,28 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration | |||||||
x_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False) | ||||||||
y = target_values_pandas_frame[self.target_dimension_names].to_numpy() | ||||||||
design_matrix = self._transform_x(x_df) | ||||||||
|
||||||||
# ensure num_cross_validations < num_samples; and reinstantiate LassoCV regressor | ||||||||
if design_matrix.shape[0] < self.model_config.num_cross_validations: | ||||||||
self.lasso_model_kwargs['cv'] = design_matrix.shape[0] - 1 | ||||||||
self._regressor = LassoCV(**self.lasso_model_kwargs) | ||||||||
|
||||||||
self._regressor.fit(design_matrix, y) | ||||||||
self._trained = True | ||||||||
self.last_refit_iteration_number = iteration_number | ||||||||
|
||||||||
# retain inverse(x.T * x) to use for confidence intervals on predicted values | ||||||||
condition_number = np.linalg.cond(design_matrix) | ||||||||
if condition_number > 10.0 ** 10: | ||||||||
self.logger.info( | ||||||||
f'LassoCV: design_matrix condition number: {condition_number}' | ||||||||
) | ||||||||
if condition_number > 10.0 ** 4: | ||||||||
# add small noise to x to remove singularity, | ||||||||
# expect prediction confidence to be reduced (wider intervals) by doing this | ||||||||
self.logger.info( | ||||||||
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^10." | ||||||||
f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^4." | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 10**4
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's clear what you mean... but my CDO strongly suggests that we should stick to the Python exponentiation operator :) |
||||||||
) | ||||||||
design_matrix += np.random.normal(0, 10.0**-4, size=design_matrix.shape) | ||||||||
design_matrix += np.random.normal(0, 10.0**-2, size=design_matrix.shape) | ||||||||
condition_number = np.linalg.cond(design_matrix) | ||||||||
self.logger.info( | ||||||||
f"Resulting condition number {condition_number}." | ||||||||
|
@@ -175,6 +190,11 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True): | |||||||
valid_rows_index = None | ||||||||
features_df = None | ||||||||
if self.trained: | ||||||||
if not self.skip_input_filtering_on_predict: | ||||||||
feature_values_pandas_frame = self.input_space.filter_out_invalid_rows( | ||||||||
original_dataframe=feature_values_pandas_frame, | ||||||||
exclude_extra_columns=False | ||||||||
) | ||||||||
features_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False) | ||||||||
valid_rows_index = features_df.index | ||||||||
|
||||||||
|
@@ -213,7 +233,7 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True): | |||||||
def _transform_x(self, x_df: DataFrame): | ||||||||
# confirm feature_values_pandas_frame contains all expected columns | ||||||||
# if any are missing, impute NaN values | ||||||||
missing_column_names = set.difference(set(self.input_dimension_names), set(x_df.columns.values)) | ||||||||
missing_column_names = set.difference(set(self._projected_input_dimension_names), set(x_df.columns.values)) | ||||||||
for missing_column_name in missing_column_names: | ||||||||
x_df[missing_column_name] = np.NaN | ||||||||
|
||||||||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,48 @@ | ||||||||
# | ||||||||
# Copyright (c) Microsoft Corporation. | ||||||||
# Licensed under the MIT License. | ||||||||
# | ||||||||
import logging | ||||||||
from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store | ||||||||
from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel | ||||||||
from mlos.Optimizers.RegressionModels.NaiveMultiObjectiveRegressionModel import NaiveMultiObjectiveRegressionModel | ||||||||
from mlos.Spaces import Hypergrid, Point, SimpleHypergrid | ||||||||
|
||||||||
|
||||||||
class MultiObjectiveLassoCrossValidated(NaiveMultiObjectiveRegressionModel): | ||||||||
"""Maintains multiple HomogeneousRandomForestRegressionModels each predicting a different objective. | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
All single-objective models are configured according to model_config. | ||||||||
|
||||||||
""" | ||||||||
def __init__( | ||||||||
self, | ||||||||
model_config: Point, | ||||||||
input_space: Hypergrid, | ||||||||
output_space: Hypergrid, | ||||||||
logger: logging.Logger = None | ||||||||
): | ||||||||
NaiveMultiObjectiveRegressionModel.__init__( | ||||||||
self, | ||||||||
model_type=LassoCrossValidatedRegressionModel, | ||||||||
model_config=model_config, | ||||||||
input_space=input_space, | ||||||||
output_space=output_space, | ||||||||
logger=logger | ||||||||
) | ||||||||
|
||||||||
|
||||||||
# We just need to assert that the model config belongs in homogeneous_random_forest_config_store.parameter_space. | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
# A more elaborate solution might be needed down the road, but for now this simple solution should suffice. | ||||||||
# | ||||||||
assert model_config in lasso_cross_validated_config_store.parameter_space | ||||||||
|
||||||||
for output_dimension in output_space.dimensions: | ||||||||
print(f'output_dimension.name: {output_dimension.name}') | ||||||||
lasso_model = LassoCrossValidatedRegressionModel( | ||||||||
model_config=model_config, | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You copy the model_config in multi-objective RERF, but not here. Why? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Values in the model config are altered by the random forest GridSearchCV for the RERF. When these configs were assigned to different objectives, they stomped all over each other. I'll track down the lines in RERF model that alter the model_config and explain this in the MultiObjectiveRERF code where you've spotted this difference. |
||||||||
input_space=input_space, | ||||||||
output_space=SimpleHypergrid(name=f"{output_dimension.name}_objective", dimensions=[output_dimension]), | ||||||||
logger=self.logger | ||||||||
) | ||||||||
self._regressors_by_objective_name[output_dimension.name] = lasso_model |
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,47 @@ | ||||||||
# | ||||||||
# Copyright (c) Microsoft Corporation. | ||||||||
# Licensed under the MIT License. | ||||||||
# | ||||||||
import logging | ||||||||
from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestConfigStore import regression_enhanced_random_forest_config_store | ||||||||
from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestModel import RegressionEnhancedRandomForestRegressionModel | ||||||||
from mlos.Optimizers.RegressionModels.NaiveMultiObjectiveRegressionModel import NaiveMultiObjectiveRegressionModel | ||||||||
from mlos.Spaces import Hypergrid, Point, SimpleHypergrid | ||||||||
|
||||||||
|
||||||||
class MultiObjectiveRegressionEnhancedRandomForest(NaiveMultiObjectiveRegressionModel): | ||||||||
"""Maintains multiple HomogeneousRandomForestRegressionModels each predicting a different objective. | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
All single-objective models are configured according to model_config. | ||||||||
|
||||||||
""" | ||||||||
def __init__( | ||||||||
self, | ||||||||
model_config: Point, | ||||||||
input_space: Hypergrid, | ||||||||
output_space: Hypergrid, | ||||||||
logger: logging.Logger = None | ||||||||
): | ||||||||
NaiveMultiObjectiveRegressionModel.__init__( | ||||||||
self, | ||||||||
model_type=RegressionEnhancedRandomForestRegressionModel, | ||||||||
model_config=model_config, | ||||||||
input_space=input_space, | ||||||||
output_space=output_space, | ||||||||
logger=logger | ||||||||
) | ||||||||
|
||||||||
|
||||||||
# We just need to assert that the model config belongs in homogeneous_random_forest_config_store.parameter_space. | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
# A more elaborate solution might be needed down the road, but for now this simple solution should suffice. | ||||||||
# | ||||||||
assert model_config in regression_enhanced_random_forest_config_store.parameter_space | ||||||||
|
||||||||
for output_dimension in output_space.dimensions: | ||||||||
rerf_model = RegressionEnhancedRandomForestRegressionModel( | ||||||||
model_config=model_config.copy(), | ||||||||
input_space=input_space, | ||||||||
output_space=SimpleHypergrid(name=f"{output_dimension.name}_objective", dimensions=[output_dimension]), | ||||||||
logger=self.logger | ||||||||
) | ||||||||
self._regressors_by_objective_name[output_dimension.name] = rerf_model |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.