From 791d670a4c44467b2b4c9633f8aa1bebab50771f Mon Sep 17 00:00:00 2001
From: edcthayer
Date: Wed, 29 Sep 2021 22:51:18 -0700
Subject: [PATCH] init checkin to add LassoCV and RERF to optimizers (#263)

* init checkin to add LassoCV and RERF to optimizers
* fixes to pylint catches + ...
* addressing more random config failures
* continued to clean up RERF hyperparam config space and restricted some values in unit tests for BayesianOptimizer
* fixes to pylint catches
* cleaned up random model_config unit test failures in LassoCV and RERF models
* force lassoCV cv parameter < num_samples
* cleaned up comments, restricted sklearnRF max_samples range
* correcting incorrect DEFAULT point in SKLearnRF model_config
* cleaned up hypergrid adapters to solve failing random model_config optimizer tests
* cleaned up pylint issues
* fixed additional rerf random config failures and tried to accelerate tests
* decreased num random config tested from 100 to prev 10
* fixes to allow gRPC random optimizer config unit tests to succeed
* additional fixes to allow gRPC random optimizer config unit tests to succeed
* reduced unit test duration by reducing train/test sizes in new optimizers
* addressing review feedback

Co-authored-by: Ed Thayer
---
 .../mlos/Optimizers/BayesianOptimizer.py      | 43 ++++++--
 .../BayesianOptimizerConfigStore.py           | 24 ++++-
 .../LassoCrossValidatedConfigStore.py         | 20 ++--
 .../LassoCrossValidatedRegressionModel.py     | 41 ++++++--
 .../MultiObjectiveLassoCrossValidated.py      | 48 +++++++++
 ...ObjectiveRegressionEnhancedRandomForest.py | 49 ++++++++++
 ...gressionEnhancedRandomForestConfigStore.py | 11 +--
 .../RegressionEnhancedRandomForestModel.py    | 98 ++++++++++++-------
 ...klearnRandomForestRegressionModelConfig.py | 14 +--
 .../TestLassoCrossValidatedRegressionModel.py |  5 +-
 .../TestMultiObjectiveLassoCrossValidated.py  | 74 ++++++++++++++
 ...ObjectiveRegressionEnhancedRandomForest.py | 73 ++++++++++++++
 ...TestRegressionEnhancedRandomForestModel.py | 15 +--
 .../unit_tests/TestBayesianOptimizer.py       | 21 +++-
 .../TestBayesianOptimizerGrpcClient.py        | 26 ++++-
 15 files changed, 468 insertions(+), 94 deletions(-)
 create mode 100644 source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveLassoCrossValidated.py
 create mode 100644 source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveRegressionEnhancedRandomForest.py
 create mode 100644 source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveLassoCrossValidated.py
 create mode 100644 source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveRegressionEnhancedRandomForest.py

diff --git a/source/Mlos.Python/mlos/Optimizers/BayesianOptimizer.py b/source/Mlos.Python/mlos/Optimizers/BayesianOptimizer.py
index 970dab5471..13c0481f32 100644
--- a/source/Mlos.Python/mlos/Optimizers/BayesianOptimizer.py
+++ b/source/Mlos.Python/mlos/Optimizers/BayesianOptimizer.py
@@ -14,6 +14,8 @@
 from mlos.Optimizers.RegressionModels.GoodnessOfFitMetrics import DataSetType
 from mlos.Optimizers.RegressionModels.HomogeneousRandomForestRegressionModel import HomogeneousRandomForestRegressionModel
 from mlos.Optimizers.RegressionModels.MultiObjectiveHomogeneousRandomForest import MultiObjectiveHomogeneousRandomForest
+from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated
+from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest
 from
mlos.Optimizers.RegressionModels.MultiObjectiveRegressionModel import MultiObjectiveRegressionModel from mlos.Optimizers.RegressionModels.Prediction import Prediction from mlos.Tracer import trace @@ -59,20 +61,47 @@ def __init__( # Now let's put together the surrogate model. # + self.logger.info(f'self.optimizer_config.surrogate_model_implementation: {self.optimizer_config.surrogate_model_implementation}') assert self.optimizer_config.surrogate_model_implementation in ( HomogeneousRandomForestRegressionModel.__name__, - MultiObjectiveHomogeneousRandomForest.__name__ + MultiObjectiveHomogeneousRandomForest.__name__, + MultiObjectiveLassoCrossValidated.__name__, + MultiObjectiveRegressionEnhancedRandomForest.__name__ ) # Note that even if the user requested a HomogeneousRandomForestRegressionModel, we still create a MultiObjectiveRegressionModel # with just a single RandomForest inside it. This means we have to maintain only a single interface. # - self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest( - model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config, - input_space=self.optimization_problem.feature_space, - output_space=self.surrogate_model_output_space, - logger=self.logger - ) + if self.optimizer_config.surrogate_model_implementation == HomogeneousRandomForestRegressionModel.__name__: + self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest( + model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config, + input_space=self.optimization_problem.feature_space, + output_space=self.surrogate_model_output_space, + logger=self.logger + ) + elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveHomogeneousRandomForest.__name__: + self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveHomogeneousRandomForest( + model_config=self.optimizer_config.homogeneous_random_forest_regression_model_config, + input_space=self.optimization_problem.feature_space, + output_space=self.surrogate_model_output_space, + logger=self.logger + ) + elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveLassoCrossValidated.__name__: + self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveLassoCrossValidated( + model_config=self.optimizer_config.lasso_regression_model_config, + input_space=self.optimization_problem.feature_space, + output_space=self.surrogate_model_output_space, + logger=self.logger + ) + elif self.optimizer_config.surrogate_model_implementation == MultiObjectiveRegressionEnhancedRandomForest.__name__: + self.surrogate_model: MultiObjectiveRegressionModel = MultiObjectiveRegressionEnhancedRandomForest( + model_config=self.optimizer_config.regression_enhanced_random_forest_regression_model_config, + input_space=self.optimization_problem.feature_space, + output_space=self.surrogate_model_output_space, + logger=self.logger + ) + else: + raise RuntimeError(f"Unrecognized surrogate_model_implementation {self.optimizer_config.surrogate_model_implementation}") # Now let's put together the experiment designer that will suggest parameters for each experiment. 
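A brief usage sketch of how a caller opts into one of the new surrogate models. This is illustrative only and mirrors the unit tests later in this patch; bayesian_optimizer_config_store, its parameter_space.random() call, and the joined model sub-configs are defined in BayesianOptimizerConfigStore.py below, while the specific attribute tweak shown is a hypothetical example rather than part of the patch.

from mlos.Optimizers.BayesianOptimizerConfigStore import bayesian_optimizer_config_store
from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated

# Draw a configuration from the joined parameter space; when the draw selects one of the
# new surrogate implementations, the matching model sub-config is attached to the point.
optimizer_config = bayesian_optimizer_config_store.parameter_space.random()

if optimizer_config.surrogate_model_implementation == MultiObjectiveLassoCrossValidated.__name__:
    # The lasso sub-config is only present for this choice of surrogate model;
    # tweaking one of its dimensions here is just an example.
    optimizer_config.lasso_regression_model_config.num_cross_validations = 5

# The BayesianOptimizer constructor above dispatches on surrogate_model_implementation and
# instantiates the corresponding MultiObjectiveRegressionModel with that sub-config.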
# diff --git a/source/Mlos.Python/mlos/Optimizers/BayesianOptimizerConfigStore.py b/source/Mlos.Python/mlos/Optimizers/BayesianOptimizerConfigStore.py index 71d391d857..2672d2955e 100644 --- a/source/Mlos.Python/mlos/Optimizers/BayesianOptimizerConfigStore.py +++ b/source/Mlos.Python/mlos/Optimizers/BayesianOptimizerConfigStore.py @@ -4,12 +4,14 @@ # from mlos.Spaces import SimpleHypergrid, DiscreteDimension, CategoricalDimension, Point from mlos.Spaces.Configs.ComponentConfigStore import ComponentConfigStore - from mlos.Optimizers.ExperimentDesigner.ExperimentDesigner import ExperimentDesigner, experiment_designer_config_store from mlos.Optimizers.RegressionModels.HomogeneousRandomForestConfigStore import homogeneous_random_forest_config_store from mlos.Optimizers.RegressionModels.HomogeneousRandomForestRegressionModel import HomogeneousRandomForestRegressionModel from mlos.Optimizers.RegressionModels.MultiObjectiveHomogeneousRandomForest import MultiObjectiveHomogeneousRandomForest - +from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store +from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated +from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestConfigStore import regression_enhanced_random_forest_config_store +from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest bayesian_optimizer_config_store = ComponentConfigStore( parameter_space=SimpleHypergrid( @@ -17,7 +19,9 @@ dimensions=[ CategoricalDimension(name="surrogate_model_implementation", values=[ HomogeneousRandomForestRegressionModel.__name__, - MultiObjectiveHomogeneousRandomForest.__name__ + MultiObjectiveHomogeneousRandomForest.__name__, + MultiObjectiveLassoCrossValidated.__name__, + MultiObjectiveRegressionEnhancedRandomForest.__name__ ]), CategoricalDimension(name="experiment_designer_implementation", values=[ExperimentDesigner.__name__]), DiscreteDimension(name="min_samples_required_for_guided_design_of_experiments", min=2, max=100) @@ -30,6 +34,20 @@ HomogeneousRandomForestRegressionModel.__name__, MultiObjectiveHomogeneousRandomForest.__name__ ]) + ).join( + subgrid=lasso_cross_validated_config_store.parameter_space, + on_external_dimension=CategoricalDimension( + name="surrogate_model_implementation", + values=[ + MultiObjectiveLassoCrossValidated.__name__ + ]) + ).join( + subgrid=regression_enhanced_random_forest_config_store.parameter_space, + on_external_dimension=CategoricalDimension( + name="surrogate_model_implementation", + values=[ + MultiObjectiveRegressionEnhancedRandomForest.__name__ + ]) ).join( subgrid=experiment_designer_config_store.parameter_space, on_external_dimension=CategoricalDimension(name="experiment_designer_implementation", values=[ExperimentDesigner.__name__]) diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedConfigStore.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedConfigStore.py index a80f8ad075..d0460ef594 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedConfigStore.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedConfigStore.py @@ -24,21 +24,19 @@ class Selection(Enum): parameter_space=SimpleHypergrid( name="lasso_regression_model_config", dimensions=[ - ContinuousDimension(name="eps", min=0, max=2 ** 16), - DiscreteDimension(name="num_alphas", min=0, max=10 
** 5), + ContinuousDimension(name="eps", min=0, max=10.0 ** -3), + DiscreteDimension(name="num_alphas", min=0, max=200), CategoricalDimension(name="fit_intercept", values=[False, True]), CategoricalDimension(name="normalize", values=[False, True]), CategoricalDimension(name="precompute", values=[False, True]), - DiscreteDimension(name="max_iter", min=0, max=10 ** 5), - ContinuousDimension(name="tol", min=0, max=2 ** 10), + DiscreteDimension(name="max_iter", min=100, max=5 * 10 ** 3), + ContinuousDimension(name="tol", min=0, max=1.0), CategoricalDimension(name="copy_x", values=[False, True]), - ContinuousDimension(name="num_cross_validations", min=1, max=10), + DiscreteDimension(name="num_cross_validations", min=2, max=10), CategoricalDimension(name="verbose", values=[False, True]), - DiscreteDimension(name="num_jobs", min=1, max=4), + DiscreteDimension(name="num_jobs", min=1, max=2), CategoricalDimension(name="positive", values=[False, True]), - CategoricalDimension(name="selection", values=[selection.value for selection in Selection]), - DiscreteDimension(name="min_num_samples_per_input_dimension_to_fit", min=1, max=32), - DiscreteDimension(name="num_new_samples_per_input_dimension_before_refit", min=1, max=32) + CategoricalDimension(name="selection", values=[selection.value for selection in Selection]) ] ), default=Point( @@ -55,9 +53,7 @@ class Selection(Enum): verbose=False, num_jobs=1, positive=False, - selection=Selection.CYCLIC.value, - min_num_samples_per_input_dimension_to_fit=10, - num_new_samples_per_input_dimension_before_refit=5 + selection=Selection.CYCLIC.value ), description="Wrapper for sklearn.linear_model.Lasso model." "This wrapper includes optional CV grid search to tune Lasso hyper parameters within each fit." diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedRegressionModel.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedRegressionModel.py index 84606ec23f..4aa75dee04 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedRegressionModel.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedRegressionModel.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
# +import logging import numpy as np from pandas import DataFrame from sklearn.linear_model import LassoCV @@ -32,7 +33,7 @@ def __init__( model_config: Point, input_space: Hypergrid, output_space: Hypergrid, - logger=None + logger: logging.Logger = None ): if logger is None: logger = create_logger("LassoRegressionModel") @@ -54,12 +55,13 @@ def __init__( merge_all_categorical_dimensions=True, drop='first' ) + self.input_dimension_names = self.input_space.dimension_names - self.input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions] + self._projected_input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions] self.continuous_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.target.dimensions if isinstance(dimension, ContinuousDimension)] self.target_dimension_names = [dimension.name for dimension in self.output_space.dimensions] - self.logger.debug(f"Input dimensions: {str(self.input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.") + self.logger.debug(f"Input dimensions: {str(self._projected_input_dimension_names)}; Target dimensions: {str(self.target_dimension_names)}.") assert len(self.target_dimension_names) == 1, "For now (and perhaps forever) we only support single target per Lasso model." @@ -89,6 +91,15 @@ def __init__( self.partial_hat_matrix_ = 0 self.regressor_standard_error_ = 0 + # When LassoCV is used as part of RERF, it cannot reasonably compute the upper and lower bounds on its input space dimensions, + # as they are a polynomial combination of inputs to RERF. Thus, it approximates them with the empirical min and max. + # These approximations are biased: the lower bound is too large, the upper bound is too small. + # Consequently, during scoring, LassoCV is likely to see input outside of these bounds, but we still want + # LassoCV to produce predictions for those points. So we introduce a little hack: whenever LassoCV is instantiated as part of RERF, + # it should skip input filtering on predict. This field, controls this behavior. 
+ self.skip_input_filtering_on_predict = False + + @property def trained(self): return self._trained @@ -120,7 +131,7 @@ def should_fit(self, num_samples): :param num_samples: :return: """ - num_input_dims = len(self.input_dimension_names) + num_input_dims = len(self._projected_input_dimension_names) model_config = self.model_config if not self.trained: return num_samples > model_config.min_num_samples_per_input_dimension_to_fit * num_input_dims @@ -135,19 +146,28 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration x_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False) y = target_values_pandas_frame[self.target_dimension_names].to_numpy() design_matrix = self._transform_x(x_df) + + # ensure num_cross_validations < num_samples; and reinstantiate LassoCV regressor + if design_matrix.shape[0] < self.model_config.num_cross_validations: + self.lasso_model_kwargs['cv'] = design_matrix.shape[0] - 1 + self._regressor = LassoCV(**self.lasso_model_kwargs) + self._regressor.fit(design_matrix, y) self._trained = True self.last_refit_iteration_number = iteration_number # retain inverse(x.T * x) to use for confidence intervals on predicted values condition_number = np.linalg.cond(design_matrix) - if condition_number > 10.0 ** 10: + self.logger.info( + f'LassoCV: design_matrix condition number: {condition_number}' + ) + if condition_number > 10.0 ** 4: # add small noise to x to remove singularity, # expect prediction confidence to be reduced (wider intervals) by doing this self.logger.info( - f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^10." + f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10**4." ) - design_matrix += np.random.normal(0, 10.0**-4, size=design_matrix.shape) + design_matrix += np.random.normal(0, 10.0**-2, size=design_matrix.shape) condition_number = np.linalg.cond(design_matrix) self.logger.info( f"Resulting condition number {condition_number}." @@ -175,6 +195,11 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True): valid_rows_index = None features_df = None if self.trained: + if not self.skip_input_filtering_on_predict: + feature_values_pandas_frame = self.input_space.filter_out_invalid_rows( + original_dataframe=feature_values_pandas_frame, + exclude_extra_columns=False + ) features_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False) valid_rows_index = features_df.index @@ -213,7 +238,7 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True): def _transform_x(self, x_df: DataFrame): # confirm feature_values_pandas_frame contains all expected columns # if any are missing, impute NaN values - missing_column_names = set.difference(set(self.input_dimension_names), set(x_df.columns.values)) + missing_column_names = set.difference(set(self._projected_input_dimension_names), set(x_df.columns.values)) for missing_column_name in missing_column_names: x_df[missing_column_name] = np.NaN diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveLassoCrossValidated.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveLassoCrossValidated.py new file mode 100644 index 0000000000..2d24e8eca4 --- /dev/null +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveLassoCrossValidated.py @@ -0,0 +1,48 @@ +# +# Copyright (c) Microsoft Corporation. 
+# Licensed under the MIT License. +# +import logging +from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store +from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel +from mlos.Optimizers.RegressionModels.NaiveMultiObjectiveRegressionModel import NaiveMultiObjectiveRegressionModel +from mlos.Spaces import Hypergrid, Point, SimpleHypergrid + + +class MultiObjectiveLassoCrossValidated(NaiveMultiObjectiveRegressionModel): + """Maintains multiple LassoCrossValidatedRegressionModels each predicting a different objective. + + All single-objective models are configured according to model_config. + + """ + def __init__( + self, + model_config: Point, + input_space: Hypergrid, + output_space: Hypergrid, + logger: logging.Logger = None + ): + NaiveMultiObjectiveRegressionModel.__init__( + self, + model_type=LassoCrossValidatedRegressionModel, + model_config=model_config, + input_space=input_space, + output_space=output_space, + logger=logger + ) + + + # We just need to assert that the model config belongs in lasso_cross_validated_config_store.parameter_space. + # A more elaborate solution might be needed down the road, but for now this simple solution should suffice. + # + assert model_config in lasso_cross_validated_config_store.parameter_space + + for output_dimension in output_space.dimensions: + print(f'output_dimension.name: {output_dimension.name}') + lasso_model = LassoCrossValidatedRegressionModel( + model_config=model_config, + input_space=input_space, + output_space=SimpleHypergrid(name=f"{output_dimension.name}_objective", dimensions=[output_dimension]), + logger=self.logger + ) + self._regressors_by_objective_name[output_dimension.name] = lasso_model diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveRegressionEnhancedRandomForest.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveRegressionEnhancedRandomForest.py new file mode 100644 index 0000000000..fb187ce1ac --- /dev/null +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/MultiObjectiveRegressionEnhancedRandomForest.py @@ -0,0 +1,49 @@ +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +import logging +from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestConfigStore import regression_enhanced_random_forest_config_store +from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestModel import RegressionEnhancedRandomForestRegressionModel +from mlos.Optimizers.RegressionModels.NaiveMultiObjectiveRegressionModel import NaiveMultiObjectiveRegressionModel +from mlos.Spaces import Hypergrid, Point, SimpleHypergrid + + +class MultiObjectiveRegressionEnhancedRandomForest(NaiveMultiObjectiveRegressionModel): + """Maintains multiple RegressionEnhancedRandomForestRegressionModel each predicting a different objective. + + All single-objective models are configured according to model_config. + + """ + def __init__( + self, + model_config: Point, + input_space: Hypergrid, + output_space: Hypergrid, + logger: logging.Logger = None + ): + NaiveMultiObjectiveRegressionModel.__init__( + self, + model_type=RegressionEnhancedRandomForestRegressionModel, + model_config=model_config, + input_space=input_space, + output_space=output_space, + logger=logger + ) + + + # We just need to assert that the model config belongs in regression_enhanced_random_forest_config_store.parameter_space. 
+ # A more elaborate solution might be needed down the road, but for now this simple solution should suffice. + # + assert model_config in regression_enhanced_random_forest_config_store.parameter_space + + for output_dimension in output_space.dimensions: + # We copy the model_config (rather than share across objectives below because the perform_initial_random_forest_hyper_parameter_search + # is set to False after the initial fit() call so that subsequent .fit() calls don't pay the cost penalty for this embedded hyper parameter search + rerf_model = RegressionEnhancedRandomForestRegressionModel( + model_config=model_config.copy(), + input_space=input_space, + output_space=SimpleHypergrid(name=f"{output_dimension.name}_objective", dimensions=[output_dimension]), + logger=self.logger + ) + self._regressors_by_objective_name[output_dimension.name] = rerf_model diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestConfigStore.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestConfigStore.py index c13c05d42a..8b2bd92302 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestConfigStore.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestConfigStore.py @@ -4,7 +4,7 @@ # from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel, lasso_cross_validated_config_store from mlos.Optimizers.RegressionModels.SklearnRandomForestRegressionModelConfig import SklearnRandomForestRegressionModelConfig -from mlos.Spaces import SimpleHypergrid, ContinuousDimension, DiscreteDimension, CategoricalDimension, Point +from mlos.Spaces import SimpleHypergrid, DiscreteDimension, CategoricalDimension, Point from mlos.Spaces.Configs.ComponentConfigStore import ComponentConfigStore # TODO : Add back the RidgeRegressionModel boosting_root_model option after adding new RidgeCrossValidatedRegressionModel @@ -19,9 +19,8 @@ values=[SklearnRandomForestRegressionModelConfig.__name__]), CategoricalDimension(name="boosting_root_model_name", values=[LassoCrossValidatedRegressionModel.__name__]), - ContinuousDimension(name="min_abs_root_model_coef", min=0, max=2 ** 10), - CategoricalDimension(name="perform_initial_root_model_hyper_parameter_search", values=[False, True]), - CategoricalDimension(name="perform_initial_random_forest_hyper_parameter_search", values=[False, True]) + CategoricalDimension(name="perform_initial_random_forest_hyper_parameter_search", + values=[True, False]) ] ).join( subgrid=lasso_cross_validated_config_store.parameter_space, @@ -36,11 +35,9 @@ max_basis_function_degree=2, residual_model_name=SklearnRandomForestRegressionModelConfig.__name__, boosting_root_model_name=LassoCrossValidatedRegressionModel.__name__, - min_abs_root_model_coef=0.01, lasso_regression_model_config=lasso_cross_validated_config_store.default, sklearn_random_forest_regression_model_config=SklearnRandomForestRegressionModelConfig.DEFAULT, - perform_initial_root_model_hyper_parameter_search=True, - perform_initial_random_forest_hyper_parameter_search=True + perform_initial_random_forest_hyper_parameter_search=False ), description="Regression-enhanced random forest model hyper-parameters. 
" "Model inspired by : https://arxiv.org/pdf/1904.10416.pdf" diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py index 6ecdd59ce2..33cd60e109 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py @@ -2,13 +2,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # +import logging from typing import List import numpy as np import pandas as pd from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV -from sklearn.preprocessing import StandardScaler from mlos.Logger import create_logger from mlos.Optimizers.RegressionModels.RegressionModel import RegressionModel @@ -25,13 +25,9 @@ class RegressionEnhancedRandomForestRegressionModel(RegressionModel): """ Regression-Enhanced RandomForest Regression model - See https://arxiv.org/pdf/1904.10416.pdf for inspiration. See following PRs for exploration notes/observations: - 1. https://msdata.visualstudio.com/Database%20Systems/_git/MLOS/pullrequest/377907 - - Goals/Motivations: 1. RandomForest models are not well suited for extrapolation. As shown in the publication referenced above the RERF Lasso model tries to correct this by using the polynomial basis Lasso regression as the @@ -44,7 +40,6 @@ class RegressionEnhancedRandomForestRegressionModel(RegressionModel): 4. The RandomForest model in RERF fits the Lasso model's residuals, hence any overall regression pattern (polynomial includes linear) within a decision tree's leaf data may have been eliminated by the Lasso fit. 
- """ _PREDICTOR_OUTPUT_COLUMNS = [ @@ -60,7 +55,7 @@ def __init__( model_config: Point, input_space: Hypergrid, output_space: Hypergrid, - logger=None + logger: logging.Logger = None ): if logger is None: logger = create_logger("RegressionEnhancedRandomForestRegressionModel") @@ -76,6 +71,16 @@ def __init__( ) self.model_config = model_config + self.model_config.perform_initial_root_model_hyper_parameter_search = True + + # enforce model_config constraints (needed by sklearn regression model classes) + # For .lasso_regression_model_config.fit_intercept, the intercept term in added in the design_matrix construction + # For .lasso_regression_model_config.normalize, since the random forest would also need the scaled features, + # scaling would have to be managed by ReRF directly + model_config.lasso_regression_model_config.fit_intercept = False + model_config.lasso_regression_model_config.normalize = False + if model_config.sklearn_random_forest_regression_model_config.oob_score: + model_config.sklearn_random_forest_regression_model_config.bootstrap = True # Explode continuous dimensions to polynomial features up to model config specified monomial degree # am using include_bias to produce constant term (all 1s) column to simplify one hot encoding logic @@ -91,9 +96,9 @@ def __init__( merge_all_categorical_dimensions=True, drop='first' ) - self.input_space = self.one_hot_encoder_adapter self.input_dimension_names = [dimension.name for dimension in self.input_space.dimensions] + self._projected_input_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.dimensions] self.continuous_dimension_names = [dimension.name for dimension in self.one_hot_encoder_adapter.target.dimensions if isinstance(dimension, ContinuousDimension)] self.output_dimension_names = [dimension.name for dimension in self.output_space.dimensions] @@ -115,7 +120,6 @@ def __init__( self.polynomial_features_powers_ = None self.categorical_zero_cols_idx_to_delete_ = None - self.scaler_ = StandardScaler() self._trained = False self.last_refit_iteration_number = None @@ -128,14 +132,23 @@ def trained(self) -> bool: def num_observations_used_to_fit(self): return self.last_refit_iteration_number + @property + def num_model_coefficients(self): + num_continuous_features = len(self.continuous_dimension_names) + num_dummy_vars = len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) + + return num_continuous_features * (num_dummy_vars + 1) + def should_fit( self, num_samples: int ) -> bool: - root_base_model_should_fit = self.base_regressor_.should_fit(num_samples=num_samples) - # TODO : determine min sample needed to fit based on model configs - random_forest_should_fit = True - return root_base_model_should_fit and random_forest_should_fit + # since polynomial basis functions decrease the degrees of freedom (TODO: add reference), + # and prediction degrees of freedom = sample size - num coef - 1 + # need sufficiently many samples to exceed the number of coefficients + dof = num_samples - self.num_model_coefficients - 1 + + return dof > 0 @trace() def fit( @@ -145,13 +158,12 @@ def fit( iteration_number: int = 0 ): """ Fits the RegressionEnhancedRandomForest - :param feature_values_pandas_frame: :param target_values_pandas_frame: :param iteration_number: :return: """ - features_df = self.input_space.project_dataframe(feature_values_pandas_frame, in_place=False) + features_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False) # produce design_matrix 
(incorporating polynomial basis function expansion + one hot encoding) (model_design_matrix, new_column_names) = self._transform_x(features_df) @@ -179,7 +191,7 @@ def fit( # add small noise to fit_x to remove singularity, # expect prediction confidence to be reduced (wider intervals) by doing this self.logger.info( - f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10^10." + f"Adding noise to design matrix used for prediction confidence due to condition number {condition_number} > 10 ** 10." ) model_design_matrix += np.random.normal(0, 10.0 ** -4, size=model_design_matrix.shape) condition_number = np.linalg.cond(model_design_matrix) @@ -191,9 +203,6 @@ def fit( # retain standard error from base model (used for prediction confidence intervals) residual_sum_of_squares = np.sum(y_residuals ** 2) - total_sum_of_squares = ((y - y.mean()) ** 2).sum() - unexplained_variance = residual_sum_of_squares / total_sum_of_squares - print(f'RERF::LassoOnly R2: {1.0 - unexplained_variance}') dof = model_design_matrix.shape[0] - (len(self.base_regressor_.coef_) + 1) # +1 for intercept self.base_regressor_standard_error_ = residual_sum_of_squares / float(dof) @@ -217,9 +226,6 @@ def _fit_root_regression( y: pd.DataFrame, iteration_number: int ): - # Assumes x has already been transformed - self.detected_feature_indices_ = [] - # TODO : Add back RidgeCV option after creating RidgeCrossValidatedRegressionModel assert \ self.model_config.boosting_root_model_name in [ @@ -228,6 +234,8 @@ def _fit_root_regression( # Since the RERF transform_x created the proper design_matrix, this serves as the input space for the root regression model. # Hence the code below creates a (temporary) hypergrid reflecting the design_matrix. 
+ # This is less than ideal solution, but deriving min and max of polynomial terms (given feature column degrees) is non-trivial + # TODO: set bounds on the polynomial terms correctly and eliminate the hack forcing the base_regressor to skip filtering invalid features design_matrix_hypergrid = SimpleHypergrid( name='RegressionEnhanceRandomForest_design_matrix', dimensions=None @@ -248,6 +256,8 @@ def _fit_root_regression( input_space=design_matrix_hypergrid, output_space=self.output_space ) + # skips filtering to valid features in the base_regressor since the valid range of design_matrix column values is incorrect above + self.base_regressor_.skip_input_filtering_on_predict = True self.base_regressor_.fit( x, @@ -259,17 +269,36 @@ def _fit_root_regression( def _fit_random_forest_regression( self, - x_star, + x, y_residuals ): # Assumes x has already been transformed and the reduced feature space and residuals relative to base model # are passed to the random forest regression if self.model_config.perform_initial_random_forest_hyper_parameter_search: - self._execute_grid_search_for_random_forest_regressor_model(x_star, y_residuals) + self._execute_grid_search_for_random_forest_regressor_model(x, y_residuals) else: - self.random_forest_regressor_ = RandomForestRegressor(**self.random_forest_kwargs) - self.random_forest_regressor_.fit(x_star, y_residuals) + #self.random_forest_regressor_ = RandomForestRegressor(**self.random_forest_kwargs) + model_config = self.model_config.sklearn_random_forest_regression_model_config + + self.random_forest_regressor_ = RandomForestRegressor( + n_estimators=model_config.n_estimators, + criterion=model_config.criterion, + max_depth=model_config.max_depth if model_config.max_depth > 0 else None, + min_samples_split=model_config.min_samples_split, + min_samples_leaf=model_config.min_samples_leaf, + min_weight_fraction_leaf=model_config.min_weight_fraction_leaf, + max_features=model_config.max_features, + max_leaf_nodes=model_config.max_leaf_nodes if model_config.max_leaf_nodes > 0 else None, + min_impurity_decrease=model_config.min_impurity_decrease, + bootstrap=model_config.bootstrap, + oob_score=model_config.oob_score, + n_jobs=model_config.n_jobs, + warm_start=model_config.warm_start, + ccp_alpha=model_config.ccp_alpha, + max_samples=model_config.max_samples if model_config.max_samples > 0 else None + ) + self.random_forest_regressor_.fit(x, y_residuals) self.random_forest_kwargs = self.random_forest_regressor_.get_params() @@ -277,7 +306,7 @@ def _fit_random_forest_regression( def _execute_grid_search_for_random_forest_regressor_model( self, - x_filtered_to_detected_features, + x, y_residuals ): model_config = self.model_config.sklearn_random_forest_regression_model_config @@ -299,7 +328,7 @@ def _execute_grid_search_for_random_forest_regressor_model( max_samples=model_config.max_samples if model_config.max_samples > 0 else None ) - num_features = x_filtered_to_detected_features.shape[1] + num_features = x.shape[1] max_feature_param = [1] p_floor_3 = round(num_features / 3) if p_floor_3 > 0: @@ -312,10 +341,11 @@ def _execute_grid_search_for_random_forest_regressor_model( } self.logger.info(f"Performing Random Forest Grid Search CV") rf_gscv = GridSearchCV(self.random_forest_regressor_, rf_params) - rf_gscv.fit(x_filtered_to_detected_features, y_residuals) + rf_gscv.fit(x, y_residuals) # retrieve best random forest model and hyper parameters self.random_forest_regressor_ = rf_gscv.best_estimator_ + self.random_forest_kwargs = rf_gscv.best_params_ # only 
perform hyper-parameter search on first fit self.model_config.perform_initial_random_forest_hyper_parameter_search = False @@ -334,14 +364,16 @@ def predict( dof_col = Prediction.LegalColumnNames.PREDICTED_VALUE_DEGREES_OF_FREEDOM.value valid_rows_index = None - model_design_matrix: np.ndarray = None - model_design_matrix_dataframe: pd.DataFrame = None + model_design_matrix: np.ndarray = np.array([]) + model_design_matrix_dataframe: pd.DataFrame = pd.DataFrame() if self.trained: + feature_values_pandas_frame = self.input_space.filter_out_invalid_rows(original_dataframe=feature_values_pandas_frame, exclude_extra_columns=False) + if self.x_is_design_matrix: model_design_matrix = feature_values_pandas_frame.to_numpy() model_design_matrix_dataframe = feature_values_pandas_frame else: - features_df = self.input_space.project_dataframe(feature_values_pandas_frame, in_place=False) + features_df = self.one_hot_encoder_adapter.project_dataframe(feature_values_pandas_frame, in_place=False) (model_design_matrix, new_column_names) = self._transform_x(features_df) model_design_matrix_dataframe = pd.DataFrame(model_design_matrix, columns=new_column_names) valid_rows_index = feature_values_pandas_frame.index @@ -384,7 +416,7 @@ def _transform_x( ) -> (np.ndarray, List[str]): # confirm feature_values_pandas_frame contains all expected columns # if any are missing, impute NaN values - missing_column_names = set.difference(set(self.input_dimension_names), set(x_df.columns.values)) + missing_column_names = set.difference(set(self._projected_input_dimension_names), set(x_df.columns.values)) for missing_column_name in missing_column_names: x_df[missing_column_name] = np.NaN diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnRandomForestRegressionModelConfig.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnRandomForestRegressionModelConfig.py index 6c8b2ab054..66c8c558d2 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnRandomForestRegressionModelConfig.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnRandomForestRegressionModelConfig.py @@ -36,18 +36,18 @@ class Criterion(Enum): DiscreteDimension(name="n_estimators", min=1, max=2 ** 10), CategoricalDimension(name="criterion", values=[criterion.value for criterion in Criterion]), DiscreteDimension(name="max_depth", min=0, max=2 ** 10), - ContinuousDimension(name="min_samples_split", min=2, max=2 ** 10), - ContinuousDimension(name="min_samples_leaf", min=1, max=2 ** 10), + ContinuousDimension(name="min_samples_split", min=0, max=1), + ContinuousDimension(name="min_samples_leaf", min=0, max=0.5), ContinuousDimension(name="min_weight_fraction_leaf", min=0, max=0.5), CategoricalDimension(name="max_features", values=[max_feature.value for max_feature in MaxFeatures]), DiscreteDimension(name="max_leaf_nodes", min=0, max=2 ** 10), ContinuousDimension(name="min_impurity_decrease", min=0, max=2 ** 10), CategoricalDimension(name="bootstrap", values=[False, True]), CategoricalDimension(name="oob_score", values=[False, True]), - DiscreteDimension(name="n_jobs", min=1, max=2 ** 10), + DiscreteDimension(name="n_jobs", min=1, max=10), CategoricalDimension(name="warm_start", values=[False, True]), ContinuousDimension(name="ccp_alpha", min=0, max=2 ** 10), - ContinuousDimension(name="max_samples", min=0, max=2 ** 10) + ContinuousDimension(name="max_samples", min=0.01, max=0.99) ] ) @@ -55,8 +55,8 @@ class Criterion(Enum): n_estimators=100, criterion=Criterion.MSE.value, max_depth=0, # overloading 0 as 
None to deal with sklearn param type interpretation - min_samples_split=2, - min_samples_leaf=1, + min_samples_split=0.2, + min_samples_leaf=0.1, min_weight_fraction_leaf=0.0, max_features=MaxFeatures.AUTO.value, max_leaf_nodes=0, # overloading 0 as None to deal with sklearn param type interpretation @@ -66,7 +66,7 @@ class Criterion(Enum): n_jobs=1, warm_start=False, ccp_alpha=0, - max_samples=0 + max_samples=0.01 ) @classmethod diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestLassoCrossValidatedRegressionModel.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestLassoCrossValidatedRegressionModel.py index 96fef34053..086f8c5ed8 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestLassoCrossValidatedRegressionModel.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestLassoCrossValidatedRegressionModel.py @@ -25,7 +25,6 @@ def setup_class(cls): def setup_method(self, method): self.model_config = lasso_cross_validated_config_store.default - self.max_basis_function_degree = 2 self.test_case_globals = { '2d_X_deg2_poly_input_space': SimpleHypergrid( @@ -192,11 +191,14 @@ def test_lasso_hierarchical_categorical_predictions(self): include_bias=True, interaction_only=False ) + lasso_cross_validated_model = LassoCrossValidatedRegressionModel( model_config=self.model_config, input_space=polynomial_features_adapter, output_space=objective_function.output_space ) + # since the model input_space stacked the polynomial basis function on in the original input space, we can skip validating input features + lasso_cross_validated_model.skip_input_filtering_on_predict = True # fit model with same degree as true y # The input space consists of 3 2-d domains 200 x 200 units. Hence random samples smaller than a certain size will produce too few points to @@ -212,6 +214,7 @@ def test_lasso_hierarchical_categorical_predictions(self): x_test_df = objective_function.parameter_space.random_dataframe(num_samples=num_test_x) y_test = objective_function.evaluate_dataframe(x_test_df).to_numpy().reshape(-1) predictions = lasso_cross_validated_model.predict(x_test_df) + pred_df = predictions.get_dataframe() predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value predicted_y = pred_df[predicted_value_col].to_numpy() diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveLassoCrossValidated.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveLassoCrossValidated.py new file mode 100644 index 0000000000..22d3dcb1f3 --- /dev/null +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveLassoCrossValidated.py @@ -0,0 +1,74 @@ +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# +import pytest + +import mlos.global_values +from mlos.OptimizerEvaluationTools.ObjectiveFunctionFactory import ObjectiveFunctionFactory, objective_function_config_store +from mlos.Optimizers.RegressionModels.GoodnessOfFitMetrics import DataSetType +from mlos.Optimizers.RegressionModels.LassoCrossValidatedConfigStore import lasso_cross_validated_config_store +from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel +from mlos.Optimizers.RegressionModels.MultiObjectiveLassoCrossValidated import MultiObjectiveLassoCrossValidated +from mlos.Logger import create_logger + +class TestMultiObjectiveLassoCrossValidated: + + @classmethod + def setup_class(cls) -> None: + mlos.global_values.declare_singletons() + cls.logger = create_logger("TestMultiObjectiveLassoCrossValidated") + + @pytest.mark.parametrize('objective_function_config_name', ["2d_hypersphere_minimize_some", "10d_hypersphere_minimize_some", "5_mutually_exclusive_polynomials"]) + def test_default_config(self, objective_function_config_name): + objective_function_config = objective_function_config_store.get_config_by_name(objective_function_config_name) + objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config) + + lasso_model_config = lasso_cross_validated_config_store.default + multi_objective_rf = MultiObjectiveLassoCrossValidated( + model_config=lasso_model_config, + input_space=objective_function.parameter_space, + output_space=objective_function.output_space, + logger=self.logger + ) + + if objective_function_config_name == '2d_hypersphere_minimize_some': + num_training_samples = 25 + num_testing_samples = 10 + elif objective_function_config_name == '10d_hypersphere_minimize_some': + num_training_samples = 50 + num_testing_samples = 10 + elif objective_function_config_name == '5_mutually_exclusive_polynomials': + num_training_samples = 100 + num_testing_samples = 50 + else: + assert False + train_params_df = objective_function.parameter_space.random_dataframe(num_samples=num_training_samples) + train_objectives_df = objective_function.evaluate_dataframe(train_params_df) + + test_params_df = objective_function.parameter_space.random_dataframe(num_samples=num_testing_samples) + test_objectives_df = objective_function.evaluate_dataframe(test_params_df) + + multi_objective_rf.fit(features_df=train_params_df, targets_df=train_objectives_df, iteration_number=num_training_samples) + multi_objective_predictions = multi_objective_rf.predict(features_df=train_params_df, include_only_valid_rows=True) + + # TRAINING DATA + # + print("------------------------------------------------------------------------------------") + print("--------------------------------------- TRAIN --------------------------------------") + print("------------------------------------------------------------------------------------") + training_gof = multi_objective_rf.compute_goodness_of_fit(features_df=train_params_df, targets_df=train_objectives_df, data_set_type=DataSetType.TRAIN) + for objective_name in objective_function.output_space.dimension_names: + print("------------------------------------------------------------------------------------") + print(objective_name) + print(training_gof[objective_name].to_json(indent=2)) + + # TESTING DATA + print("------------------------------------------------------------------------------------") + print("--------------------------------------- TEST ---------------------------------------") + 
print("------------------------------------------------------------------------------------") + testing_gof = multi_objective_rf.compute_goodness_of_fit(features_df=test_params_df, targets_df=test_objectives_df, data_set_type=DataSetType.TEST_KNOWN_RANDOM) + for objective_name in objective_function.output_space.dimension_names: + print("------------------------------------------------------------------------------------") + print(objective_name) + print(testing_gof[objective_name].to_json(indent=2)) diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveRegressionEnhancedRandomForest.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveRegressionEnhancedRandomForest.py new file mode 100644 index 0000000000..c3e9e987b0 --- /dev/null +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestMultiObjectiveRegressionEnhancedRandomForest.py @@ -0,0 +1,73 @@ +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +import pytest + +import mlos.global_values +from mlos.OptimizerEvaluationTools.ObjectiveFunctionFactory import ObjectiveFunctionFactory, objective_function_config_store +from mlos.Optimizers.RegressionModels.GoodnessOfFitMetrics import DataSetType +from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestConfigStore import regression_enhanced_random_forest_config_store +from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest +from mlos.Logger import create_logger + +class TestMultiObjectiveRegressionEnhancedRandomForest: + + @classmethod + def setup_class(cls) -> None: + mlos.global_values.declare_singletons() + cls.logger = create_logger("TestMultiObjectiveRegressionEnhancedRandomForest") + + @pytest.mark.parametrize('objective_function_config_name', ["2d_hypersphere_minimize_some", "10d_hypersphere_minimize_some", "5_mutually_exclusive_polynomials"]) + def test_default_config(self, objective_function_config_name): + objective_function_config = objective_function_config_store.get_config_by_name(objective_function_config_name) + objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config) + + rerf_model_config = regression_enhanced_random_forest_config_store.default + multi_objective_rf = MultiObjectiveRegressionEnhancedRandomForest( + model_config=rerf_model_config, + input_space=objective_function.parameter_space, + output_space=objective_function.output_space, + logger=self.logger + ) + + if objective_function_config_name == '2d_hypersphere_minimize_some': + num_training_samples = 25 + num_testing_samples = 10 + elif objective_function_config_name == '10d_hypersphere_minimize_some': + num_training_samples = 50 + num_testing_samples = 10 + elif objective_function_config_name == '5_mutually_exclusive_polynomials': + num_training_samples = 100 + num_testing_samples = 50 + else: + assert False + train_params_df = objective_function.parameter_space.random_dataframe(num_samples=num_training_samples) + train_objectives_df = objective_function.evaluate_dataframe(train_params_df) + + test_params_df = objective_function.parameter_space.random_dataframe(num_samples=num_testing_samples) + test_objectives_df = objective_function.evaluate_dataframe(test_params_df) + + multi_objective_rf.fit(features_df=train_params_df, targets_df=train_objectives_df, iteration_number=num_training_samples) + multi_objective_predictions = 
multi_objective_rf.predict(features_df=train_params_df, include_only_valid_rows=True) + + # TRAINING DATA + # + print("------------------------------------------------------------------------------------") + print("--------------------------------------- TRAIN --------------------------------------") + print("------------------------------------------------------------------------------------") + training_gof = multi_objective_rf.compute_goodness_of_fit(features_df=train_params_df, targets_df=train_objectives_df, data_set_type=DataSetType.TRAIN) + for objective_name in objective_function.output_space.dimension_names: + print("------------------------------------------------------------------------------------") + print(objective_name) + print(training_gof[objective_name].to_json(indent=2)) + + # TESTING DATA + print("------------------------------------------------------------------------------------") + print("--------------------------------------- TEST ---------------------------------------") + print("------------------------------------------------------------------------------------") + testing_gof = multi_objective_rf.compute_goodness_of_fit(features_df=test_params_df, targets_df=test_objectives_df, data_set_type=DataSetType.TEST_KNOWN_RANDOM) + for objective_name in objective_function.output_space.dimension_names: + print("------------------------------------------------------------------------------------") + print(objective_name) + print(testing_gof[objective_name].to_json(indent=2)) diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py index 28ad1e333f..07fa551161 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py @@ -59,21 +59,16 @@ def setup_method(self, method): ) } - @staticmethod - def n_choose_k(n, k): - return math.factorial(n) / math.factorial(k) / math.factorial(n - k) - @staticmethod def get_simple_quadratic_coefficients(): return np.array([1, -3, -4, -0.5, 0.0, -2.0]) - @staticmethod - def generate_points_simple_quadratic(num_points, num_features): - x = np.random.uniform(0, 5, [num_points, num_features]) - x_df = pd.DataFrame(x, columns=['x1', 'x2']) + def generate_points_simple_quadratic(self, num_points, num_features): + x_df = self.test_case_globals['2d_X_input_space'].random_dataframe(num_samples=num_points) + x = x_df.to_numpy() # y = 1 -3*X_1 -4*X_2 -0.5*X_1**2 -2*X_2**2 - y_coef_true = TestRegressionEnhancedRandomForestRegressionModel.get_simple_quadratic_coefficients() + y_coef_true = self.get_simple_quadratic_coefficients() poly_reg = PolynomialFeatures(degree=2) poly_terms_x = poly_reg.fit_transform(x) y = np.matmul(poly_terms_x, y_coef_true) @@ -121,7 +116,7 @@ def test_rerf_predictions(self): output_space=self.test_case_globals['output_space'] ) - num_train_points = 50 + num_train_points = 51 x_train_df, y_train_df = self.generate_points_simple_quadratic(num_train_points, len(self.test_case_globals['2d_X_input_space'].dimensions)) rerf.fit(x_train_df, y_train_df) diff --git a/source/Mlos.Python/mlos/Optimizers/unit_tests/TestBayesianOptimizer.py b/source/Mlos.Python/mlos/Optimizers/unit_tests/TestBayesianOptimizer.py index 76ca902c05..00b8e2c24f 100644 --- 
a/source/Mlos.Python/mlos/Optimizers/unit_tests/TestBayesianOptimizer.py +++ b/source/Mlos.Python/mlos/Optimizers/unit_tests/TestBayesianOptimizer.py @@ -28,12 +28,14 @@ from mlos.Optimizers.BayesianOptimizerFactory import BayesianOptimizerFactory from mlos.Optimizers.ExperimentDesigner.UtilityFunctionOptimizers.GlowWormSwarmOptimizer import GlowWormSwarmOptimizer from mlos.Optimizers.ExperimentDesigner.UtilityFunctionOptimizers.RandomNearIncumbentOptimizer import RandomNearIncumbentOptimizer -from mlos.Optimizers.ExperimentDesigner.UtilityFunctionOptimizers.RandomSearchOptimizer import RandomSearchOptimizer +from mlos.Optimizers.ExperimentDesigner.UtilityFunctionOptimizers.RandomSearchOptimizer import RandomSearchOptimizer, random_search_optimizer_config_store from mlos.Optimizers.OptimizationProblem import OptimizationProblem, Objective from mlos.Optimizers.OptimizerBase import OptimizerBase from mlos.Optimizers.OptimumDefinition import OptimumDefinition from mlos.Optimizers.RegressionModels.HomogeneousRandomForestRegressionModel import HomogeneousRandomForestRegressionModel from mlos.Optimizers.RegressionModels.MultiObjectiveHomogeneousRandomForest import MultiObjectiveHomogeneousRandomForest +from mlos.Optimizers.RegressionModels.RegressionEnhancedRandomForestModel import RegressionEnhancedRandomForestRegressionModel +from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest from mlos.Optimizers.RegressionModels.Prediction import Prediction from mlos.Spaces import Point, SimpleHypergrid, ContinuousDimension from mlos.Tracer import Tracer, trace, traced @@ -347,7 +349,6 @@ def test_bayesian_optimizer_on_simple_2d_quadratic_function_cold_start(self, use @pytest.mark.parametrize("restart_num", [i for i in range(10)]) @pytest.mark.parametrize("use_remote_optimizer", [False]) def test_hierarchical_quadratic_cold_start_random_configs(self, restart_num, use_remote_optimizer): - objective_function_config = objective_function_config_store.get_config_by_name('three_level_quadratic') objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config=objective_function_config) @@ -383,6 +384,17 @@ def test_hierarchical_quadratic_cold_start_random_configs(self, restart_num, use decision_tree_config.min_samples_to_fit = 10 decision_tree_config.n_new_samples_before_refit = 10 + if optimizer_config.surrogate_model_implementation == MultiObjectiveRegressionEnhancedRandomForest.__name__: + optimizer_config.min_samples_required_for_guided_design_of_experiments = 25 + rerf_model_config = optimizer_config.regression_enhanced_random_forest_regression_model_config + rerf_model_config.max_basis_function_degree = min(rerf_model_config.max_basis_function_degree, 2) + # increased polynomial degree requires more data to estimate model parameters (poly term coefficients) + optimizer_config.min_samples_required_for_guided_design_of_experiments += 25 * (rerf_model_config.max_basis_function_degree - 1) + rf_model_config = rerf_model_config.sklearn_random_forest_regression_model_config + rf_model_config.perform_initial_random_forest_hyper_parameter_search = False + rf_model_config.max_depth = min(rf_model_config.max_depth, 10) + rf_model_config.n_jobs = min(rf_model_config.n_jobs, 4) + if optimizer_config.experiment_designer_config.numeric_optimizer_implementation == GlowWormSwarmOptimizer.__name__: optimizer_config.experiment_designer_config.glow_worm_swarm_optimizer_config.num_iterations = 5 @@ -399,7 +411,6 
@@ def test_hierarchical_quadratic_cold_start_random_configs(self, restart_num, use print(f"[Restart: {restart_num}] Creating a BayesianOptimimizer with the following config: ") print(optimizer_config.to_json(indent=2)) - if not use_remote_optimizer: bayesian_optimizer = self.bayesian_optimizer_factory.create_local_optimizer( optimization_problem=optimization_problem, @@ -856,7 +867,9 @@ def validate_optima(self, optimizer: OptimizerBase): if degrees_of_freedom == 0: assert ucb_90_ci_optimum.upper_confidence_bound <= ucb_95_ci_optimum.upper_confidence_bound <= ucb_99_ci_optimum.upper_confidence_bound else: - print(predicted_optimum.predicted_value, ucb_90_ci_optimum.upper_confidence_bound, ucb_95_ci_optimum.upper_confidence_bound, ucb_99_ci_optimum.upper_confidence_bound) + print(f'upper confidence intervals not nested as expected: \n\tpredicted_value: {predicted_optimum.predicted_value}\n' + f'\t 90th, 95th, and 99th upper confidence bounds: {ucb_90_ci_optimum.upper_confidence_bound}, {ucb_95_ci_optimum.upper_confidence_bound}, {ucb_99_ci_optimum.upper_confidence_bound}') + print(f'degrees of freedom: {optimum_predicted_value_prediction_df[Prediction.LegalColumnNames.PREDICTED_VALUE_DEGREES_OF_FREEDOM.value]}') assert False diff --git a/source/Mlos.Python/mlos/unit_tests/TestBayesianOptimizerGrpcClient.py b/source/Mlos.Python/mlos/unit_tests/TestBayesianOptimizerGrpcClient.py index a2ed423023..72aedc6f77 100644 --- a/source/Mlos.Python/mlos/unit_tests/TestBayesianOptimizerGrpcClient.py +++ b/source/Mlos.Python/mlos/unit_tests/TestBayesianOptimizerGrpcClient.py @@ -22,6 +22,7 @@ from mlos.Optimizers.BayesianOptimizerFactory import BayesianOptimizerFactory from mlos.Optimizers.OptimizationProblem import OptimizationProblem, Objective from mlos.Spaces import CategoricalDimension, ContinuousDimension, DiscreteDimension +from mlos.Optimizers.RegressionModels.MultiObjectiveRegressionEnhancedRandomForest import MultiObjectiveRegressionEnhancedRandomForest class TestBayesianOptimizerGrpcClient: @@ -161,11 +162,21 @@ def test_optimizer_with_default_config(self): def test_optimizer_with_random_config(self, i): optimizer_config = bayesian_optimizer_config_store.parameter_space.random() - optimizer_config.min_samples_required_for_guided_design_of_experiments = min(optimizer_config.min_samples_required_for_guided_design_of_experiments, 100) + optimizer_config.min_samples_required_for_guided_design_of_experiments = max(min(optimizer_config.min_samples_required_for_guided_design_of_experiments, 100), 20) if optimizer_config.surrogate_model_implementation == "HomogeneousRandomForestRegressionModel": rf_config = optimizer_config.homogeneous_random_forest_regression_model_config rf_config.n_estimators = min(rf_config.n_estimators, 20) + if optimizer_config.surrogate_model_implementation == MultiObjectiveRegressionEnhancedRandomForest.__name__: + optimizer_config.min_samples_required_for_guided_design_of_experiments = 25 + rerf_model_config = optimizer_config.regression_enhanced_random_forest_regression_model_config + rerf_model_config.max_basis_function_degree = min(rerf_model_config.max_basis_function_degree, 2) + # increased polynomial degree requires more data to estimate model parameters (poly term coefficients) + optimizer_config.min_samples_required_for_guided_design_of_experiments += 25 * (rerf_model_config.max_basis_function_degree - 1) + rf_model_config = rerf_model_config.sklearn_random_forest_regression_model_config + rf_model_config.perform_initial_random_forest_hyper_parameter_search = False + 
rf_model_config.max_depth = min(rf_model_config.max_depth, 10) + rf_model_config.n_jobs = min(rf_model_config.n_jobs, 4) print(f"[{i+1}] Creating a bayesian optimizer with config: {optimizer_config}") bayesian_optimizer = self.bayesian_optimizer_factory.create_remote_optimizer( @@ -201,11 +212,22 @@ def test_optimizer_with_random_config_random_objective(self, i): optimizer_config = bayesian_optimizer_config_store.parameter_space.random() - optimizer_config.min_samples_required_for_guided_design_of_experiments = min(optimizer_config.min_samples_required_for_guided_design_of_experiments, 100) + optimizer_config.min_samples_required_for_guided_design_of_experiments = max(min(optimizer_config.min_samples_required_for_guided_design_of_experiments, 100), 20) if optimizer_config.surrogate_model_implementation == "HomogeneousRandomForestRegressionModel": rf_config = optimizer_config.homogeneous_random_forest_regression_model_config rf_config.n_estimators = min(rf_config.n_estimators, 20) + if optimizer_config.surrogate_model_implementation == MultiObjectiveRegressionEnhancedRandomForest.__name__: + optimizer_config.min_samples_required_for_guided_design_of_experiments = 25 + rerf_model_config = optimizer_config.regression_enhanced_random_forest_regression_model_config + rerf_model_config.max_basis_function_degree = min(rerf_model_config.max_basis_function_degree, 2) + # increased polynomial degree requires more data to estimate model parameters (poly term coefficients) + optimizer_config.min_samples_required_for_guided_design_of_experiments += 25 * (rerf_model_config.max_basis_function_degree - 1) + rf_model_config = rerf_model_config.sklearn_random_forest_regression_model_config + rf_model_config.perform_initial_random_forest_hyper_parameter_search = False + rf_model_config.max_depth = min(rf_model_config.max_depth, 10) + rf_model_config.n_jobs = min(rf_model_config.n_jobs, 4) + print(f"[{i+1}] Creating a bayesian optimizer with config: {optimizer_config} \n\n\nObjective function config: {objective_function_config}") bayesian_optimizer = self.bayesian_optimizer_factory.create_remote_optimizer(
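
Background on the model family wired in by this patch: a regression-enhanced random forest fits a Lasso model on a polynomial basis expansion of the inputs, then boosts a random forest on the Lasso residuals (https://arxiv.org/pdf/1904.10416.pdf). The following is a minimal, self-contained sketch of that idea in plain scikit-learn, using the same quadratic test function as TestRegressionEnhancedRandomForestModel; it illustrates the technique only and is not the MLOS implementation above.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import PolynomialFeatures

rng = np.random.default_rng(0)
x = rng.uniform(0, 5, size=(200, 2))
# y = 1 - 3*x1 - 4*x2 - 0.5*x1**2 - 2*x2**2, plus a little noise
y = 1 - 3 * x[:, 0] - 4 * x[:, 1] - 0.5 * x[:, 0] ** 2 - 2 * x[:, 1] ** 2 + rng.normal(0, 0.1, 200)

# Root model: Lasso on a degree-2 polynomial basis captures the global trend.
# include_bias supplies the constant column, so the intercept is not fit separately.
poly = PolynomialFeatures(degree=2, include_bias=True)
design_matrix = poly.fit_transform(x)
lasso = LassoCV(cv=5, fit_intercept=False).fit(design_matrix, y)

# Residual model: a random forest fits whatever structure the Lasso missed.
residuals = y - lasso.predict(design_matrix)
forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(design_matrix, residuals)

# A prediction is the sum of the two stages.
x_new = rng.uniform(0, 5, size=(10, 2))
design_new = poly.transform(x_new)
y_pred = lasso.predict(design_new) + forest.predict(design_new)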