Moved RERF from sklearn lasso to MLOS LassoCVRegressionModel (#255)

* Moved RERF use of sklearn lasso regressor to MLOS LassoCrossValidatedRegressionModel
* adding missed files in first commit
* incorporated PR feedback
* incorporating recent changes from ADO to disable broken CDPx test
* remove obsolete config.py for SklearnLassoRegressionModel
* protecting against negative prediction variance
* fixes for prediction var < 0 in RERF and LassoCV

Co-authored-by: Ed Thayer <edthaye@microsoft.com>
microsoft · Jul 30, 2021 · bb9d09d · bb9d09d
1 parent 9065116
commit bb9d09d
Show file tree

Hide file tree

Showing 5 changed files with 287 additions and 616 deletions.
diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedRegressionModel.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/LassoCrossValidatedRegressionModel.py
@@ -82,6 +82,7 @@ def __init__(
  }
  self._regressor = LassoCV(**self.lasso_model_kwargs)
  self._trained: bool = False
+ self.last_refit_iteration_number = None
 
  self.categorical_zero_cols_idx_to_delete_ = None
  self.dof_ = 0
@@ -126,23 +127,6 @@ def should_fit(self, num_samples):
  num_new_samples = num_samples - self.num_observations_used_to_fit
  return num_new_samples >= model_config.num_new_samples_per_input_dimension_before_refit * num_input_dims
 
- def _transform_x(self, x_df: DataFrame):
- # confirm feature_values_pandas_frame contains all expected columns
- # if any are missing, impute NaN values
- missing_column_names = set.difference(set(self.input_dimension_names), set(x_df.columns.values))
- for missing_column_name in missing_column_names:
- x_df[missing_column_name] = np.NaN
-
- # impute 0s for NaNs (NaNs can come from hierarchical hypergrids)
- x_df.fillna(value=0, inplace=True)
-
- # construct traditional design matrix when fitting with one hot encoded categorical dimensions
- if len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) > 0:
- design_matrix = self._create_one_hot_encoded_design_matrix(x_df)
- else:
- design_matrix = x_df.to_numpy()
- return design_matrix
-
  @trace()
  def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration_number):
  self.logger.debug(f"Fitting a {self.__class__.__name__} with {len(feature_values_pandas_frame.index)} observations.")
@@ -182,7 +166,7 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration
  def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
  self.logger.debug(f"Creating predictions for {len(feature_values_pandas_frame.index)} samples.")
 
- # dataframe column shortcuts
+ # Prediction dataframe column shortcuts
  is_valid_input_col = Prediction.LegalColumnNames.IS_VALID_INPUT.value
  predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
  predicted_value_var_col = Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE.value
@@ -209,14 +193,14 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
  # else:
  # design_matrix = features_df.to_numpy()
  design_matrix = self._transform_x(features_df)
- print(f'design_matrix.shape: {design_matrix.shape}')
  prediction_dataframe[predicted_value_col] = self._regressor.predict(design_matrix)
 
  # compute variance needed for prediction interval
  prediction_variances = []
  for xi in design_matrix:
  leverage_x = np.matmul(np.matmul(xi.T, self.partial_hat_matrix_), xi)
- prediction_variances.append(self.regressor_standard_error_ * (1.0 + leverage_x))
+ prediction_var = self.regressor_standard_error_ * (1.0 + leverage_x)
+ prediction_variances.append(prediction_var if prediction_var > 0 else 0)
 
  prediction_dataframe[predicted_value_var_col] = prediction_variances
  prediction_dataframe[dof_col] = self.dof_
@@ -226,13 +210,29 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
  predictions.add_invalid_rows_at_missing_indices(desired_index=feature_values_pandas_frame.index)
  return predictions
 
+ def _transform_x(self, x_df: DataFrame):
+ # confirm feature_values_pandas_frame contains all expected columns
+ # if any are missing, impute NaN values
+ missing_column_names = set.difference(set(self.input_dimension_names), set(x_df.columns.values))
+ for missing_column_name in missing_column_names:
+ x_df[missing_column_name] = np.NaN
+
+ # impute 0s for NaNs (NaNs can come from hierarchical hypergrids)
+ x_df.fillna(value=0, inplace=True)
+
+ # construct traditional design matrix when fitting with one hot encoded categorical dimensions
+ if len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) > 0:
+ design_matrix = self._create_one_hot_encoded_design_matrix(x_df)
+ else:
+ design_matrix = x_df.to_numpy()
+ return design_matrix
+
  def _create_one_hot_encoded_design_matrix(self, x: DataFrame) -> np.ndarray:
  assert len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) > 0
 
  # use the following to create one hot encoding columns prior to constructing fit_x and powers_ table
  num_continuous_features = len(self.continuous_dimension_names)
  continuous_features_x = x[self.continuous_dimension_names]
- print(f'continuous dim names: {self.continuous_dimension_names}')
 
  dummy_var_cols = self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()
  num_dummy_vars = len(dummy_var_cols)

diff --git a/...Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestConfigStore.py b/...Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestConfigStore.py
@@ -0,0 +1,47 @@
+#
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+#
+from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel, lasso_cross_validated_config_store
+from mlos.Optimizers.RegressionModels.SklearnRandomForestRegressionModelConfig import SklearnRandomForestRegressionModelConfig
+from mlos.Spaces import SimpleHypergrid, ContinuousDimension, DiscreteDimension, CategoricalDimension, Point
+from mlos.Spaces.Configs.ComponentConfigStore import ComponentConfigStore
+
+# TODO : Add back the RidgeRegressionModel boosting_root_model option after adding new RidgeCrossValidatedRegressionModel
+# TODO : Move from Sklearn random forest to HomogeneousRandomForest
+
+regression_enhanced_random_forest_config_store = ComponentConfigStore(
+ parameter_space=SimpleHypergrid(
+ name="regression_enhanced_random_forest_regression_model_config",
+ dimensions=[
+ DiscreteDimension(name="max_basis_function_degree", min=1, max=10),
+ CategoricalDimension(name="residual_model_name",
+ values=[SklearnRandomForestRegressionModelConfig.__name__]),
+ CategoricalDimension(name="boosting_root_model_name",
+ values=[LassoCrossValidatedRegressionModel.__name__]),
+ ContinuousDimension(name="min_abs_root_model_coef", min=0, max=2 ** 10),
+ CategoricalDimension(name="perform_initial_root_model_hyper_parameter_search", values=[False, True]),
+ CategoricalDimension(name="perform_initial_random_forest_hyper_parameter_search", values=[False, True])
+ ]
+ ).join(
+ subgrid=lasso_cross_validated_config_store.parameter_space,
+ on_external_dimension=CategoricalDimension(name="boosting_root_model_name",
+ values=[LassoCrossValidatedRegressionModel.__name__])
+ ).join(
+ subgrid=SklearnRandomForestRegressionModelConfig.CONFIG_SPACE,
+ on_external_dimension=CategoricalDimension(name="residual_model_name",
+ values=[SklearnRandomForestRegressionModelConfig.__name__])
+ ),
+ default=Point(
+ max_basis_function_degree=2,
+ residual_model_name=SklearnRandomForestRegressionModelConfig.__name__,
+ boosting_root_model_name=LassoCrossValidatedRegressionModel.__name__,
+ min_abs_root_model_coef=0.01,
+ lasso_regression_model_config=lasso_cross_validated_config_store.default,
+ sklearn_random_forest_regression_model_config=SklearnRandomForestRegressionModelConfig.DEFAULT,
+ perform_initial_root_model_hyper_parameter_search=True,
+ perform_initial_random_forest_hyper_parameter_search=True
+ ),
+ description="Regression-enhanced random forest model hyper-parameters. "
+ "Model inspired by : https://arxiv.org/pdf/1904.10416.pdf"
+)