Skip to content

Commit

Permalink
Moved RERF from sklearn lasso to MLOS LassoCVRegressionModel (#255)
Browse files Browse the repository at this point in the history
* Moved RERF use of sklearn lasso regressor to MLOS LassoCrossValidatedRegressionModel

* adding missed files in first commit

* incorporated PR feedback

* incorporating recent changes from ADO to disable broken CDPx test

* remove obsolete config.py for SklearnLassoRegressionModel

* protecting against negative prediction variance

* fixes for prediction var < 0 in RERF and LassoCV

Co-authored-by: Ed Thayer <edthaye@microsoft.com>
  • Loading branch information
edcthayer and Ed Thayer authored Jul 30, 2021
1 parent 9065116 commit bb9d09d
Show file tree
Hide file tree
Showing 5 changed files with 287 additions and 616 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def __init__(
}
self._regressor = LassoCV(**self.lasso_model_kwargs)
self._trained: bool = False
self.last_refit_iteration_number = None

self.categorical_zero_cols_idx_to_delete_ = None
self.dof_ = 0
Expand Down Expand Up @@ -126,23 +127,6 @@ def should_fit(self, num_samples):
num_new_samples = num_samples - self.num_observations_used_to_fit
return num_new_samples >= model_config.num_new_samples_per_input_dimension_before_refit * num_input_dims

def _transform_x(self, x_df: DataFrame):
    """Convert a feature dataframe into the design matrix used by the regressor.

    Every name in ``self.input_dimension_names`` that is absent from ``x_df`` is
    added as a new column, and then all NaN values are replaced with 0.

    NOTE: ``x_df`` is modified in place (columns may be added and NaNs are filled).

    :param x_df: feature values, one row per observation; mutated in place.
    :return: ``self._create_one_hot_encoded_design_matrix(x_df)`` when the one hot
        encoder adapter reports at least one encoded column, otherwise ``x_df.to_numpy()``.
    """
    # Walk the expected names in their declared order (rather than iterating a set)
    # so the order in which missing columns are appended is deterministic.
    for expected_column_name in self.input_dimension_names:
        if expected_column_name not in x_df.columns:
            # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
            x_df[expected_column_name] = np.nan

    # impute 0s for NaNs (NaNs can come from hierarchical hypergrids)
    x_df.fillna(value=0, inplace=True)

    # construct traditional design matrix when fitting with one hot encoded categorical dimensions
    if len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) > 0:
        return self._create_one_hot_encoded_design_matrix(x_df)
    return x_df.to_numpy()

@trace()
def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration_number):
self.logger.debug(f"Fitting a {self.__class__.__name__} with {len(feature_values_pandas_frame.index)} observations.")
Expand Down Expand Up @@ -182,7 +166,7 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration
def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
self.logger.debug(f"Creating predictions for {len(feature_values_pandas_frame.index)} samples.")

# dataframe column shortcuts
# Prediction dataframe column shortcuts
is_valid_input_col = Prediction.LegalColumnNames.IS_VALID_INPUT.value
predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value
predicted_value_var_col = Prediction.LegalColumnNames.PREDICTED_VALUE_VARIANCE.value
Expand All @@ -209,14 +193,14 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
# else:
# design_matrix = features_df.to_numpy()
design_matrix = self._transform_x(features_df)
print(f'design_matrix.shape: {design_matrix.shape}')
prediction_dataframe[predicted_value_col] = self._regressor.predict(design_matrix)

# compute variance needed for prediction interval
prediction_variances = []
for xi in design_matrix:
leverage_x = np.matmul(np.matmul(xi.T, self.partial_hat_matrix_), xi)
prediction_variances.append(self.regressor_standard_error_ * (1.0 + leverage_x))
prediction_var = self.regressor_standard_error_ * (1.0 + leverage_x)
prediction_variances.append(prediction_var if prediction_var > 0 else 0)

prediction_dataframe[predicted_value_var_col] = prediction_variances
prediction_dataframe[dof_col] = self.dof_
Expand All @@ -226,13 +210,29 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
predictions.add_invalid_rows_at_missing_indices(desired_index=feature_values_pandas_frame.index)
return predictions

def _transform_x(self, x_df: DataFrame):
    """Convert a feature dataframe into the design matrix used by the regressor.

    Every name in ``self.input_dimension_names`` that is absent from ``x_df`` is
    added as a new column, and then all NaN values are replaced with 0.

    NOTE: ``x_df`` is modified in place (columns may be added and NaNs are filled).

    :param x_df: feature values, one row per observation; mutated in place.
    :return: ``self._create_one_hot_encoded_design_matrix(x_df)`` when the one hot
        encoder adapter reports at least one encoded column, otherwise ``x_df.to_numpy()``.
    """
    # Walk the expected names in their declared order (rather than iterating a set)
    # so the order in which missing columns are appended is deterministic.
    for expected_column_name in self.input_dimension_names:
        if expected_column_name not in x_df.columns:
            # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
            x_df[expected_column_name] = np.nan

    # impute 0s for NaNs (NaNs can come from hierarchical hypergrids)
    x_df.fillna(value=0, inplace=True)

    # construct traditional design matrix when fitting with one hot encoded categorical dimensions
    if len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) > 0:
        return self._create_one_hot_encoded_design_matrix(x_df)
    return x_df.to_numpy()

def _create_one_hot_encoded_design_matrix(self, x: DataFrame) -> np.ndarray:
assert len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) > 0

# use the following to create one hot encoding columns prior to constructing fit_x and powers_ table
num_continuous_features = len(self.continuous_dimension_names)
continuous_features_x = x[self.continuous_dimension_names]
print(f'continuous dim names: {self.continuous_dimension_names}')

dummy_var_cols = self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()
num_dummy_vars = len(dummy_var_cols)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
from mlos.Optimizers.RegressionModels.LassoCrossValidatedRegressionModel import LassoCrossValidatedRegressionModel, lasso_cross_validated_config_store
from mlos.Optimizers.RegressionModels.SklearnRandomForestRegressionModelConfig import SklearnRandomForestRegressionModelConfig
from mlos.Spaces import SimpleHypergrid, ContinuousDimension, DiscreteDimension, CategoricalDimension, Point
from mlos.Spaces.Configs.ComponentConfigStore import ComponentConfigStore

# TODO : Add back the RidgeRegressionModel boosting_root_model option after adding new RidgeCrossValidatedRegressionModel
# TODO : Move from Sklearn random forest to HomogeneousRandomForest

# Class names used as the categorical values that select the root and residual models.
# Each categorical dimension below currently offers exactly one choice.
_root_model_name = LassoCrossValidatedRegressionModel.__name__
_residual_model_name = SklearnRandomForestRegressionModelConfig.__name__

# Top-level hyper-parameters, before the per-model sub-grids are joined in.
_rerf_base_parameter_space = SimpleHypergrid(
    name="regression_enhanced_random_forest_regression_model_config",
    dimensions=[
        DiscreteDimension(name="max_basis_function_degree", min=1, max=10),
        CategoricalDimension(name="residual_model_name", values=[_residual_model_name]),
        CategoricalDimension(name="boosting_root_model_name", values=[_root_model_name]),
        ContinuousDimension(name="min_abs_root_model_coef", min=0, max=2 ** 10),
        CategoricalDimension(name="perform_initial_root_model_hyper_parameter_search", values=[False, True]),
        CategoricalDimension(name="perform_initial_random_forest_hyper_parameter_search", values=[False, True]),
    ],
)

regression_enhanced_random_forest_config_store = ComponentConfigStore(
    # The lasso sub-grid is joined on boosting_root_model_name and the random forest
    # sub-grid on residual_model_name, in that order.
    parameter_space=_rerf_base_parameter_space.join(
        subgrid=lasso_cross_validated_config_store.parameter_space,
        on_external_dimension=CategoricalDimension(name="boosting_root_model_name", values=[_root_model_name]),
    ).join(
        subgrid=SklearnRandomForestRegressionModelConfig.CONFIG_SPACE,
        on_external_dimension=CategoricalDimension(name="residual_model_name", values=[_residual_model_name]),
    ),
    default=Point(
        max_basis_function_degree=2,
        residual_model_name=_residual_model_name,
        boosting_root_model_name=_root_model_name,
        min_abs_root_model_coef=0.01,
        lasso_regression_model_config=lasso_cross_validated_config_store.default,
        sklearn_random_forest_regression_model_config=SklearnRandomForestRegressionModelConfig.DEFAULT,
        perform_initial_root_model_hyper_parameter_search=True,
        perform_initial_random_forest_hyper_parameter_search=True,
    ),
    description="Regression-enhanced random forest model hyper-parameters. "
                "Model inspired by : https://arxiv.org/pdf/1904.10416.pdf",
)
Loading

0 comments on commit bb9d09d

Please sign in to comment.