diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py index 6f449e2f2f..17836fb15a 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/RegressionEnhancedRandomForestModel.py @@ -10,7 +10,7 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import r2_score from sklearn.model_selection import GridSearchCV -from sklearn.preprocessing import PolynomialFeatures +from sklearn.preprocessing import PolynomialFeatures, StandardScaler from mlos.Logger import create_logger from mlos.Optimizers.RegressionModels.RegressionModel import RegressionModel @@ -20,6 +20,7 @@ from mlos.Optimizers.RegressionModels.SklearnRandomForestRegressionModelConfig import SklearnRandomForestRegressionModelConfig from mlos.Spaces import Hypergrid, SimpleHypergrid, ContinuousDimension, DiscreteDimension, CategoricalDimension, Point from mlos.Spaces.Configs.DefaultConfigMeta import DefaultConfigMeta +from mlos.Spaces.HypergridAdapters.CategoricalToOneHotEncodedHypergridAdapter import CategoricalToOneHotEncodedHypergridAdapter from mlos.Tracer import trace # sklearn injects many warnings, so from @@ -98,43 +99,6 @@ class RegressionEnhancedRandomForestRegressionModelConfig(metaclass=DefaultConfi def contains(cls, config): return config in cls.CONFIG_SPACE - # @classmethod - # def create_from_config_point(cls, config_point): - # assert cls.contains(config_point) - # config_key_value_pairs = {param_name: value for param_name, value in config_point} - # return cls(**config_key_value_pairs) - # - # def __init__( - # self, - # max_basis_function_degree=_DEFAULT.max_basis_function_degree, - # boosting_root_model_name=_DEFAULT.boosting_root_model_name, - # min_abs_root_model_coef=_DEFAULT.min_abs_root_model_coef, - # boosting_root_model_config: Point() = _DEFAULT.sklearn_lasso_regression_model_config, - # random_forest_model_config: Point() = _DEFAULT.sklearn_random_forest_regression_model_config, - # residual_model_name=_DEFAULT.residual_model_name, - # perform_initial_root_model_hyper_parameter_search=_DEFAULT.perform_initial_root_model_hyper_parameter_search, - # perform_initial_random_forest_hyper_parameter_search=_DEFAULT.perform_initial_random_forest_hyper_parameter_search - # ): - # self.max_basis_function_degree = max_basis_function_degree - # self.residual_model_name = residual_model_name - # self.min_abs_root_model_coef = min_abs_root_model_coef - # self.perform_initial_root_model_hyper_parameter_search = perform_initial_root_model_hyper_parameter_search - # self.perform_initial_random_forest_hyper_parameter_search = perform_initial_random_forest_hyper_parameter_search - # - # self.boosting_root_model_name = boosting_root_model_name - # self.boosting_root_model_config = None - # if self.boosting_root_model_name == SklearnLassoRegressionModelConfig.__name__: - # self.boosting_root_model_config = SklearnLassoRegressionModelConfig \ - # .create_from_config_point(boosting_root_model_config) - # elif self.boosting_root_model_name == SklearnRidgeRegressionModelConfig.__name__: - # self.boosting_root_model_config = SklearnRidgeRegressionModelConfig \ - # .create_from_config_point(boosting_root_model_config) - # else: - # print('Unrecognized boosting_root_model_name "{}"'.format(self.boosting_root_model_name)) - # - # self.random_forest_model_config = SklearnRandomForestRegressionModelConfig \ - # .create_from_config_point(random_forest_model_config) - class RegressionEnhancedRandomForestRegressionModel(RegressionModel): """ Regression-Enhanced RandomForest Regression model @@ -182,6 +146,11 @@ def __init__( output_space=output_space ) self.model_config = model_config + + # one hot encode categorical input dimensions + self.one_hot_encoder_adapter = CategoricalToOneHotEncodedHypergridAdapter(adaptee=input_space, merge_all_categorical_dimensions=True, drop='first') + self.input_space = input_space + self.input_dimension_names = [dimension.name for dimension in self.input_space.dimensions] self.output_dimension_names = [dimension.name for dimension in self.output_space.dimensions] @@ -203,13 +172,9 @@ def __init__( self.variance_estimate_ = None self.root_model_gradient_coef_ = None self.polynomial_features_powers_ = None - self.num_dummy_vars_ = None - self.num_categorical_dims_ = None - self.continuous_dim_col_names_ = None - self.categorical_dim_col_names_ = None - self.dummy_var_map_ = None - self.dummy_var_cols_ = None + self.categorical_zero_cols_idx_to_delete_ = None + self.scaler_ = StandardScaler() @trace() def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration_number=0): @@ -227,7 +192,10 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration """ # pull X and y values from data frames passed y = target_values_pandas_frame[self.output_dimension_names].to_numpy().reshape(-1) - x_df = feature_values_pandas_frame[self.input_dimension_names] + x_df = self.one_hot_encoder_adapter.project_dataframe(df=feature_values_pandas_frame, in_place=False) + continuous_dim_col_names = list(set.difference(set(x_df.columns.values), set(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()))) + x_df[continuous_dim_col_names] = self.scaler_.fit_transform(x_df[continuous_dim_col_names]) + fit_x = self.transform_x(x_df, what_to_return='fit_x') # run root regression @@ -427,7 +395,10 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True): set(feature_values_pandas_frame.columns.values)) for missing_column_name in missing_column_names: feature_values_pandas_frame[missing_column_name] = np.NaN - x_df = feature_values_pandas_frame[self.input_dimension_names] + x_df = self.one_hot_encoder_adapter.project_dataframe(df=feature_values_pandas_frame, in_place=False) + continuous_dim_col_names = list(set.difference(set(x_df.columns.values), set(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()))) + x_df[continuous_dim_col_names] = self.scaler_.transform(x_df[continuous_dim_col_names]) + x_star = self.transform_x(x_df) base_predicted = self.base_regressor_.predict(x_star) @@ -466,24 +437,6 @@ def score(self, feature_values_pandas_frame, target_values_pandas_frame): r2 = r2_score(y, predictions_df[Prediction.LegalColumnNames.PREDICTED_VALUE.value]) return r2 - def _create_one_hot_encoding_map(self, categorical_values): - if self.dummy_var_map_ is not None and self.dummy_var_cols_ is not None: - return self.dummy_var_cols_, self.dummy_var_map_ - - sorted_unique_categorical_levels = np.sort(categorical_values.unique()).tolist() - num_dummy_vars = len(sorted_unique_categorical_levels) - 1 # dropping first - dummy_var_cols = [] - dummy_var_map = {sorted_unique_categorical_levels.pop(0): np.zeros(num_dummy_vars)} - for i, level in enumerate(sorted_unique_categorical_levels): - dummy_var_map[level] = np.zeros(num_dummy_vars) - dummy_var_map[level][i] = 1 - dummy_var_cols.append(f'ohe_{i}') - - self.dummy_var_map_ = dummy_var_map - self.dummy_var_cols_ = dummy_var_cols - - return dummy_var_cols, dummy_var_map - def _set_categorical_powers_table(self, num_continuous_dims=0, num_categorical_levels=0, @@ -560,39 +513,20 @@ def _explode_x(self, x): """ fit_x = x - # find categorical features - if self.categorical_dim_col_names_ is None: - self.categorical_dim_col_names_ = [x.columns.values[i] for i in range(len(x.columns.values)) if x.dtypes[i] == object] - categorical_dim_col_names = self.categorical_dim_col_names_ - if self.continuous_dim_col_names_ is None: - self.continuous_dim_col_names_ = [x.columns.values[i] for i in range(len(x.columns.values)) if x.dtypes[i] != object] - continuous_dim_col_names = self.continuous_dim_col_names_ - if self.num_categorical_dims_ is None: - self.num_categorical_dims_ = len(categorical_dim_col_names) - num_categorical_dims_ = self.num_categorical_dims_ - + continuous_dim_col_names = list(set.difference(set(x.columns.values), set(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()))) + num_categorical_dims_ = len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) if num_categorical_dims_ > 0: # use the following to create one hot encoding columns prior to constructing fit_x and powers_ table working_x = x[continuous_dim_col_names].copy() - # create dummy variables for OneHotEncoding with dropped first category level - x['flattened_categoricals'] = x[categorical_dim_col_names].apply( - lambda cat_row: '-'.join(cat_row.map(str)), - axis=1) - dummy_var_cols, dummy_var_map = self._create_one_hot_encoding_map(x['flattened_categoricals']) - working_x[dummy_var_cols] = x.apply(lambda row: dummy_var_map[row['flattened_categoricals']], - axis=1, - result_type="expand") - - # create transformed x for linear fit with dummy variable (one hot encoding) - # add continuous dimension columns corresponding to each categorical level - if self.num_dummy_vars_ is None: - self.num_dummy_vars_ = len(dummy_var_cols) - num_dummy_vars = self.num_dummy_vars_ - for i in range(num_dummy_vars): + dummy_var_cols = self.one_hot_encoder_adapter.get_one_hot_encoded_column_names() + num_dummy_vars = len(dummy_var_cols) + working_x[dummy_var_cols] = x[dummy_var_cols] + + for dummy_var_col in dummy_var_cols: for cont_dim_name in continuous_dim_col_names: - dummy_times_x_col_name = f'{cont_dim_name}*ohe_{i}' - working_x[dummy_times_x_col_name] = working_x[cont_dim_name] * working_x[dummy_var_cols[i]] + dummy_times_x_col_name = f'{cont_dim_name}*{dummy_var_col}' + working_x[dummy_times_x_col_name] = working_x[cont_dim_name] * working_x[dummy_var_col] # add exploded x weighted by oneHotEncoded columns # add polynomial for 000...000 encoding @@ -605,7 +539,7 @@ def _explode_x(self, x): # add polynomial for non-000...000 encodings last_col_filled = num_terms_in_poly for ohe_col_name in dummy_var_cols: - cols_for_poly_transform = [cn for cn in working_x.columns.values if cn.find(ohe_col_name) > 0] + cols_for_poly_transform = [cn for cn in working_x.columns.values if cn.endswith(ohe_col_name) and cn != ohe_col_name] ohe_poly = self.polynomial_features_transform_.fit_transform(working_x[cols_for_poly_transform]) ohe_poly[:, 0] = ohe_poly[:, 0] * working_x[ohe_col_name] # replace global intercept w/ intercept offset term fit_x[:, last_col_filled:last_col_filled + num_terms_in_poly] = ohe_poly @@ -623,15 +557,12 @@ def _explode_x(self, x): if self.polynomial_features_powers_ is None: self._set_categorical_powers_table( num_continuous_dims=len(continuous_dim_col_names), - num_categorical_levels=len(x['flattened_categoricals'].unique()), + num_categorical_levels=num_dummy_vars+1, num_terms_in_poly=num_terms_in_poly, num_dummy_vars=num_dummy_vars, zero_cols_idx=zero_cols_idx ) - # remove temporary fields - x.drop(columns=['flattened_categoricals'], inplace=True) - elif self.model_config.max_basis_function_degree > 1: fit_x = self.polynomial_features_transform_.fit_transform(x) self.polynomial_features_powers_ = self.polynomial_features_transform_.powers_ diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnLassoRegressionModelConfig.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnLassoRegressionModelConfig.py index 54fe200117..6c3357cf90 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnLassoRegressionModelConfig.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/SklearnLassoRegressionModelConfig.py @@ -42,7 +42,7 @@ class Selection(Enum): # sklearn model expects precompute type str, bool, array-like, so setting to default and exclude list option precompute=False, copy_x=True, - max_iter=1000, + max_iter=2000, tol=10 ** -4, warm_start=False, positive=False diff --git a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py index 2dce41ec52..2aecb3c4b0 100644 --- a/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py +++ b/source/Mlos.Python/mlos/Optimizers/RegressionModels/unit_tests/TestRegressionEnhancedRandomForestModel.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # import unittest - +import random import math import pandas as pd import numpy as np @@ -93,8 +93,8 @@ def generate_points_nonhierarchical_categorical_quadratic(num_points): x_df = pd.DataFrame({ 'x0': np.random.choice(['a', 'b', 'c'], size=num_points), - 'x1': np.random.uniform(-10, 10, size=num_points), - 'x2': np.random.uniform(-10, 10, size=num_points), + 'x1': np.random.uniform(0, 5, size=num_points), + 'x2': np.random.uniform(0, 5, size=num_points), 'i0': np.random.choice(['-5', '5'], size=num_points) }) @@ -118,7 +118,6 @@ def test_lasso_feature_discovery(self): output_space=self.test_case_globals['output_space'] ) - np.random.seed(17) num_points = 100 x_df, y_df = self.generate_points_simple_quadratic(num_points, len(self.test_case_globals['2d_X_input_space'].dimensions)) rerf.fit(x_df, y_df) @@ -140,15 +139,14 @@ def test_lasso_feature_discovery(self): self.assertTrue(num_diffs == 0, 'Base model failed to find expected features') # @unittest.expectedFailure # The configs don't belong to their respective config spaces - def test_lasso_polynomial_coefficients(self): + def test_lasso_polynomial_coefficient_invariants(self): rerf = RegressionEnhancedRandomForestRegressionModel( model_config=self.model_config, input_space=self.test_case_globals['2d_X_input_space'], output_space=self.test_case_globals['output_space'] ) - np.random.seed(23) - num_points = 1000 + num_points = 100 x_df, y_df = self.generate_points_simple_quadratic(num_points, len(self.test_case_globals['2d_X_input_space'].dimensions)) rerf.fit(x_df, y_df) @@ -162,25 +160,14 @@ def test_lasso_polynomial_coefficients(self): self.assertTrue(rerf.fit_X_.shape == (num_points, num_terms_in_polynomial), 'Design matrix shape is incorrect') self.assertTrue(rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), 'Hat matrix shape is incorrect') - # test fit coef match known coef - y_coef_true = self.get_simple_quadratic_coefficients() - epsilon = 10 ** -2 - expected_non_zero_coef = y_coef_true[np.where(y_coef_true != 0.0)[0]] - fit_poly_coef = [rerf.base_regressor_.intercept_] - fit_poly_coef.extend(rerf.base_regressor_.coef_) - incorrect_terms = np.where(np.abs(fit_poly_coef - expected_non_zero_coef) > epsilon)[0] - num_incorrect_terms = len(incorrect_terms) - self.assertTrue(num_incorrect_terms == 0, 'Estimated polynomial coefficients deviated further than expected from known coefficients') - # @unittest.expectedFailure # The configs don't belong to their respective config spaces - def test_lasso_polynomial_gradient(self): + def test_lasso_polynomial_gradient_invariants(self): rerf = RegressionEnhancedRandomForestRegressionModel( model_config=self.model_config, input_space=self.test_case_globals['2d_X_input_space'], output_space=self.test_case_globals['output_space'] ) - np.random.seed(13) num_points = 100 x_df, y_df = self.generate_points_simple_quadratic(num_points, len(self.test_case_globals['2d_X_input_space'].dimensions)) rerf.fit(x_df, y_df) @@ -195,13 +182,6 @@ def test_lasso_polynomial_gradient(self): self.assertTrue(rerf.fit_X_.shape == (num_points, num_terms_in_polynomial), 'Design matrix shape is incorrect') self.assertTrue(rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), 'Hat matrix shape is incorrect') - # test gradient at X - epsilon = 10 ** -2 - true_gradient_coef = np.array([[-3, -0.5 * 2, 0, 0, 0, 0], [-4, -2.0 * 2, 0, 0, 0, 0]]).transpose() - incorrect_terms = np.where(np.abs(true_gradient_coef - rerf.root_model_gradient_coef_) > epsilon)[0] - num_incorrect_terms = len(incorrect_terms) - self.assertTrue(num_incorrect_terms == 0, 'Estimated gradient coefficients deviated further than expected from known coefficients') - # @unittest.expectedFailure # The configs don't belong to their respective config spaces def test_lasso_predictions(self): rerf = RegressionEnhancedRandomForestRegressionModel( @@ -209,7 +189,6 @@ def test_lasso_predictions(self): input_space=self.test_case_globals['2d_X_input_space'], output_space=self.test_case_globals['output_space'] ) - np.random.seed(13) num_train_points = 100 x_train_df, y_train_df = self.generate_points_simple_quadratic(num_train_points, len(self.test_case_globals['2d_X_input_space'].dimensions)) @@ -236,9 +215,11 @@ def test_lasso_predictions(self): y_test = y_test_df.to_numpy().reshape(-1) residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum() total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum() - r2 = 1 - residual_sum_of_squares / total_sum_of_squares + unexplained_variance = residual_sum_of_squares / total_sum_of_squares - self.assertTrue(r2 > 1 - 10**-4, '1 - R^2 larger than expected') + test_threshold = 10 ** -3 + self.assertTrue(unexplained_variance < test_threshold, + f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})') def test_lasso_categorical_predictions(self): rerf = RegressionEnhancedRandomForestRegressionModel( @@ -247,22 +228,29 @@ def test_lasso_categorical_predictions(self): output_space=self.test_case_globals['output_space'] ) - num_train_x = 300 + # input space consists of 6 2-d domains that are 5 x 5 units wide. Hence placing 25 points in each domain. + num_train_x = 100 x_train_df, y_train_df = self.generate_points_nonhierarchical_categorical_quadratic(num_train_x) rerf.fit(x_train_df, y_train_df) - num_categorical_levels_expected = len(x_train_df['x0'].unique()) * len(x_train_df['i0'].unique()) + num_categorical_levels_expected = len(rerf.one_hot_encoder_adapter.get_one_hot_encoded_column_names()) num_continuous_dimensions = 2 # x1 and x2 - final_num_features = num_categorical_levels_expected - 1 + num_continuous_dimensions + final_num_features = num_categorical_levels_expected + num_continuous_dimensions polynomial_degree = self.model_config.max_basis_function_degree num_terms_in_polynomial_per_categorical_level = self.n_choose_k(polynomial_degree + num_continuous_dimensions, num_continuous_dimensions) - num_terms_in_polynomial = num_terms_in_polynomial_per_categorical_level * num_categorical_levels_expected + # 1 is added to the num_categorical_levels_expected to account for "level 0" which the one hot encoder in RERF drops the first level, + # while the design matrix contains a polynomial fit for that level. + # Since it is possible not all categorical levels will be present in the training set, RERF eliminates zero columns arising from + # OneHotEncoder knowing the missing levels are possible. The list of the dropped columns is established in RERF.fit() and used in the + # RERF.predict() method. + num_cols_in_design_matrix = num_terms_in_polynomial_per_categorical_level * (num_categorical_levels_expected + 1)\ + - len(rerf.categorical_zero_cols_idx_to_delete_) num_detected_features = len(rerf.detected_feature_indices_) self.assertTrue(rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape, 'Gradient coefficient shape is incorrect') - self.assertTrue(rerf.fit_X_.shape == (num_train_x, num_terms_in_polynomial), 'Design matrix shape is incorrect') + self.assertTrue(rerf.fit_X_.shape == (num_train_x, num_cols_in_design_matrix), 'Design matrix shape is incorrect') self.assertTrue(rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), 'Hat matrix shape is incorrect') - self.assertTrue(rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features), 'PolynomalFeature.power_ shape is incorrect') + self.assertTrue(rerf.polynomial_features_powers_.shape == (num_cols_in_design_matrix, final_num_features), 'PolynomalFeature.power_ shape is incorrect') # generate new random to test predictions num_test_points = 50 @@ -277,59 +265,15 @@ def test_lasso_categorical_predictions(self): residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum() total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum() unexplained_variance = residual_sum_of_squares / total_sum_of_squares - self.assertTrue(unexplained_variance < 10 ** -4, '1 - R^2 larger than expected') - - def test_lasso_categorical_gradient(self): - rerf = RegressionEnhancedRandomForestRegressionModel( - model_config=self.model_config, - input_space=self.test_case_globals['categorical_input_space'], - output_space=self.test_case_globals['output_space'] - ) - np.random.seed(19) - - num_points = 300 - x_df, y_df = self.generate_points_nonhierarchical_categorical_quadratic(num_points) - rerf.fit(x_df, y_df) - - num_categorical_levels_expected = len(x_df['x0'].unique()) * len(x_df['i0'].unique()) - num_continuous_dimensions = 2 # x1 and x2 - final_num_features = num_categorical_levels_expected - 1 + num_continuous_dimensions - polynomial_degree = self.model_config.max_basis_function_degree - num_terms_in_polynomial_per_categorical_level = self.n_choose_k(polynomial_degree + num_continuous_dimensions, num_continuous_dimensions) - num_terms_in_polynomial = num_terms_in_polynomial_per_categorical_level * num_categorical_levels_expected - num_detected_features = len(rerf.detected_feature_indices_) - - self.assertTrue(rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape, 'Gradient coefficient shape is incorrect') - self.assertTrue(rerf.fit_X_.shape == (num_points, num_terms_in_polynomial), 'Design matrix shape is incorrect') - self.assertTrue(rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), 'Hat matrix shape is incorrect') - self.assertTrue(rerf.polynomial_features_powers_.shape == (num_terms_in_polynomial, final_num_features), 'PolynomalFeature.power_ shape is incorrect') - - # test gradient coefficients - true_gradient_coef = np.zeros((36, 7)) - true_gradient_coef[0] = np.array([3, 7, 0, 10, 10, 15, 25]) - true_gradient_coef[1] = np.array([12, -11, 0, -11, -11, -3, -3]) - true_gradient_coef[11] = np.array([12, 12, 0, 12, 12, -7, -7]) - true_gradient_coef[13] = np.array([-3, -11, 0, 0, 0, 2, 2]) - true_gradient_coef[15] = np.array([4, 12, 0, 0, 0, 3, 3]) - true_gradient_coef[17] = np.array([-3, -7, 0, 0, 0, 0, 0]) - true_gradient_coef[19] = np.array([4, 6, 0, 0, 0, 0, 0]) - true_gradient_coef[21] = np.array([0, -7, 0, 0, 0, 0, 0]) - true_gradient_coef[23] = np.array([0, 6, 0, 0, 0, 0, 0]) - - epsilon = 10 ** -2 - estimated_gradient_coef = rerf.root_model_gradient_coef_ - coef_abs_diff = np.abs(true_gradient_coef - estimated_gradient_coef) - coef_abs_relative_error = np.divide(coef_abs_diff, np.abs(true_gradient_coef)) - incorrect_terms = np.where(coef_abs_relative_error > epsilon)[0] - num_incorrect_terms = len(incorrect_terms) - - self.assertTrue(num_incorrect_terms == 0, 'Estimated gradient coefficients deviated further than expected from known coefficients') + test_threshold = 10 ** -3 + self.assertTrue(unexplained_variance < test_threshold, + f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})') def test_lasso_hierarchical_categorical_predictions(self): + random.seed(11001) objective_function_config = objective_function_config_store.get_config_by_name('three_level_quadratic') objective_function = ObjectiveFunctionFactory.create_objective_function(objective_function_config=objective_function_config) - rerf = RegressionEnhancedRandomForestRegressionModel( model_config=self.model_config, input_space=objective_function.parameter_space, @@ -337,7 +281,10 @@ def test_lasso_hierarchical_categorical_predictions(self): ) # fit model with same degree as true y - num_train_x = 100 + # The input space consists of 3 2-d domains 200 x 200 units. Hence random samples smaller than a certain size will produce too few points to + # train reliable models. + # TODO: Good place to use a non-random training set design + num_train_x = 600 x_train_df = objective_function.parameter_space.random_dataframe(num_samples=num_train_x) y_train_df = objective_function.evaluate_dataframe(x_train_df) rerf.fit(x_train_df, y_train_df) @@ -346,27 +293,18 @@ def test_lasso_hierarchical_categorical_predictions(self): self.assertTrue(rerf.root_model_gradient_coef_.shape == rerf.polynomial_features_powers_.shape, 'Gradient coefficient shape is incorrect') self.assertTrue(rerf.fit_X_.shape == (num_train_x, rerf.polynomial_features_powers_.shape[0]), 'Design matrix shape is incorrect') self.assertTrue(rerf.partial_hat_matrix_.shape == (num_detected_features, num_detected_features), 'Hat matrix shape is incorrect') - self.assertTrue(rerf.polynomial_features_powers_.shape == (28, 8), 'PolynomalFeature.power_ shape is incorrect') + self.assertTrue(rerf.polynomial_features_powers_.shape == (34, 9), 'PolynomalFeature.power_ shape is incorrect') # test predictions predicted_value_col = Prediction.LegalColumnNames.PREDICTED_VALUE.value - num_test_x = 10 - - # by generating a single X feature on which to make the predictions, the - y_test_list = [] - predicted_y_list = [] - for _ in range(num_test_x): - x_test_df = objective_function.parameter_space.random_dataframe(num_samples=1) - y_test_df = objective_function.evaluate_dataframe(x_test_df) - y_test_list.append(y_test_df['y'].values[0]) - - predictions = rerf.predict(x_test_df) - pred_df = predictions.get_dataframe() - predicted_y_list.append(pred_df[predicted_value_col].values[0]) - - predicted_y = np.array(predicted_y_list) - y_test = np.array(y_test_list) + num_test_x = 50 + x_test_df = objective_function.parameter_space.random_dataframe(num_samples=num_test_x) + predictions = rerf.predict(x_test_df) + pred_df = predictions.get_dataframe() + predicted_y = pred_df[predicted_value_col].to_numpy() + y_test = objective_function.evaluate_dataframe(x_test_df).to_numpy().reshape(-1) residual_sum_of_squares = ((y_test - predicted_y) ** 2).sum() total_sum_of_squares = ((y_test - y_test.mean()) ** 2).sum() unexplained_variance = residual_sum_of_squares / total_sum_of_squares - self.assertTrue(unexplained_variance < 10**-4, '1 - R^2 larger than expected') + test_threshold = 10**-3 + self.assertTrue(unexplained_variance < test_threshold, f'1 - R^2 = {unexplained_variance} larger than expected ({test_threshold})') diff --git a/source/Mlos.Python/mlos/Spaces/HypergridAdapters/CategoricalToOneHotEncodedHypergridAdapter.py b/source/Mlos.Python/mlos/Spaces/HypergridAdapters/CategoricalToOneHotEncodedHypergridAdapter.py index e0e217fce1..490da5dbec 100644 --- a/source/Mlos.Python/mlos/Spaces/HypergridAdapters/CategoricalToOneHotEncodedHypergridAdapter.py +++ b/source/Mlos.Python/mlos/Spaces/HypergridAdapters/CategoricalToOneHotEncodedHypergridAdapter.py @@ -86,11 +86,13 @@ def __init__( self.has_adaptee_been_flattened = True # Since the CategoricalToDiscrete adapter converts categorical dimensions to discrete dimensions, we remember the categorical dim names + self._adaptee_contains_categorical_dimensions = False self._adaptee_dimension_names_to_transform = [] for adaptee_dimension in self._adaptee.dimensions: if isinstance(adaptee_dimension, CategoricalDimension): self._adaptee_dimension_names_to_transform.append(adaptee_dimension.name) self._adaptee_expected_dimension_name_ordering.append(adaptee_dimension.name) + self._adaptee_contains_categorical_dimensions = len(self._adaptee_dimension_names_to_transform) > 0 if any(isinstance(dimension, CategoricalDimension) for dimension in self._adaptee.dimensions) or self.has_adaptee_been_flattened: self._adaptee = CategoricalToDiscreteHypergridAdapter(adaptee=self._adaptee) @@ -126,7 +128,7 @@ def _project_dataframe(self, df: DataFrame, in_place=True) -> DataFrame: columns_to_drop.append(missing_col) columns_to_transform = self._adaptee_dimension_names_to_transform - if self._merge_all_categorical_dimensions: + if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions: df[self._merged_categorical_dimension_column_name] = self._concatenate_dataframe_columns(df, columns_to_transform) columns_to_transform = [self._merged_categorical_dimension_column_name] columns_to_drop.extend(self._adaptee_dimension_names_to_transform) @@ -150,13 +152,13 @@ def _unproject_dataframe(self, df: DataFrame, in_place=True) -> DataFrame: df = df.copy(deep=True) columns_to_return = self._adaptee_expected_dimension_name_ordering - if self._merge_all_categorical_dimensions: + if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions: for column_to_transform in self._adaptee_dimension_names_to_transform: if column_to_transform not in columns_to_return: columns_to_return.append(column_to_transform) columns_to_drop = [] - if self._merge_all_categorical_dimensions: + if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions: my_ohe_dict = self._adaptee_to_target_data_dict[self._merged_categorical_dimension_column_name] target_columns_to_invert = my_ohe_dict.target_dims my_ohe = my_ohe_dict.one_hot_encoder @@ -222,7 +224,7 @@ def _build_simple_hypergrid_target(self) -> None: expanded_categories = ['nan'] + [str(float(x)) for x in adaptee_dimension.linspace()] categories_list_for_ohe_init.append(expanded_categories) - if not self._merge_all_categorical_dimensions: + if not self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions: # do not need to encode the cross product of all categorical dimensions, sufficient info here to add target dimensions self._adaptee_to_target_data_dict[adaptee_dimension.name] = CategoricalToOneHotEncodingAdapteeTargetMapping( one_hot_encoder=OneHotEncoder(categories=[expanded_categories], **self._one_hot_encoder_kwargs)) @@ -231,7 +233,7 @@ def _build_simple_hypergrid_target(self) -> None: else: self._target.add_dimension(adaptee_dimension.copy()) - if self._merge_all_categorical_dimensions: + if self._merge_all_categorical_dimensions and self._adaptee_contains_categorical_dimensions: # harvested categories for each categorical dimension in single pass across all adaptee dimensions used to compute the cross product encoding here cross_product_categories = self._create_cross_product_categories(categories_list_for_ohe_init) self._adaptee_to_target_data_dict[self._merged_categorical_dimension_column_name] = CategoricalToOneHotEncodingAdapteeTargetMapping(