OneHotEncoder Hypergrid Adapter integration in Regression-Enhanced Random Forest Model (#148)

* initial commit to confirm branch rename worked

* found and fixed defects in CategoricalToOneHotEncodingAdapter for cases without categorical dims; worked through changes in RERF and RERF tests

* removed unneeded comment/code

* removed test for gradient; will replace this when the path for using this gradient is clear

* added sklearn StandardScaler to RERF to reduce the hat matrix condition number; eliminated unit tests for exact polynomial and gradient coefficients, since these are relative to the scaled X and cannot be pinned down from unit tests; eliminated specific random seeds used in unit tests

* addressing PR feedback

* increased number of lasso fit iterations to address prediction inaccuracies when fitting a hierarchical categorical objective

* expect to have resolved the flaky test by increasing the training set size

* addressing pylint

* still trying to understand prediction accuracy miss

Co-authored-by: Ed Thayer <edthaye@microsoft.com>
edcthayer and Ed Thayer authored Oct 31, 2020
1 parent d946242 commit 417ff56
Showing 4 changed files with 76 additions and 205 deletions.
@@ -10,7 +10,7 @@
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from mlos.Logger import create_logger
from mlos.Optimizers.RegressionModels.RegressionModel import RegressionModel
@@ -20,6 +20,7 @@
from mlos.Optimizers.RegressionModels.SklearnRandomForestRegressionModelConfig import SklearnRandomForestRegressionModelConfig
from mlos.Spaces import Hypergrid, SimpleHypergrid, ContinuousDimension, DiscreteDimension, CategoricalDimension, Point
from mlos.Spaces.Configs.DefaultConfigMeta import DefaultConfigMeta
from mlos.Spaces.HypergridAdapters.CategoricalToOneHotEncodedHypergridAdapter import CategoricalToOneHotEncodedHypergridAdapter
from mlos.Tracer import trace

# sklearn injects many warnings, so from
@@ -98,43 +99,6 @@ class RegressionEnhancedRandomForestRegressionModelConfig(metaclass=DefaultConfigMeta):
def contains(cls, config):
return config in cls.CONFIG_SPACE

# @classmethod
# def create_from_config_point(cls, config_point):
# assert cls.contains(config_point)
# config_key_value_pairs = {param_name: value for param_name, value in config_point}
# return cls(**config_key_value_pairs)
#
# def __init__(
# self,
# max_basis_function_degree=_DEFAULT.max_basis_function_degree,
# boosting_root_model_name=_DEFAULT.boosting_root_model_name,
# min_abs_root_model_coef=_DEFAULT.min_abs_root_model_coef,
# boosting_root_model_config: Point() = _DEFAULT.sklearn_lasso_regression_model_config,
# random_forest_model_config: Point() = _DEFAULT.sklearn_random_forest_regression_model_config,
# residual_model_name=_DEFAULT.residual_model_name,
# perform_initial_root_model_hyper_parameter_search=_DEFAULT.perform_initial_root_model_hyper_parameter_search,
# perform_initial_random_forest_hyper_parameter_search=_DEFAULT.perform_initial_random_forest_hyper_parameter_search
# ):
# self.max_basis_function_degree = max_basis_function_degree
# self.residual_model_name = residual_model_name
# self.min_abs_root_model_coef = min_abs_root_model_coef
# self.perform_initial_root_model_hyper_parameter_search = perform_initial_root_model_hyper_parameter_search
# self.perform_initial_random_forest_hyper_parameter_search = perform_initial_random_forest_hyper_parameter_search
#
# self.boosting_root_model_name = boosting_root_model_name
# self.boosting_root_model_config = None
# if self.boosting_root_model_name == SklearnLassoRegressionModelConfig.__name__:
# self.boosting_root_model_config = SklearnLassoRegressionModelConfig \
# .create_from_config_point(boosting_root_model_config)
# elif self.boosting_root_model_name == SklearnRidgeRegressionModelConfig.__name__:
# self.boosting_root_model_config = SklearnRidgeRegressionModelConfig \
# .create_from_config_point(boosting_root_model_config)
# else:
# print('Unrecognized boosting_root_model_name "{}"'.format(self.boosting_root_model_name))
#
# self.random_forest_model_config = SklearnRandomForestRegressionModelConfig \
# .create_from_config_point(random_forest_model_config)


class RegressionEnhancedRandomForestRegressionModel(RegressionModel):
""" Regression-Enhanced RandomForest Regression model
@@ -182,6 +146,11 @@ def __init__(
output_space=output_space
)
self.model_config = model_config

# one hot encode categorical input dimensions
self.one_hot_encoder_adapter = CategoricalToOneHotEncodedHypergridAdapter(adaptee=input_space, merge_all_categorical_dimensions=True, drop='first')
self.input_space = input_space

self.input_dimension_names = [dimension.name for dimension in self.input_space.dimensions]
self.output_dimension_names = [dimension.name for dimension in self.output_space.dimensions]
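For orientation, here is a minimal sketch, not part of this commit, of how the adapter wired in above could be exercised on its own. Only the constructor arguments and method names are taken from this diff; the example space and dataframe are invented:

import pandas as pd
from mlos.Spaces import SimpleHypergrid, ContinuousDimension, CategoricalDimension
from mlos.Spaces.HypergridAdapters.CategoricalToOneHotEncodedHypergridAdapter import CategoricalToOneHotEncodedHypergridAdapter

# Hypothetical input space with one continuous and one categorical dimension.
input_space = SimpleHypergrid(
    name="params",
    dimensions=[
        ContinuousDimension(name="x0", min=0, max=1),
        CategoricalDimension(name="kernel", values=["linear", "poly", "rbf"]),
    ],
)
adapter = CategoricalToOneHotEncodedHypergridAdapter(
    adaptee=input_space,
    merge_all_categorical_dimensions=True,
    drop='first',
)
df = pd.DataFrame({"x0": [0.1, 0.5], "kernel": ["rbf", "poly"]})
# Categorical columns are projected to 0/1 dummy columns; drop='first'
# keeps k-1 columns for k levels, so the design matrix stays full rank.
projected_df = adapter.project_dataframe(df=df, in_place=False)
print(adapter.get_one_hot_encoded_column_names())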

@@ -203,13 +172,9 @@ def __init__(
self.variance_estimate_ = None
self.root_model_gradient_coef_ = None
self.polynomial_features_powers_ = None
self.num_dummy_vars_ = None
self.num_categorical_dims_ = None
self.continuous_dim_col_names_ = None
self.categorical_dim_col_names_ = None
self.dummy_var_map_ = None
self.dummy_var_cols_ = None

self.categorical_zero_cols_idx_to_delete_ = None
self.scaler_ = StandardScaler()

@trace()
def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration_number=0):
@@ -227,7 +192,10 @@ def fit(self, feature_values_pandas_frame, target_values_pandas_frame, iteration_number=0):
"""
# pull X and y values from data frames passed
y = target_values_pandas_frame[self.output_dimension_names].to_numpy().reshape(-1)
x_df = feature_values_pandas_frame[self.input_dimension_names]
x_df = self.one_hot_encoder_adapter.project_dataframe(df=feature_values_pandas_frame, in_place=False)
continuous_dim_col_names = list(set.difference(set(x_df.columns.values), set(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names())))
x_df[continuous_dim_col_names] = self.scaler_.fit_transform(x_df[continuous_dim_col_names])

fit_x = self.transform_x(x_df, what_to_return='fit_x')

# run root regression
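The commit message credits the new StandardScaler with reducing the hat matrix condition number. Since the hat matrix is X(X^T X)^{-1}X^T, its conditioning is driven by X^T X; the following standalone sketch, illustrative only and with invented data, shows the effect of scaling:

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
# Two features on very different scales, as raw tunables often are.
X = np.column_stack([rng.uniform(0.0, 1.0, 200), rng.uniform(0.0, 1e6, 200)])
print(np.linalg.cond(X.T @ X))                 # enormous for unscaled X
X_scaled = StandardScaler().fit_transform(X)
print(np.linalg.cond(X_scaled.T @ X_scaled))   # smaller by many orders of magnitude

Note the asymmetry with predict() below: fit() calls scaler_.fit_transform() to learn per-column means and variances, while predict() reuses them via scaler_.transform() so new rows are scaled consistently with the training data.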
@@ -427,7 +395,10 @@ def predict(self, feature_values_pandas_frame, include_only_valid_rows=True):
set(feature_values_pandas_frame.columns.values))
for missing_column_name in missing_column_names:
feature_values_pandas_frame[missing_column_name] = np.NaN
x_df = feature_values_pandas_frame[self.input_dimension_names]
x_df = self.one_hot_encoder_adapter.project_dataframe(df=feature_values_pandas_frame, in_place=False)
continuous_dim_col_names = list(set.difference(set(x_df.columns.values), set(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names())))
x_df[continuous_dim_col_names] = self.scaler_.transform(x_df[continuous_dim_col_names])

x_star = self.transform_x(x_df)

base_predicted = self.base_regressor_.predict(x_star)
@@ -466,24 +437,6 @@ def score(self, feature_values_pandas_frame, target_values_pandas_frame):
r2 = r2_score(y, predictions_df[Prediction.LegalColumnNames.PREDICTED_VALUE.value])
return r2

def _create_one_hot_encoding_map(self, categorical_values):
if self.dummy_var_map_ is not None and self.dummy_var_cols_ is not None:
return self.dummy_var_cols_, self.dummy_var_map_

sorted_unique_categorical_levels = np.sort(categorical_values.unique()).tolist()
num_dummy_vars = len(sorted_unique_categorical_levels) - 1 # dropping first
dummy_var_cols = []
dummy_var_map = {sorted_unique_categorical_levels.pop(0): np.zeros(num_dummy_vars)}
for i, level in enumerate(sorted_unique_categorical_levels):
dummy_var_map[level] = np.zeros(num_dummy_vars)
dummy_var_map[level][i] = 1
dummy_var_cols.append(f'ohe_{i}')

self.dummy_var_map_ = dummy_var_map
self.dummy_var_cols_ = dummy_var_cols

return dummy_var_cols, dummy_var_map
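The hand-rolled dummy-variable map removed above implemented drop-first one-hot encoding; that job now belongs to the hypergrid adapter. For reference, the same convention in plain sklearn, illustrative only and not code from this commit:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

levels = np.array([["a"], ["b"], ["c"], ["a"]])
# drop='first': k levels yield k-1 dummy columns, and the first sorted
# level ('a') becomes the all-zeros row, exactly like the removed map.
encoder = OneHotEncoder(drop='first')
print(encoder.fit_transform(levels).toarray())
# [[0. 0.]   <- 'a'
#  [1. 0.]   <- 'b'
#  [0. 1.]   <- 'c'
#  [0. 0.]]  <- 'a'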

def _set_categorical_powers_table(self,
num_continuous_dims=0,
num_categorical_levels=0,
@@ -560,39 +513,20 @@ def _explode_x(self, x):
"""
fit_x = x

# find categorical features
if self.categorical_dim_col_names_ is None:
self.categorical_dim_col_names_ = [x.columns.values[i] for i in range(len(x.columns.values)) if x.dtypes[i] == object]
categorical_dim_col_names = self.categorical_dim_col_names_
if self.continuous_dim_col_names_ is None:
self.continuous_dim_col_names_ = [x.columns.values[i] for i in range(len(x.columns.values)) if x.dtypes[i] != object]
continuous_dim_col_names = self.continuous_dim_col_names_
if self.num_categorical_dims_ is None:
self.num_categorical_dims_ = len(categorical_dim_col_names)
num_categorical_dims_ = self.num_categorical_dims_

continuous_dim_col_names = list(set.difference(set(x.columns.values), set(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names())))
num_categorical_dims_ = len(self.one_hot_encoder_adapter.get_one_hot_encoded_column_names())
if num_categorical_dims_ > 0:
# use the following to create one hot encoding columns prior to constructing fit_x and powers_ table
working_x = x[continuous_dim_col_names].copy()

# create dummy variables for OneHotEncoding with dropped first category level
x['flattened_categoricals'] = x[categorical_dim_col_names].apply(
lambda cat_row: '-'.join(cat_row.map(str)),
axis=1)
dummy_var_cols, dummy_var_map = self._create_one_hot_encoding_map(x['flattened_categoricals'])
working_x[dummy_var_cols] = x.apply(lambda row: dummy_var_map[row['flattened_categoricals']],
axis=1,
result_type="expand")

# create transformed x for linear fit with dummy variable (one hot encoding)
# add continuous dimension columns corresponding to each categorical level
if self.num_dummy_vars_ is None:
self.num_dummy_vars_ = len(dummy_var_cols)
num_dummy_vars = self.num_dummy_vars_
for i in range(num_dummy_vars):
dummy_var_cols = self.one_hot_encoder_adapter.get_one_hot_encoded_column_names()
num_dummy_vars = len(dummy_var_cols)
working_x[dummy_var_cols] = x[dummy_var_cols]

for dummy_var_col in dummy_var_cols:
for cont_dim_name in continuous_dim_col_names:
dummy_times_x_col_name = f'{cont_dim_name}*ohe_{i}'
working_x[dummy_times_x_col_name] = working_x[cont_dim_name] * working_x[dummy_var_cols[i]]
dummy_times_x_col_name = f'{cont_dim_name}*{dummy_var_col}'
working_x[dummy_times_x_col_name] = working_x[cont_dim_name] * working_x[dummy_var_col]

# add exploded x weighted by oneHotEncoded columns
# add polynomial for 000...000 encoding
@@ -605,7 +539,7 @@ def _explode_x(self, x):
# add polynomial for non-000...000 encodings
last_col_filled = num_terms_in_poly
for ohe_col_name in dummy_var_cols:
cols_for_poly_transform = [cn for cn in working_x.columns.values if cn.find(ohe_col_name) > 0]
cols_for_poly_transform = [cn for cn in working_x.columns.values if cn.endswith(ohe_col_name) and cn != ohe_col_name]
ohe_poly = self.polynomial_features_transform_.fit_transform(working_x[cols_for_poly_transform])
ohe_poly[:, 0] = ohe_poly[:, 0] * working_x[ohe_col_name] # replace global intercept w/ intercept offset term
fit_x[:, last_col_filled:last_col_filled + num_terms_in_poly] = ohe_poly
@@ -623,15 +557,12 @@ def _explode_x(self, x):
if self.polynomial_features_powers_ is None:
self._set_categorical_powers_table(
num_continuous_dims=len(continuous_dim_col_names),
num_categorical_levels=len(x['flattened_categoricals'].unique()),
num_categorical_levels=num_dummy_vars+1,
num_terms_in_poly=num_terms_in_poly,
num_dummy_vars=num_dummy_vars,
zero_cols_idx=zero_cols_idx
)

# remove temporary fields
x.drop(columns=['flattened_categoricals'], inplace=True)

elif self.model_config.max_basis_function_degree > 1:
fit_x = self.polynomial_features_transform_.fit_transform(x)
self.polynomial_features_powers_ = self.polynomial_features_transform_.powers_
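To make the loop above easier to follow: _explode_x crosses every continuous column with every one-hot column so that each categorical level gets its own slope terms in the linear fit. A standalone pandas sketch of that pattern, with invented column names:

import pandas as pd

df = pd.DataFrame({
    "x0": [0.1, 0.2, 0.3],
    "x1": [1.0, 2.0, 3.0],
    "cat_ohe_0": [0, 1, 0],   # hypothetical drop-first one-hot columns
    "cat_ohe_1": [0, 0, 1],
})
for ohe_col in ["cat_ohe_0", "cat_ohe_1"]:
    for cont_col in ["x0", "x1"]:
        # mirrors working_x[f'{cont_dim_name}*{dummy_var_col}'] above
        df[f"{cont_col}*{ohe_col}"] = df[cont_col] * df[ohe_col]
print(df.columns.tolist())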
@@ -42,7 +42,7 @@ class Selection(Enum):
# sklearn model expects precompute type str, bool, array-like, so setting to default and exclude list option
precompute=False,
copy_x=True,
max_iter=1000,
max_iter=2000,
tol=10 ** -4,
warm_start=False,
positive=False
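For context on the max_iter bump above: sklearn's Lasso fits by coordinate descent and emits a ConvergenceWarning when it exhausts max_iter before reaching tol, leaving the coefficients short of the optimum, which is consistent with the prediction inaccuracies mentioned in the commit message. A minimal sketch, illustrative only:

import warnings
from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso

X, y = make_regression(n_samples=100, n_features=50, noise=1.0, random_state=0)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    Lasso(alpha=0.01, max_iter=5).fit(X, y)   # far too few iterations
print(any(issubclass(w.category, ConvergenceWarning) for w in caught))  # True
Lasso(alpha=0.01, max_iter=2000).fit(X, y)    # the new cap from this commit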