Skip to content

Commit

Permalink
Add screen that removes correlated predictors
Browse files Browse the repository at this point in the history
  • Loading branch information
csinva committed Mar 14, 2024
1 parent d3cd37d commit a5a4d6d
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 6 deletions.
12 changes: 12 additions & 0 deletions imodels/algebraic/gam_multitask.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pandas as pd
import json
from sklearn.preprocessing import StandardScaler
from imodels.util.transforms import CorrelationScreenTransformer

import imodels
from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
Expand Down Expand Up @@ -47,6 +48,7 @@ def __init__(
use_normalize_feature_targets=False,
use_internal_classifiers=False,
fit_target_curves=True,
use_correlation_screening_for_features=False,
random_state=42,
):
"""
Expand All @@ -66,6 +68,8 @@ def __init__(
whether to use internal classifiers (as opposed to regressors)
fit_target_curves: bool
whether to fit an EBM to predict the target
use_correlation_screening_for_features: bool
whether to use correlation screening for features
"""
self.ebm_kwargs = ebm_kwargs
self.multitask = multitask
Expand All @@ -77,6 +81,7 @@ def __init__(
self.renormalize_features = renormalize_features
self.use_internal_classifiers = use_internal_classifiers
self.fit_target_curves = fit_target_curves
self.use_correlation_screening_for_features = use_correlation_screening_for_features

# override ebm_kwargs
ebm_kwargs['random_state'] = random_state
Expand Down Expand Up @@ -158,6 +163,9 @@ def fit(self, X, y, sample_weight=None):
if self.renormalize_features:
self.scaler_ = StandardScaler()
feats = self.scaler_.fit_transform(feats)
if self.use_correlation_screening_for_features:
self.correlation_screener_ = CorrelationScreenTransformer()
feats = self.correlation_screener_.fit_transform(feats, y)
feats[np.isinf(feats)] = 0

# fit linear model
Expand Down Expand Up @@ -222,6 +230,8 @@ def predict(self, X):
feats = self._extract_ebm_features(X)
if hasattr(self, 'scaler_'):
feats = self.scaler_.transform(feats)
if hasattr(self, 'correlation_screener_'):
feats = self.correlation_screener_.transform(feats)
feats[np.isinf(feats)] = 0
return self.lin_model.predict(feats)

Expand All @@ -239,6 +249,8 @@ def predict_proba(self, X):
feats = self._extract_ebm_features(X)
if hasattr(self, 'scaler_'):
feats = self.scaler_.transform(feats)
if hasattr(self, 'correlation_screener_'):
feats = self.correlation_screener_.transform(feats)
return self.lin_model.predict_proba(feats)

# multi-output without multitask learning
Expand Down
77 changes: 75 additions & 2 deletions imodels/util/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
'''

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd


class Winsorizer():
Expand All @@ -21,7 +23,8 @@ def train(self, X):
if self.trim_quantile > 0:
for i_col in np.arange(X.shape[1]):
lower = np.percentile(X[:, i_col], self.trim_quantile * 100)
upper = np.percentile(X[:, i_col], 100 - self.trim_quantile * 100)
upper = np.percentile(
X[:, i_col], 100 - self.trim_quantile * 100)
self.winsor_lims[:, i_col] = [lower, upper]

def trim(self, X):
Expand Down Expand Up @@ -53,11 +56,81 @@ def train(self, X):
for i_col in np.arange(X.shape[1]):
num_uniq_vals = len(np.unique(X[:, i_col]))
if num_uniq_vals > 2: # don't scale binary variables which are effectively already rules
scale_multipliers[i_col] = 0.4 / (1.0e-12 + np.std(X_trimmed[:, i_col]))
scale_multipliers[i_col] = 0.4 / \
(1.0e-12 + np.std(X_trimmed[:, i_col]))
self.scale_multipliers = scale_multipliers

def scale(self, X):
if self.winsorizer != None:
return self.winsorizer.trim(X) * self.scale_multipliers
else:
return X * self.scale_multipliers


class CorrelationScreenTransformer(BaseEstimator, TransformerMixin):
    '''Finds groups of features whose pairwise absolute correlation meets
    a magnitude threshold and zeros out all but the first of each group.

    NOTE(review): grouping is greedy -- a pair joins the first existing
    group that shares a member, so two groups linked only by a later pair
    are not merged. Kept as-is to preserve existing behavior.

    Parameters
    ----------
    threshold: float
        Minimum absolute pairwise correlation for two features to be
        treated as duplicates (default 1.0, i.e. perfectly correlated).
    '''

    def __init__(self, threshold=1.0):
        # Initialize with a correlation threshold
        self.threshold = threshold
        self.correlated_feature_sets = []

    def fit(self, X, y=None):
        '''Identify groups of correlated feature columns.

        Parameters
        ----------
        X: array-like of shape (n_samples, n_features)
        y: ignored, accepted for sklearn API compatibility

        Returns
        -------
        self
        '''
        # Check if X is a pandas DataFrame; if not, convert it to DataFrame
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Reset state so that refitting does not accumulate stale groups
        # (previously fit() kept appending to the list from __init__).
        self.correlated_feature_sets = []

        # Absolute correlations, so a single >= threshold check suffices
        # (the old `<= -threshold` arm was dead code after .abs()).
        corr_matrix = X.corr().abs()

        # Scan the lower triangle for pairs at/above the threshold
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if corr_matrix.iloc[i, j] >= self.threshold:
                    # Merge the pair into an existing group if either
                    # member already belongs to one
                    found_set = False
                    for feature_set in self.correlated_feature_sets:
                        if i in feature_set or j in feature_set:
                            feature_set.update([i, j])
                            found_set = True
                            break
                    if not found_set:
                        self.correlated_feature_sets.append({i, j})

        # For each group keep the lowest index and mark the rest for zeroing;
        # sorted() makes the choice deterministic (list(set) order is a
        # CPython implementation detail).
        self.to_keep_remove = []
        for feature_set in self.correlated_feature_sets:
            feature_list = sorted(feature_set)
            self.to_keep_remove.append((feature_list[0], feature_list[1:]))

        return self

    def transform(self, X):
        '''Zero out all but the kept feature in each correlated group.

        Returns the same container type it was given: ndarray in,
        ndarray out; DataFrame in, DataFrame out.
        '''
        # Again, check if X is a pandas DataFrame; if not, convert it
        input_was_ndarray = isinstance(X, np.ndarray)
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Set the identified correlated features (except the first) to 0
        X_transformed = X.copy()
        for _keep, to_remove in self.to_keep_remove:
            X_transformed.iloc[:, to_remove] = 0

        # Bug fix: the original used `==` (a no-op comparison) instead of
        # `=`, so ndarray inputs were silently returned as DataFrames.
        if input_was_ndarray:
            X_transformed = X_transformed.values

        return X_transformed


if __name__ == '__main__':
    # Quick demo: duplicate one column and show the screen zeroing it out.
    demo = np.random.randn(5, 5)
    demo[:, 0] = [1, 1, 0, 1, 1]
    demo[:, 1] = demo[:, 0]  # perfectly correlated copy of column 0

    screener = CorrelationScreenTransformer()
    print(demo)
    print(screener.fit_transform(demo))
11 changes: 7 additions & 4 deletions tests/gam_multitask_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,16 +158,19 @@ def compare_models():

# remove some features to speed things up
# X = X[:, :2]
X = X[:50]
y = y[:50]
X, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

results = defaultdict(list)
for gam in tqdm([
MultiTaskGAMRegressor(use_correlation_screening_for_features=True),
MultiTaskGAMRegressor(),
# MultiTaskGAMRegressor(fit_target_curves=False),
AdaBoostRegressor(
estimator=MultiTaskGAMRegressor(
ebm_kwargs={'max_rounds': 50}),
n_estimators=8),
# AdaBoostRegressor(
# estimator=MultiTaskGAMRegressor(
# ebm_kwargs={'max_rounds': 50}),
# n_estimators=8),
# AdaBoostRegressor(estimator=MultiTaskGAMRegressor(
# multitask=True), n_estimators=2),
# MultiTaskGAMRegressor(multitask=True, onehot_prior=True),
Expand Down

0 comments on commit a5a4d6d

Please sign in to comment.