Skip to content

Commit

Permalink
Merge pull request #2 from mikemull/shap
Browse files Browse the repository at this point in the history
Move SHAP stuff to separate file
  • Loading branch information
mikemull authored Jan 3, 2025
2 parents 95886e8 + 5693bd0 commit ec6286b
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 84 deletions.
100 changes: 18 additions & 82 deletions boruta_shap_min/borutashap.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings

from sklearn.ensemble import (
RandomForestClassifier,
Expand All @@ -14,10 +15,9 @@
import numpy as np
from numpy.random import choice
import seaborn as sns
import shap

from boruta_shap_min.shap import ShapExplainer

import warnings

warnings.filterwarnings("ignore")

Expand All @@ -44,7 +44,7 @@ def __init__(
be returned.
importance_measure: String
Which importance measure too use either Shap or Gini/Gain
Which importance measure to use either Shap or Gini/Gain
classification: Boolean
if true then the problem is either a binary or multiclass problem otherwise if false then it is regression
Expand All @@ -54,9 +54,9 @@ def __init__(
would make the algorithm more lenient.
pvalue: float
A float used as a significance level again if the p-value is increased the algorithm will be more lenient making it smaller
would make it more strict also by making the model more strict could impact runtime making it slower. As it will be less likley
to reject and accept features.
A float used as a significance level again if the p-value is increased the algorithm will be more
lenient making it smaller would make it more strict also by making the model more strict could
impact runtime making it slower. As it will be less likely to reject and accept features.
"""

Expand Down Expand Up @@ -88,21 +88,20 @@ def __init__(

def check_model(self):
"""
Checks that a model object has been passed as a parameter when intiializing the BorutaShap class.
Checks that a model object has been passed as a parameter when initializing the BorutaShap class.
Returns
-------
Model Object
If no model specified then a base Random Forest will be returned otherwise the specifed model will
If no model specified then a base Random Forest will be returned otherwise the specified model will
be returned.
Raises
------
AttirbuteError
AttributeError
If the model object does not have the required attributes.
"""

check_fit = hasattr(self.model, "fit")
check_predict_proba = hasattr(self.model, "predict")

Expand Down Expand Up @@ -133,11 +132,11 @@ def check_x(self):
Returns
-------
Datframe
Dataframe
Raises
------
AttirbuteError
AttributeError
If the data is not of the expected type.
"""
Expand Down Expand Up @@ -206,11 +205,11 @@ def check_if_chose_train_or_test_and_train_model(self):
"""
if self.stratify is not None and not self.classification:
raise ValueError(
"Cannot take a strtified sample from continuous variable please bucket the variable and try again !"
"Cannot take a stratified sample from continuous variable please bucket the variable and try again !"
)

if self.train_or_test.lower() == "test":
# keeping the same naming convenetion as to not add complexit later on
# keeping the same naming convention as to not add complexity later on
(
self.X_boruta_train,
self.X_boruta_test,
Expand Down Expand Up @@ -583,7 +582,7 @@ def create_shadow_features(self):
Creates the random shadow features by shuffling the existing columns.
Returns:
Datframe with random permutations of the original columns.
Dataframe with random permutations of the original columns.
"""
self.X_shadow = self.X.apply(np.random.permutation)

Expand Down Expand Up @@ -619,7 +618,7 @@ def calculate_zscore(array):

def feature_importance(self, normalize):
"""
Caculates the feature importances scores of the model
Calculates the feature importance scores of the model
Parameters
----------
Expand All @@ -639,8 +638,8 @@ def feature_importance(self, normalize):
"""

if self.importance_measure == "shap":
self.explain()
vals = self.shap_values
xboruta = self.find_sample() if self.sample else self.X_boruta
vals = ShapExplainer(self.model, self.classification).explain(xboruta)

if normalize:
vals = self.calculate_zscore(vals)
Expand Down Expand Up @@ -719,69 +718,6 @@ def find_sample(self):

return self.X_boruta.iloc[sample_indices]

def explain(self):
"""
The shap package has numerous variants of explainers which use different assumptions depending on the model
type this function allows the user to choose explainer
Returns:
shap values
Raise
----------
ValueError:
if no model type has been specified tree as default
"""

explainer = shap.TreeExplainer(
self.model, feature_perturbation="tree_path_dependent", approximate=True
)

if self.sample:
if self.classification:
# for some reason shap returns values wraped in a list of length 1

self.shap_values = np.array(explainer.shap_values(self.find_sample()))
if isinstance(self.shap_values, list):
class_inds = range(len(self.shap_values))
shap_imp = np.zeros(self.shap_values[0].shape[1])
for i, ind in enumerate(class_inds):
shap_imp += np.abs(self.shap_values[ind]).mean(0)
self.shap_values /= len(self.shap_values)

elif len(self.shap_values.shape) == 3:
self.shap_values = np.abs(self.shap_values).sum(axis=0)
self.shap_values = self.shap_values.mean(axis=1)

else:
self.shap_values = np.abs(self.shap_values).mean(0)

else:
self.shap_values = explainer.shap_values(self.find_sample())
self.shap_values = np.abs(self.shap_values).mean(0)

else:
if self.classification:
# for some reason shap returns values wraped in a list of length 1
self.shap_values = np.array(explainer.shap_values(self.X_boruta))
if isinstance(self.shap_values, list):
class_inds = range(len(self.shap_values))
shap_imp = np.zeros(self.shap_values[0].shape[1])
for i, ind in enumerate(class_inds):
shap_imp += np.abs(self.shap_values[ind]).mean(0)
self.shap_values /= len(self.shap_values)

elif len(self.shap_values.shape) == 3:
self.shap_values = np.abs(self.shap_values).sum(axis=0)
self.shap_values = self.shap_values.mean(axis=1)

else:
self.shap_values = np.abs(self.shap_values).mean(0)

else:
self.shap_values = explainer.shap_values(self.X_boruta)
self.shap_values = np.abs(self.shap_values).mean(0)

@staticmethod
def binomial_H0_test(array, n, p, alternative):
"""
Expand Down Expand Up @@ -824,7 +760,7 @@ def bonferoni_corrections(pvals, alpha=0.05, n_tests=None):
def test_features(self, iteration):
"""
For each feature with an undetermined importance perform a two-sided test of equality
with the maximum shadow value to determine if it is statistcally better
with the maximum shadow value to determine if it is statistically better
Parameters
----------
Expand Down
50 changes: 50 additions & 0 deletions boruta_shap_min/shap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

import numpy as np
import shap


class ShapExplainer:
    """Compute per-feature SHAP importance scores for a fitted tree-based model."""

    def __init__(self, model, classification):
        # model: fitted estimator supported by shap.TreeExplainer (tree ensembles).
        # classification: True for binary/multiclass classifiers, False for regressors.
        self.model = model
        self.classification = classification

    def explain(self, x_boruta):
        """
        Compute the mean absolute SHAP value of each feature over ``x_boruta``.

        Uses ``shap.TreeExplainer`` with tree-path-dependent feature
        perturbation and the approximate algorithm for speed.

        Parameters
        ----------
        x_boruta
            Data (original plus shadow features) to explain; a DataFrame or
            2-D array accepted by ``explainer.shap_values``.

        Returns
        -------
        numpy.ndarray
            1-D array of non-negative per-feature importance scores.
        """
        explainer = shap.TreeExplainer(
            self.model, feature_perturbation="tree_path_dependent", approximate=True
        )

        if self.classification:
            raw = explainer.shap_values(x_boruta)

            # Older shap versions return one (n_samples, n_features) array per
            # class in a list.  Check BEFORE converting to ndarray: the
            # previous code ran isinstance() on np.array(raw), which is never
            # a list, so this branch was dead and its result (shap_imp) was
            # silently discarded.
            if isinstance(raw, list):
                shap_imp = np.zeros(raw[0].shape[1])
                for class_vals in raw:
                    shap_imp += np.abs(class_vals).mean(0)
                # Average of the per-class mean |SHAP| values.
                return shap_imp / len(raw)

            shap_vals = np.array(raw)
            if shap_vals.ndim == 3:
                # NOTE(review): assumes the newer shap layout
                # (n_samples, n_features, n_classes): summing |SHAP| over
                # samples then averaging over classes yields one score per
                # feature — confirm against the installed shap version.
                shap_vals = np.abs(shap_vals).sum(axis=0)
                return shap_vals.mean(axis=1)

            return np.abs(shap_vals).mean(0)

        shap_vals = explainer.shap_values(x_boruta)
        return np.abs(shap_vals).mean(0)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "boruta-shap-min"
version = "0.1.3"
version = "0.1.4"
description = "Minimal version of BorutaShap"
authors = ["Eoghan Keany", "Mike Mull <mike.mull@gmail.com>"]
readme = "README.md"
Expand Down
2 changes: 1 addition & 1 deletion tests/test_borutashap.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest

import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor

from boruta_shap_min.borutashap import BorutaShap, load_data

Expand Down

0 comments on commit ec6286b

Please sign in to comment.