diff --git a/boruta_shap_min/borutashap.py b/boruta_shap_min/borutashap.py index 83b43b0..a1933e7 100644 --- a/boruta_shap_min/borutashap.py +++ b/boruta_shap_min/borutashap.py @@ -1,3 +1,4 @@ +import warnings from sklearn.ensemble import ( RandomForestClassifier, @@ -14,10 +15,9 @@ import numpy as np from numpy.random import choice import seaborn as sns -import shap +from boruta_shap_min.shap import ShapExplainer -import warnings warnings.filterwarnings("ignore") @@ -44,7 +44,7 @@ def __init__( be returned. importance_measure: String - Which importance measure too use either Shap or Gini/Gain + Which importance measure to use either Shap or Gini/Gain classification: Boolean if true then the problem is either a binary or multiclass problem otherwise if false then it is regression @@ -54,9 +54,9 @@ def __init__( would make the algorithm more lenient. pvalue: float - A float used as a significance level again if the p-value is increased the algorithm will be more lenient making it smaller - would make it more strict also by making the model more strict could impact runtime making it slower. As it will be less likley - to reject and accept features. + A float used as a significance level again if the p-value is increased the algorithm will be more + lenient making it smaller would make it more strict also by making the model more strict could + impact runtime making it slower. As it will be less likely to reject and accept features. """ @@ -88,21 +88,20 @@ def __init__( def check_model(self): """ - Checks that a model object has been passed as a parameter when intiializing the BorutaShap class. + Checks that a model object has been passed as a parameter when initializing the BorutaShap class. Returns ------- Model Object - If no model specified then a base Random Forest will be returned otherwise the specifed model will + If no model specified then a base Random Forest will be returned otherwise the specified model will be returned. 
Raises ------ - AttirbuteError + AttributeError If the model object does not have the required attributes. """ - check_fit = hasattr(self.model, "fit") check_predict_proba = hasattr(self.model, "predict") @@ -133,11 +132,11 @@ def check_x(self): Returns ------- - Datframe + Dataframe Raises ------ - AttirbuteError + AttributeError If the data is not of the expected type. """ @@ -206,11 +205,11 @@ def check_if_chose_train_or_test_and_train_model(self): """ if self.stratify is not None and not self.classification: raise ValueError( - "Cannot take a strtified sample from continuous variable please bucket the variable and try again !" + "Cannot take a stratified sample from continuous variable please bucket the variable and try again !" ) if self.train_or_test.lower() == "test": - # keeping the same naming convenetion as to not add complexit later on + # keeping the same naming convention as to not add complexity later on ( self.X_boruta_train, self.X_boruta_test, @@ -583,7 +582,7 @@ def create_shadow_features(self): Creates the random shadow features by shuffling the existing columns. Returns: - Datframe with random permutations of the original columns. + Dataframe with random permutations of the original columns. 
""" self.X_shadow = self.X.apply(np.random.permutation) @@ -619,7 +618,7 @@ def calculate_zscore(array): def feature_importance(self, normalize): """ - Caculates the feature importances scores of the model + Calculates the feature importance scores of the model Parameters ---------- @@ -639,8 +638,8 @@ def feature_importance(self, normalize): """ if self.importance_measure == "shap": - self.explain() - vals = self.shap_values + xboruta = self.find_sample() if self.sample else self.X_boruta + vals = ShapExplainer(self.model, self.classification).explain(xboruta) if normalize: vals = self.calculate_zscore(vals) @@ -719,69 +718,6 @@ def find_sample(self): return self.X_boruta.iloc[sample_indices] - def explain(self): - """ - The shap package has numerous variants of explainers which use different assumptions depending on the model - type this function allows the user to choose explainer - - Returns: - shap values - - Raise - ---------- - ValueError: - if no model type has been specified tree as default - """ - - explainer = shap.TreeExplainer( - self.model, feature_perturbation="tree_path_dependent", approximate=True - ) - - if self.sample: - if self.classification: - # for some reason shap returns values wraped in a list of length 1 - - self.shap_values = np.array(explainer.shap_values(self.find_sample())) - if isinstance(self.shap_values, list): - class_inds = range(len(self.shap_values)) - shap_imp = np.zeros(self.shap_values[0].shape[1]) - for i, ind in enumerate(class_inds): - shap_imp += np.abs(self.shap_values[ind]).mean(0) - self.shap_values /= len(self.shap_values) - - elif len(self.shap_values.shape) == 3: - self.shap_values = np.abs(self.shap_values).sum(axis=0) - self.shap_values = self.shap_values.mean(axis=1) - - else: - self.shap_values = np.abs(self.shap_values).mean(0) - - else: - self.shap_values = explainer.shap_values(self.find_sample()) - self.shap_values = np.abs(self.shap_values).mean(0) - - else: - if self.classification: - # for some reason 
shap returns values wraped in a list of length 1 - self.shap_values = np.array(explainer.shap_values(self.X_boruta)) - if isinstance(self.shap_values, list): - class_inds = range(len(self.shap_values)) - shap_imp = np.zeros(self.shap_values[0].shape[1]) - for i, ind in enumerate(class_inds): - shap_imp += np.abs(self.shap_values[ind]).mean(0) - self.shap_values /= len(self.shap_values) - - elif len(self.shap_values.shape) == 3: - self.shap_values = np.abs(self.shap_values).sum(axis=0) - self.shap_values = self.shap_values.mean(axis=1) - - else: - self.shap_values = np.abs(self.shap_values).mean(0) - - else: - self.shap_values = explainer.shap_values(self.X_boruta) - self.shap_values = np.abs(self.shap_values).mean(0) - @staticmethod def binomial_H0_test(array, n, p, alternative): """ @@ -824,7 +760,7 @@ def bonferoni_corrections(pvals, alpha=0.05, n_tests=None): def test_features(self, iteration): """ For each feature with an undetermined importance perform a two-sided test of equality - with the maximum shadow value to determine if it is statistcally better + with the maximum shadow value to determine if it is statistically better Parameters ---------- diff --git a/boruta_shap_min/shap.py b/boruta_shap_min/shap.py new file mode 100644 index 0000000..3d25f59 --- /dev/null +++ b/boruta_shap_min/shap.py @@ -0,0 +1,50 @@ + +import numpy as np +import shap + + +class ShapExplainer: + + def __init__(self, model, classification): + self.model = model + self.classification = classification + + def explain(self, x_boruta): + """ + The shap package has numerous variants of explainers which use different assumptions depending on the model + type this function allows the user to choose explainer + + Returns: + shap values + + Raise + ---------- + ValueError: + if no model type has been specified tree as default + """ + explainer = shap.TreeExplainer( + self.model, feature_perturbation="tree_path_dependent", approximate=True + ) + + if self.classification: + # for some reason 
shap returns values wrapped in a list of length 1 + shap_vals = np.array(explainer.shap_values(x_boruta)) + if isinstance(shap_vals, list): + class_inds = range(len(shap_vals)) + shap_imp = np.zeros(shap_vals[0].shape[1]) + for i, ind in enumerate(class_inds): + shap_imp += np.abs(shap_vals[ind]).mean(0) + shap_vals /= len(shap_vals) + + elif len(shap_vals.shape) == 3: + shap_vals = np.abs(shap_vals).sum(axis=0) + shap_vals = shap_vals.mean(axis=1) + + else: + shap_vals = np.abs(shap_vals).mean(0) + + else: + shap_vals = explainer.shap_values(x_boruta) + shap_vals = np.abs(shap_vals).mean(0) + + return shap_vals diff --git a/pyproject.toml b/pyproject.toml index 3b80077..eaaf0ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "boruta-shap-min" -version = "0.1.3" +version = "0.1.4" description = "Minimal version of BorutaShap" authors = ["Eoghan Keany", "Mike Mull "] readme = "README.md" diff --git a/tests/test_borutashap.py b/tests/test_borutashap.py index 7496f5e..2a905d0 100644 --- a/tests/test_borutashap.py +++ b/tests/test_borutashap.py @@ -1,7 +1,7 @@ import pytest import numpy as np -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.tree import DecisionTreeRegressor from boruta_shap_min.borutashap import BorutaShap, load_data