diff --git a/README.rst b/README.rst
index 6555ff37..89586390 100644
--- a/README.rst
+++ b/README.rst
@@ -60,21 +60,37 @@ Installation
------------
FACET supports both PyPI and Anaconda.
-
+We recommend installing FACET in a dedicated environment.
Anaconda
~~~~~~~~
-.. code-block:: RST
+.. code-block:: sh
+
- conda install gamma-facet -c bcg_gamma -c conda-forge
+ conda create -n facet
+ conda activate facet
+ conda install -c bcg_gamma -c conda-forge gamma-facet
Pip
~~~
-.. code-block:: RST
+macOS and Linux:
+^^^^^^^^^^^^^^^^
+
+.. code-block:: sh
+
+ python -m venv facet
+ source facet/bin/activate
+ pip install gamma-facet
+
+Windows:
+^^^^^^^^
+
+.. code-block:: dosbatch
+
+ python -m venv facet
+ facet\Scripts\activate.bat
pip install gamma-facet
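
Either way, a quick check that the installation succeeded (a minimal sketch; it assumes the package exposes ``facet.__version__``):

.. code-block:: python

    import facet
    print(facet.__version__)
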
@@ -163,7 +179,7 @@ hyperparameter configurations and even multiple learners with the `LearnerSelect
cv=rkf_cv,
n_jobs=-3,
scoring="r2"
- ).fit(sample=diabetes_sample)
+ ).fit(diabetes_sample)
# get summary report
selector.summary_report()
@@ -238,7 +254,7 @@ The key global metrics for each pair of features in a model are:
inspector = LearnerInspector(
pipeline=selector.best_estimator_,
n_jobs=-3
- ).fit(sample=diabetes_sample)
+ ).fit(diabetes_sample)
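
Once fitted, the inspector is used as before; for instance (a minimal sketch, assuming the fitted ``inspector`` from the snippet above):

.. code-block:: python

    # feature importance per feature, normalised to a total of 1.0
    importance = inspector.feature_importance(method="rms")
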
**Synergy**
diff --git a/environment.yml b/environment.yml
index 5b5c6f92..3d182892 100644
--- a/environment.yml
+++ b/environment.yml
@@ -16,6 +16,7 @@ dependencies:
- scipy ~= 1.10
- shap ~= 0.41
- sklearndf >= 2.2rc, < 3a
+ - typing_extensions ~= 4.3
# build/test
- conda-build ~= 3.23.3
- conda-verify ~= 3.1.1
diff --git a/pyproject.toml b/pyproject.toml
index b45107af..498e55d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,14 +14,15 @@ dist-name = "gamma-facet"
license = "Apache Software License v2.0"
requires = [
- "gamma-pytools >=2.1rc2,<3a",
- "matplotlib ~=3.0",
- "numpy >=1.21,<2a", # cannot use ~= due to conda bug
- "packaging >=20",
- "pandas >=1.0",
- "scipy ~=1.2",
- "shap >=0.34,<0.42a",
- "sklearndf >=2.2rc,<3a",
+ "gamma-pytools >=2.1rc2,<3a",
+ "matplotlib ~=3.0",
+ "numpy >=1.21,<2a", # cannot use ~= due to conda bug
+ "packaging >=20",
+ "pandas >=1.0",
+ "scipy ~=1.2",
+ "shap >=0.39",
+ "sklearndf >=2.2rc,<3a",
+ "typing_extensions ~=4.0",
]
requires-python = ">=3.7,<4a"
@@ -71,49 +72,51 @@ no-binary.min = ["matplotlib", "shap"]
[build.matrix.min]
# direct requirements of gamma-facet
-gamma-pytools = "~=2.1rc2"
-matplotlib = "~=3.0.3"
-numpy = "==1.21.6" # cannot use ~= due to conda bug
-packaging = "~=20.9"
-pandas = "~=1.0.5"
-python = ">=3.7.12,<3.8a" # cannot use ~= due to conda bug
-scipy = "~=1.4.1"
-shap = "~=0.34.0"
-sklearndf = "~=2.2rc"
+gamma-pytools = "~=2.1rc2"
+matplotlib = "~=3.0.3"
+numpy = "==1.21.6" # cannot use ~= due to conda bug
+packaging = "~=20.9"
+pandas = "~=1.0.5"
+python = ">=3.7.12,<3.8a" # cannot use ~= due to conda bug
+scipy = "~=1.4.1"
+shap = "~=0.39.0"
+sklearndf = "~=2.2rc"
+typing_extensions = "~=4.0.0"
# additional minimum requirements of sklearndf
-boruta = "~=0.3.0"
-lightgbm = "~=3.0.0"
+boruta = "~=0.3.0"
+lightgbm = "~=3.0.0"
scikit-learn = "~=0.24.2"
-xgboost = "~=1.5"
+xgboost = "~=1.5"
# additional minimum requirements of gamma-pytools
-joblib = "~=0.14.1"
-typing_inspect = "~=0.4.0"
+joblib = "~=0.14.1"
+typing_inspect = "~=0.4.0"
# additional minimum requirements of shap
-ipython = "==7.0"
-numba = "~=0.55" # required to support numpy 1.21
+ipython = "==7.0"
+numba = "~=0.55" # required to support numpy 1.21
[build.matrix.max]
# direct requirements of gamma-facet
gamma-pytools = ">=2.1rc2,<3a"
matplotlib = "~=3.6"
numpy = ">=1.23,<2a" # cannot use ~= due to conda bug
-packaging = ">=20"
+packaging = ">=20"
pandas = "~=2.0"
-python = ">=3.9,<4a" # cannot use ~= due to conda bug
+python = ">=3.9,<4a" # cannot use ~= due to conda bug
scipy = "~=1.10"
-shap = "~=0.41"
-sklearndf = ">=2.2rc,<3a"
+shap = "~=0.41"
+sklearndf = ">=2.2rc,<3a"
+typing_extensions = "~=4.3"
# additional maximum requirements of sklearndf
-boruta = "~=0.3"
-lightgbm = "~=3.3"
-scikit-learn = "~=1.1"
-xgboost = "~=1.5"
+boruta = "~=0.3"
+lightgbm = "~=3.3"
+scikit-learn = "~=1.1"
+xgboost = "~=1.5"
# additional maximum requirements of gamma-pytools
-joblib = "~=1.1"
-typing_inspect = "~=0.7"
+joblib = "~=1.1"
+typing_inspect = "~=0.7"
# additional maximum requirements of shap
-ipython = ">=7"
-numba = ">=0.55.2" # required to support numpy 1.22
+ipython = ">=7"
+numba = ">=0.55.2" # required to support numpy 1.22
[tool.black]
# quiet = "True"
diff --git a/sphinx/auxiliary/Diabetes_getting_started_example.ipynb b/sphinx/auxiliary/Diabetes_getting_started_example.ipynb
index 9881faa0..1e84f433 100644
--- a/sphinx/auxiliary/Diabetes_getting_started_example.ipynb
+++ b/sphinx/auxiliary/Diabetes_getting_started_example.ipynb
@@ -280,7 +280,7 @@
" cv=rkf_cv, \n",
" n_jobs=-3,\n",
" scoring=\"r2\"\n",
- ").fit(sample=diabetes_sample)\n",
+ ").fit(diabetes_sample)\n",
"\n",
"# get summary report\n",
"selector.summary_report()"
@@ -364,7 +364,7 @@
"inspector = LearnerInspector(\n",
" pipeline=selector.best_estimator_,\n",
" n_jobs=-3\n",
- ").fit(sample=diabetes_sample)"
+ ").fit(diabetes_sample)"
]
},
{
diff --git a/sphinx/source/tutorial/Classification_with_Facet.ipynb b/sphinx/source/tutorial/Classification_with_Facet.ipynb
index 768c87f3..633dd449 100644
--- a/sphinx/source/tutorial/Classification_with_Facet.ipynb
+++ b/sphinx/source/tutorial/Classification_with_Facet.ipynb
@@ -1333,7 +1333,7 @@
" pipeline=clf_selector.best_estimator_,\n",
" n_jobs=-3,\n",
" verbose=False,\n",
- ").fit(sample=prediab_initial_features)"
+ ").fit(prediab_initial_features)"
]
},
{
@@ -1909,7 +1909,7 @@
" pipeline=clf_selector.best_estimator_,\n",
" n_jobs=-3,\n",
" verbose=False,\n",
- ").fit(sample=prediab_no_redundant_feat)"
+ ").fit(prediab_no_redundant_feat)"
]
},
{
diff --git a/src/facet/inspection/__init__.py b/src/facet/inspection/__init__.py
index ba588856..a7939066 100644
--- a/src/facet/inspection/__init__.py
+++ b/src/facet/inspection/__init__.py
@@ -6,5 +6,7 @@
of a learner pipeline which has been fitted using cross-validation.
"""
from ._explainer import *
+from ._function_inspector import *
from ._inspection import *
from ._learner_inspector import *
+from ._model_inspector import *
diff --git a/src/facet/inspection/_explainer.py b/src/facet/inspection/_explainer.py
index d2573175..bdaf11f0 100644
--- a/src/facet/inspection/_explainer.py
+++ b/src/facet/inspection/_explainer.py
@@ -10,11 +10,12 @@
Any,
Callable,
Dict,
+ Generic,
Iterable,
List,
Mapping,
Optional,
- Type,
+ TypeVar,
Union,
cast,
)
@@ -23,24 +24,28 @@
import numpy.typing as npt
import pandas as pd
import shap
-from packaging import version
-from sklearn.base import BaseEstimator
+from sklearn.base import ClassifierMixin, RegressorMixin
+from typing_extensions import TypeAlias
from pytools.api import AllTracker, inheritdoc, validate_type
from pytools.expression import Expression, HasExpressionRepr
from pytools.expression.atomic import Id
from pytools.parallelization import Job, JobQueue, JobRunner, ParallelizableMixin
-from sklearndf import ClassifierDF, LearnerDF, RegressorDF
+
+from ._types import ModelFunction
log = logging.getLogger(__name__)
__all__ = [
"BaseExplainer",
+ "ExactExplainerFactory",
"ExplainerFactory",
"ExplainerJob",
"ExplainerQueue",
+ "FunctionExplainerFactory",
"KernelExplainerFactory",
"ParallelExplainer",
+ "PermutationExplainerFactory",
"TreeExplainerFactory",
]
@@ -48,16 +53,8 @@
# conditional and mock imports
#
-if version.parse(shap.__version__) < version.parse("0.36"):
- # noinspection PyUnresolvedReferences
- from shap.explainers.explainer import Explainer
-else:
- try:
- # noinspection PyUnresolvedReferences
- from shap import Explainer
- except ImportError as e:
- log.warning(e)
- Explainer = type("Explainer", (), {})
+
+from shap import Explainer, Explanation
try:
import catboost
@@ -67,13 +64,22 @@
catboost = ModuleType("catboost")
catboost.Pool = type("Pool", (), {})
+#
+# Type aliases
+#
+
+ArraysAny: TypeAlias = Union[npt.NDArray[Any], List[npt.NDArray[Any]]]
+ArraysFloat: TypeAlias = Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]]
+Learner: TypeAlias = Union[RegressorMixin, ClassifierMixin]
+XType: TypeAlias = Union[npt.NDArray[Any], pd.DataFrame, catboost.Pool]
+YType: TypeAlias = Union[npt.NDArray[Any], pd.Series, None]
#
-# Type variables and aliases
+# Type variables
#
-ArraysAny = Union[npt.NDArray[Any], List[npt.NDArray[Any]]]
-ArraysFloat = Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]]
+T_Model = TypeVar("T_Model")
+
#
# Ensure all symbols introduced below are included in __all__
@@ -87,48 +93,87 @@
#
-class BaseExplainer(metaclass=ABCMeta):
+class BaseExplainer(
+ Explainer, # type: ignore
+ metaclass=ABCMeta,
+):
"""
Abstract base class of SHAP explainers, providing stubs for methods used by FACET
but not consistently supported by class :class:`shap.Explainer` across different
versions of the `shap` package.
+
+ Provides unified support for the old and new explainer APIs:
+
+ - The old API uses methods :meth:`.shap_values` and :meth:`.shap_interaction_values`
+ to compute SHAP values and interaction values, respectively. They return
+ *numpy* arrays for single-output or single-class models, and lists of *numpy*
+ arrays for multi-output or multi-class models.
+ - The new API introduced in :mod:`shap` 0.36 makes explainer objects callable;
+ direct calls to an explainer object return an :class:`.Explanation` object
+ that contains the SHAP values and interaction values.
+      For multi-output or multi-class models, the values array has an additional
+      trailing dimension for the outputs or classes.
+
+    As of :mod:`shap` 0.36, the old API is deprecated for the majority of explainers,
+    yet as of :mod:`shap` 0.41 the :class:`shap.KernelExplainer` still supports
+    only the old API.
+ We remedy this by adding support for both APIs to all explainers created through
+ an :class:`ExplainerFactory` object.
"""
- def __init__(self, *args: Any, **kwargs: Any) -> None:
+ @property
+ @abstractmethod
+ def supports_interaction(self) -> bool:
"""
- :param args: positional args (should usually be empty)
- :param kwargs: keyword args (should usually be empty)
+ ``True`` if the explainer supports interaction effects, ``False`` otherwise.
"""
- super().__init__(*args, **kwargs)
+ pass
- # noinspection PyPep8Naming,PyUnresolvedReferences
- @abstractmethod
- def shap_values(
- self,
- X: Union[npt.NDArray[Any], pd.DataFrame, catboost.Pool],
- y: Union[npt.NDArray[Any], pd.Series, None] = None,
- **kwargs: Any,
- ) -> ArraysFloat:
+ # noinspection PyPep8Naming
+ def shap_values(self, X: XType, y: YType = None, **kwargs: Any) -> ArraysFloat:
"""
Estimate the SHAP values for a set of samples.
:param X: matrix of samples (# samples x # features) on which to explain the
model's output
:param y: array of label values for each sample, used when explaining loss
- functions
+ functions (optional)
:param kwargs: additional arguments specific to the explainer implementation
:return: SHAP values as an array of shape `(n_observations, n_features)`;
a list of such arrays in the case of a multi-output model
"""
- pass
+
+ explanation: Explanation
+ if y is None:
+ explanation = self(X, **kwargs)
+ else:
+ explanation = self(X, y, **kwargs)
+
+ values = explanation.values
+
+ interactions: int = kwargs.get("interactions", 1)
+ if isinstance(values, np.ndarray):
+ if values.ndim == 2 + interactions:
+ # convert the array of shape
+ # (n_observations, n_features, ..., n_outputs)
+ # to a list of arrays of shape (n_observations, n_features, ...)
+ return [values[..., i] for i in range(values.shape[-1])]
+ elif values.ndim == 1 + interactions:
+ # return a single array of shape (n_observations, n_features)
+ return values
+ else:
+ raise ValueError(
+ f"SHAP values have unexpected shape {values.shape}; "
+ "expected shape (n_observations, n_features, ..., n_outputs) "
+ "or (n_observations, n_features, ...)"
+ )
+ else:
+ assert isinstance(values, list), "SHAP values must be a list or array"
+ return values
# noinspection PyPep8Naming,PyUnresolvedReferences
- @abstractmethod
def shap_interaction_values(
- self,
- X: Union[npt.NDArray[Any], pd.DataFrame, catboost.Pool],
- y: Union[npt.NDArray[Any], pd.Series, None] = None,
- **kwargs: Any,
+ self, X: XType, y: YType = None, **kwargs: Any
) -> ArraysFloat:
"""
Estimate the SHAP interaction values for a set of samples.
@@ -136,20 +181,35 @@ def shap_interaction_values(
:param X: matrix of samples (# samples x # features) on which to explain the
model's output
:param y: array of label values for each sample, used when explaining loss
- functions
+ functions (optional)
:param kwargs: additional arguments specific to the explainer implementation
:return: SHAP values as an array of shape
`(n_observations, n_features, n_features)`;
a list of such arrays in the case of a multi-output model
"""
- pass
+ if self.supports_interaction:
+ return self.shap_values(X, y, interactions=2, **kwargs)
+ else:
+ raise NotImplementedError(
+ f"{self.__class__.__name__} does not support interaction values"
+ )
-class ExplainerFactory(HasExpressionRepr, metaclass=ABCMeta):
+class ExplainerFactory(HasExpressionRepr, Generic[T_Model], metaclass=ABCMeta):
"""
A factory for constructing :class:`~shap.Explainer` objects.
"""
+ #: Additional keyword arguments to be passed to the explainer constructor.
+    explainer_kwargs: Dict[str, Any]
+
+ def __init__(self, **kwargs: Any) -> None:
+ """
+ :param kwargs: additional keyword arguments to be passed to the explainer
+ """
+ super().__init__()
+ self.explainer_kwargs = kwargs
+
@property
@abstractmethod
def explains_raw_output(self) -> bool:
@@ -176,7 +236,7 @@ def uses_background_dataset(self) -> bool:
@abstractmethod
def make_explainer(
- self, model: LearnerDF, data: Optional[pd.DataFrame]
+ self, model: T_Model, data: Optional[pd.DataFrame]
) -> BaseExplainer:
"""
Construct a new :class:`~shap.Explainer` to compute shap values.
@@ -216,10 +276,10 @@ class ExplainerJob(Job[ArraysAny]):
interactions: bool
#: the feature values of the observations to be explained
- X: Union[npt.NDArray[Any], pd.DataFrame]
+ X: XType
#: the target values of the observations to be explained
- y: Union[None, npt.NDArray[Any], pd.Series]
+ y: YType
#: additional arguments specific to the explainer method
kwargs: Dict[str, Any]
@@ -228,8 +288,8 @@ class ExplainerJob(Job[ArraysAny]):
def __init__(
self,
explainer: BaseExplainer,
- X: Union[npt.NDArray[Any], pd.DataFrame],
- y: Union[None, npt.NDArray[Any], pd.Series] = None,
+ X: XType,
+ y: YType = None,
*,
interactions: bool,
**kwargs: Any,
@@ -294,8 +354,8 @@ class ExplainerQueue(JobQueue[ArraysAny, ArraysAny]):
def __init__(
self,
explainer: BaseExplainer,
- X: Union[npt.NDArray[Any], pd.DataFrame],
- y: Union[None, npt.NDArray[Any], pd.Series] = None,
+ X: XType,
+ y: YType = None,
*,
interactions: bool,
max_job_size: int,
@@ -309,9 +369,14 @@ def __init__(
calculate SHAP interaction values
:param max_job_size: the maximum number of observations to allocate to each job
:param kwargs: additional arguments specific to the explainer method
+
+ :raise NotImplementedError: if `X` is a :class:`~catboost.Pool`;
+ this is currently not supported
"""
super().__init__()
+ if isinstance(X, catboost.Pool):
+ raise NotImplementedError("CatBoost Pool is not supported")
self.explainer = explainer
self.X = X.values if isinstance(X, pd.DataFrame) else X
self.y = y.values if isinstance(y, pd.Series) else y
@@ -391,7 +456,9 @@ def __init__(
:param max_job_size: the maximum number of observations to allocate to any of
the explainer jobs running in parallel
"""
- super().__init__(
+ Explainer.__init__(self, model=explainer.model)
+ ParallelizableMixin.__init__(
+ self,
n_jobs=n_jobs,
shared_memory=shared_memory,
pre_dispatch=pre_dispatch,
@@ -409,32 +476,38 @@ def __init__(
assert __init__.__doc__ is not None
__init__.__doc__ += cast(str, ParallelizableMixin.__init__.__doc__)
+ @property
+ def supports_interaction(self) -> bool:
+ """[see superclass]"""
+ return self.explainer.supports_interaction
+
+ def __call__(self, *args: Any, **kwargs: Any) -> Explanation:
+ return self.explainer(*args, **kwargs)
+
# noinspection PyPep8Naming
- def shap_values(
- self,
- X: Union[npt.NDArray[Any], pd.DataFrame, catboost.Pool],
- y: Union[npt.NDArray[Any], pd.Series, None] = None,
- **kwargs: Any,
- ) -> ArraysFloat:
+ def shap_values(self, X: XType, y: YType = None, **kwargs: Any) -> ArraysFloat:
"""[see superclass]"""
- return self._run(self.explainer, X, y, interactions=False, **kwargs)
+ if y is None:
+ return self.explainer.shap_values(X=X, **kwargs)
+ else:
+ return self.explainer.shap_values(X=X, y=y, **kwargs)
# noinspection PyPep8Naming
def shap_interaction_values(
- self,
- X: Union[npt.NDArray[Any], pd.DataFrame, catboost.Pool],
- y: Union[npt.NDArray[Any], pd.Series, None] = None,
- **kwargs: Any,
+ self, X: XType, y: YType = None, **kwargs: Any
) -> ArraysFloat:
"""[see superclass]"""
- return self._run(self.explainer, X, y, interactions=True, **kwargs)
+ if y is None:
+ return self.explainer.shap_interaction_values(X=X, **kwargs)
+ else:
+ return self.explainer.shap_interaction_values(X=X, y=y, **kwargs)
# noinspection PyPep8Naming
def _run(
self,
explainer: BaseExplainer,
- X: Union[npt.NDArray[Any], pd.DataFrame, catboost.Pool],
- y: Union[None, npt.NDArray[Any], pd.Series] = None,
+ X: XType,
+ y: YType = None,
*,
interactions: bool,
**kwargs: Any,
@@ -455,13 +528,43 @@ def _run(
# TreeExplainer factory
#
-_TreeExplainer: Optional[Type[BaseExplainer]] = None
+
+@inheritdoc(match="""[see superclass]""")
+class _TreeExplainer(
+ shap.explainers.Tree, # type: ignore
+ BaseExplainer,
+):
+ @property
+ def supports_interaction(self) -> bool:
+ """[see superclass]"""
+ return True
+
+ # noinspection PyPep8Naming
+ def __call__(
+ self, X: XType, y: YType = None, check_additivity: bool = False, **kwargs: Any
+ ) -> Explanation:
+ # we override the __call__ method to change the default value of
+ # arg check_additivity to False
+ return cast(
+ Explanation,
+ super().__call__(X=X, y=y, check_additivity=check_additivity, **kwargs),
+ )
+
+ # noinspection PyPep8Naming
+ def shap_values(
+ self, X: XType, y: YType = None, check_additivity: bool = False, **kwargs: Any
+ ) -> ArraysFloat:
+ """[see superclass]"""
+ return cast(
+ ArraysFloat,
+ super().shap_values(X=X, y=y, check_additivity=check_additivity, **kwargs),
+ )
@inheritdoc(match="""[see superclass]""")
-class TreeExplainerFactory(ExplainerFactory):
+class TreeExplainerFactory(ExplainerFactory[Learner]):
"""
- A factory constructing :class:`~shap.TreeExplainer` objects.
+ A factory constructing :class:`~shap.TreeExplainer` instances.
"""
def __init__(
@@ -470,6 +573,7 @@ def __init__(
model_output: Optional[str] = None,
feature_perturbation: Optional[str] = None,
uses_background_dataset: bool = True,
+ **kwargs: Any,
) -> None:
"""
:param model_output: override the default model output parameter (optional)
@@ -479,7 +583,7 @@ def __init__(
dataset on to the tree explainer even if a background dataset is passed
to :meth:`.make_explainer`
"""
- super().__init__()
+ super().__init__(**kwargs)
validate_type(
model_output, expected_type=str, optional=True, name="arg model_output"
)
@@ -493,12 +597,7 @@ def __init__(
self.feature_perturbation = feature_perturbation
self._uses_background_dataset = uses_background_dataset
- global _TreeExplainer
-
- if _TreeExplainer is None:
- _TreeExplainer = type(
- "_TreeExplainer", (shap.TreeExplainer, BaseExplainer), {}
- )
+ __init__.__doc__ = f"{__init__.__doc__}{ExplainerFactory.__init__.__doc__}"
@property
def explains_raw_output(self) -> bool:
@@ -516,7 +615,7 @@ def uses_background_dataset(self) -> bool:
return self._uses_background_dataset
def make_explainer(
- self, model: LearnerDF, data: Optional[pd.DataFrame] = None
+ self, model: Learner, data: Optional[pd.DataFrame] = None
) -> BaseExplainer:
"""[see superclass]"""
@@ -525,7 +624,7 @@ def make_explainer(
assert _TreeExplainer is not None, "Global tree explainer is set"
explainer = _TreeExplainer(
- model=model.native_estimator,
+ model=model,
data=data if self._uses_background_dataset else None,
**self._remove_null_kwargs(
dict(
@@ -533,12 +632,7 @@ def make_explainer(
feature_perturbation=self.feature_perturbation,
)
),
- )
-
- # we overwrite the shap_values method - need to tell mypy to allow this
- # as an exception
- explainer.shap_values = functools.partial( # type: ignore
- explainer.shap_values, check_additivity=False
+ **self.explainer_kwargs,
)
return explainer
@@ -552,6 +646,68 @@ def to_expression(self) -> Expression:
)
+#
+# Abstract function explainer factory
+#
+
+
+@inheritdoc(match="""[see superclass]""")
+class FunctionExplainerFactory(
+ ExplainerFactory[Union[Learner, ModelFunction]], metaclass=ABCMeta
+):
+ """
+ A factory constructing :class:`~shap.Explainer` instances that use Python functions
+ as the underlying model.
+ """
+
+ @property
+ def uses_background_dataset(self) -> bool:
+ """``True``, since function explainers typically use a background dataset"""
+ return True
+
+ def make_explainer(
+ self, model: Union[Learner, ModelFunction], data: Optional[pd.DataFrame]
+ ) -> BaseExplainer:
+ """[see superclass]"""
+ self._validate_background_dataset(data=data)
+
+ # create a model function from the model
+ try:
+ if isinstance(model, RegressorMixin):
+ # noinspection PyUnresolvedReferences
+ model_fn = model.predict
+ elif isinstance(model, ClassifierMixin):
+ # noinspection PyUnresolvedReferences
+ model_fn = model.predict_proba
+ elif callable(model):
+ model_fn = model
+ else:
+ model_fn = None
+ except AttributeError as cause:
+ raise TypeError(
+ f"arg model does not support default prediction method: {cause}"
+ ) from cause
+ if not model_fn:
+ raise TypeError(
+ "arg model is neither a regressor nor a classifier: "
+ f"{type(model).__name__}"
+ )
+
+ return self.make_explainer_from_function(model_fn=model_fn, data=data)
+
+ @abstractmethod
+ def make_explainer_from_function(
+ self, model_fn: ModelFunction, data: Optional[pd.DataFrame]
+ ) -> BaseExplainer:
+ """
+ Construct an explainer from a function.
+
+ :param model_fn: the function representing the model
+ :param data: the background dataset
+ :return: the explainer
+ """
+
+
#
# KernelExplainer factory
#
@@ -561,23 +717,32 @@ class _KernelExplainer(
shap.KernelExplainer, # type: ignore
BaseExplainer,
):
- # noinspection PyPep8Naming,PyUnresolvedReferences
- def shap_interaction_values(
- self,
- X: Union[npt.NDArray[Any], pd.DataFrame, catboost.Pool],
- y: Union[npt.NDArray[Any], pd.Series, None] = None,
- **kwargs: Any,
- ) -> ArraysFloat:
+ def __call__(self, *args: Any, **kwargs: Any) -> Explanation:
+ """[see superclass]"""
+
+ # we override the BaseExplainer implementation because the shap.KernelExplainer
+ # implementation does not support __call__
+ shap_values = shap.KernelExplainer.shap_values(self, *args, **kwargs)
+
+ if isinstance(shap_values, list):
+ # combine the shap values into a single array, along an additional axis
+ shap_values = np.stack(shap_values, axis=-1)
+
+ return Explanation(shap_values)
+
+ @property
+ def supports_interaction(self) -> bool:
"""
- Not implemented.
+ :return: ``False`` because :class:`~shap.KernelExplainer` does not support
+ interaction values
"""
- raise NotImplementedError()
+ return False
@inheritdoc(match="""[see superclass]""")
-class KernelExplainerFactory(ExplainerFactory):
+class KernelExplainerFactory(FunctionExplainerFactory):
"""
- A factory constructing :class:`~shap.KernelExplainer` objects.
+ A factory constructing :class:`~shap.KernelExplainer` instances.
"""
def __init__(
@@ -586,6 +751,7 @@ def __init__(
link: Optional[str] = None,
l1_reg: Optional[str] = "num_features(10)",
data_size_limit: Optional[int] = 100,
+ **kwargs: Any,
) -> None:
"""
:param link: override the default link parameter (optional)
@@ -596,12 +762,14 @@ def __init__(
the background data set; larger data sets will be down-sampled using
kmeans; don't downsample if omitted (optional)
"""
- super().__init__()
+ super().__init__(**kwargs)
validate_type(link, expected_type=str, optional=True, name="arg link")
self.link = link
self.l1_reg = l1_reg if l1_reg is not None else "num_features(10)"
self.data_size_limit = data_size_limit
+ __init__.__doc__ = f"{__init__.__doc__}{FunctionExplainerFactory.__init__.__doc__}"
+
@property
def explains_raw_output(self) -> bool:
"""[see superclass]"""
@@ -612,44 +780,23 @@ def supports_shap_interaction_values(self) -> bool:
"""[see superclass]"""
return False
- @property
- def uses_background_dataset(self) -> bool:
- """[see superclass]"""
- return True
-
- def make_explainer(self, model: LearnerDF, data: pd.DataFrame) -> BaseExplainer:
+ def make_explainer_from_function(
+ self, model_fn: ModelFunction, data: Optional[pd.DataFrame]
+ ) -> BaseExplainer:
"""[see superclass]"""
self._validate_background_dataset(data=data)
-
- model_root_estimator: BaseEstimator = model.native_estimator
-
- try:
- if isinstance(model, RegressorDF):
- # noinspection PyUnresolvedReferences
- model_fn = model_root_estimator.predict
- elif isinstance(model, ClassifierDF):
- # noinspection PyUnresolvedReferences
- model_fn = model_root_estimator.predict_proba
- else:
- model_fn = None
- except AttributeError as cause:
- raise TypeError(
- f"arg model does not support default prediction method: {cause}"
- ) from cause
-
- if not model_fn:
- raise TypeError(
- "arg model is neither a regressor nor a classifier: "
- f"{type(model).__name__}"
- )
+ assert data is not None, "this explainer requires a background dataset"
data_size_limit = self.data_size_limit
if data_size_limit is not None and len(data) > data_size_limit:
data = shap.kmeans(data, data_size_limit, round_values=True)
explainer = _KernelExplainer(
- model=model_fn, data=data, **self._remove_null_kwargs(dict(link=self.link))
+ model=model_fn,
+ data=data,
+ **self._remove_null_kwargs(dict(link=self.link)),
+ **self.explainer_kwargs,
)
if self.l1_reg is not None:
@@ -669,4 +816,114 @@ def to_expression(self) -> Expression:
)
+#
+# Exact explainer factory
+#
+
+# noinspection PyPep8Naming
+@inheritdoc(match="""[see superclass]""")
+class _ExactExplainer(
+ shap.explainers.Exact, # type: ignore
+ BaseExplainer,
+):
+ @property
+ def supports_interaction(self) -> bool:
+ """
+ :return: ``True`` because :class:`~shap.explainers.Exact` supports interaction
+ values
+ """
+ return True
+
+
+@inheritdoc(match="""[see superclass]""")
+class ExactExplainerFactory(FunctionExplainerFactory):
+ """
+    A factory constructing :class:`~shap.explainers.Exact` explainer instances.
+ """
+
+ @property
+ def explains_raw_output(self) -> bool:
+ """[see superclass]"""
+ return True
+
+ @property
+ def supports_shap_interaction_values(self) -> bool:
+ """[see superclass]"""
+ return True
+
+ def make_explainer_from_function(
+ self, model_fn: ModelFunction, data: Optional[pd.DataFrame]
+ ) -> BaseExplainer:
+ """[see superclass]"""
+ self._validate_background_dataset(data=data)
+ # noinspection PyTypeChecker
+        return _ExactExplainer(model=model_fn, masker=data, **self.explainer_kwargs)
+
+ def to_expression(self) -> Expression:
+ """[see superclass]"""
+ return Id(type(self))()
+
+
+#
+# Permutation explainer factory
+#
+
+
+@inheritdoc(match="""[see superclass]""")
+# noinspection PyPep8Naming
+class _PermutationExplainer(
+ shap.explainers.Permutation, # type: ignore
+ BaseExplainer,
+):
+ @property
+ def supports_interaction(self) -> bool:
+ """
+ :return: ``False`` because :class:`~shap.explainers.Permutation` does not
+ support interaction values
+ """
+ return False
+
+ # noinspection PyPep8Naming
+ def shap_values(self, X: XType, y: YType = None, **kwargs: Any) -> ArraysFloat:
+        # skip the call to super().shap_values(), which would raise
+ # an AttributeError exception due to a bug in the shap library
+ return BaseExplainer.shap_values(self, X, y, **kwargs)
+
+
+@inheritdoc(match="""[see superclass]""")
+class PermutationExplainerFactory(FunctionExplainerFactory):
+ """
+    A factory constructing :class:`~shap.explainers.Permutation` explainer instances.
+ """
+
+ @property
+ def explains_raw_output(self) -> bool:
+ """[see superclass]"""
+ return True
+
+ @property
+ def supports_shap_interaction_values(self) -> bool:
+ """[see superclass]"""
+ return False
+
+ def make_explainer_from_function(
+ self,
+ model_fn: ModelFunction,
+ data: Optional[pd.DataFrame],
+ ) -> BaseExplainer:
+ """[see superclass]"""
+ self._validate_background_dataset(data=data)
+ # noinspection PyTypeChecker
+ return _PermutationExplainer(
+ model=model_fn, masker=data, **self.explainer_kwargs
+ )
+
+ def to_expression(self) -> Expression:
+ """[see superclass]"""
+ return Id(type(self))()
+
+
__tracker.validate()
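
Taken together, the explainers created by these factories accept both the old and the new API. A minimal usage sketch (the regressor and background data are hypothetical stand-ins; the factory, ``make_explainer``, ``shap_values``, and the callable form are taken from the code above):

.. code-block:: python

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor

    from facet.inspection import TreeExplainerFactory

    X, y = load_diabetes(return_X_y=True, as_frame=True)
    model = RandomForestRegressor(n_estimators=10, random_state=42).fit(X, y)

    explainer = TreeExplainerFactory().make_explainer(model=model, data=X)

    # old-style API: a numpy array, or a list of arrays for multi-output models
    shap_values = explainer.shap_values(X)

    # new-style API: the explainer is callable and returns an Explanation object
    explanation = explainer(X)
    shap_values_new = explanation.values
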
diff --git a/src/facet/inspection/_function_inspector.py b/src/facet/inspection/_function_inspector.py
new file mode 100644
index 00000000..53c51297
--- /dev/null
+++ b/src/facet/inspection/_function_inspector.py
@@ -0,0 +1,150 @@
+"""
+Implementation of :class:`.FunctionInspector`.
+"""
+import logging
+import re
+from typing import Any, Generic, List, Optional, Sequence, TypeVar, Union
+
+from pytools.api import AllTracker, inheritdoc, subsdoc, to_list
+
+from ._explainer import ExactExplainerFactory, FunctionExplainerFactory
+from ._model_inspector import ModelInspector
+from ._types import ModelFunction
+from .shap import FunctionShapCalculator, ShapCalculator
+
+log = logging.getLogger(__name__)
+
+__all__ = [
+ "FunctionInspector",
+]
+
+
+#
+# Type variables
+#
+
+T_Function = TypeVar("T_Function", bound=ModelFunction)
+
+
+#
+# Ensure all symbols introduced below are included in __all__
+#
+
+__tracker = AllTracker(globals())
+
+
+#
+# Class definitions
+#
+
+
+@subsdoc(
+ pattern=(
+ r"(?m)" # match multiline
+ r"(^\s+)\.\. note::\s*" # .. note:: at start of line
+ r"(?:\1.*\n)+" # followed by one or more indented lines
+ r"(?:\1?\n)*" # followed by zero or more blank lines
+ ),
+ replacement="",
+)
+@inheritdoc(match="""[see superclass]""")
+class FunctionInspector(ModelInspector[T_Function], Generic[T_Function]):
+ """[see superclass]"""
+
+ #: The default explainer factory used by this inspector.
+ DEFAULT_EXPLAINER_FACTORY: FunctionExplainerFactory = ExactExplainerFactory()
+
+ #: The factory used to create the explainer for the model function.
+ explainer_factory: FunctionExplainerFactory
+
+ # the feature names of the model function
+ _feature_names: List[str]
+
+ def __init__(
+ self,
+ model: T_Function,
+ *,
+ feature_names: Sequence[str],
+ explainer_factory: Optional[FunctionExplainerFactory] = None,
+ shap_interaction: bool = True,
+ n_jobs: Optional[int] = None,
+ shared_memory: Optional[bool] = None,
+ pre_dispatch: Optional[Union[str, int]] = None,
+ verbose: Optional[int] = None,
+ ) -> None:
+ """
+ :param model: the model function to inspect, which takes a 2D array of
+ feature values as input and returns a 1D array of output values
+ :param feature_names: the names of the inputs to the model function
+        :param explainer_factory: optional factory that creates a shap Explainer
+            (default: an :class:`.ExactExplainerFactory` instance; see
+            :attr:`.DEFAULT_EXPLAINER_FACTORY`)
+ """
+
+ if explainer_factory:
+ if not explainer_factory.explains_raw_output:
+ raise ValueError(
+ "arg explainer_factory must be configured to explain raw output"
+ )
+ else:
+ explainer_factory = self.DEFAULT_EXPLAINER_FACTORY
+ assert explainer_factory.explains_raw_output
+
+ if shap_interaction:
+ if not explainer_factory.supports_shap_interaction_values:
+ log.warning(
+ "ignoring arg shap_interaction=True: "
+ f"explainers made by {explainer_factory!r} do not support "
+ "SHAP interaction values"
+ )
+ shap_interaction = False
+
+ super().__init__(
+ model=model,
+ shap_interaction=shap_interaction,
+ n_jobs=n_jobs,
+ shared_memory=shared_memory,
+ pre_dispatch=pre_dispatch,
+ verbose=verbose,
+ )
+
+ self.model = model
+ self._feature_names = to_list(
+ feature_names, element_type=str, arg_name="feature_names"
+ )
+ self.explainer_factory = explainer_factory
+ self._shap_calculator: Optional[ShapCalculator[Any]] = None
+
+ __init__.__doc__ = str(__init__.__doc__) + re.sub(
+ r"(?m)^\s*:param model:\s+.*$", "", str(ModelInspector.__init__.__doc__)
+ )
+
+ @property
+ def feature_names(self) -> List[str]:
+ """[see superclass]"""
+ return self._feature_names
+
+ @property
+ def shap_calculator(self) -> ShapCalculator[Any]:
+ """[see superclass]"""
+
+ if self._shap_calculator is not None:
+ return self._shap_calculator
+
+ model = self.model
+
+ shap_calculator = FunctionShapCalculator(
+ model=model,
+ explainer_factory=self.explainer_factory,
+ interaction_values=self.shap_interaction,
+ n_jobs=self.n_jobs,
+ shared_memory=self.shared_memory,
+ pre_dispatch=self.pre_dispatch,
+ verbose=self.verbose,
+ )
+
+ self._shap_calculator = shap_calculator
+ return shap_calculator
+
+
+__tracker.validate()
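
A minimal construction sketch for the new inspector (the model function and feature names are hypothetical; fitting and inspection then follow the shared ``ModelInspector`` interface):

.. code-block:: python

    import numpy as np

    from facet.inspection import ExactExplainerFactory, FunctionInspector

    # a toy model function: maps a 2D feature array to a 1D array of outputs
    def model_fn(X: np.ndarray) -> np.ndarray:
        return X[:, 0] * X[:, 1] + X[:, 2]

    inspector = FunctionInspector(
        model_fn,
        feature_names=["a", "b", "c"],
        # the default factory, passed explicitly here for clarity
        explainer_factory=ExactExplainerFactory(),
        n_jobs=-3,
    )
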
diff --git a/src/facet/inspection/_inspection.py b/src/facet/inspection/_inspection.py
index 38804dc1..6b8bea5d 100644
--- a/src/facet/inspection/_inspection.py
+++ b/src/facet/inspection/_inspection.py
@@ -7,6 +7,7 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
+from typing_extensions import TypeAlias
from pytools.api import AllTracker
@@ -23,7 +24,7 @@
# Type aliases
#
-FloatArray = npt.NDArray[np.float_]
+FloatArray: TypeAlias = npt.NDArray[np.float_]
#
diff --git a/src/facet/inspection/_learner_inspector.py b/src/facet/inspection/_learner_inspector.py
index 2f6d1a23..1790a457 100644
--- a/src/facet/inspection/_learner_inspector.py
+++ b/src/facet/inspection/_learner_inspector.py
@@ -1,87 +1,37 @@
"""
-Core implementation of :mod:`facet.inspection`
+Implementation of :class:`.LearnerInspector`.
"""
import logging
-from abc import ABCMeta
-from types import MethodType
-from typing import (
- Any,
- Callable,
- Generic,
- Iterable,
- List,
- Optional,
- Sequence,
- Tuple,
- Type,
- TypeVar,
- Union,
- cast,
-)
+import re
+from typing import Any, Dict, Generic, List, Optional, TypeVar, Union, cast
-import numpy as np
-import numpy.typing as npt
import pandas as pd
-from scipy.cluster import hierarchy
-from scipy.spatial import distance
-from sklearn.base import is_classifier
+from sklearn.base import is_classifier, is_regressor
-from pytools.api import AllTracker, inheritdoc
-from pytools.data import LinkageTree, Matrix
-from pytools.fit import FittableMixin, fitted_only
-from pytools.parallelization import ParallelizableMixin
-from sklearndf import ClassifierDF, LearnerDF, RegressorDF
-from sklearndf.pipeline import LearnerPipelineDF
+from pytools.api import AllTracker, inheritdoc, subsdoc
+from sklearndf import SupervisedLearnerDF
+from sklearndf.pipeline import SupervisedLearnerPipelineDF
-from ..data import Sample
from ._explainer import ExplainerFactory, TreeExplainerFactory
-from ._inspection import ShapPlotData
-from ._shap import (
- ClassifierShapInteractionValuesCalculator,
- ClassifierShapValuesCalculator,
- RegressorShapInteractionValuesCalculator,
- RegressorShapValuesCalculator,
- ShapCalculator,
- ShapInteractionValuesCalculator,
-)
-from ._shap_global_explanation import (
- ShapGlobalExplainer,
- ShapInteractionGlobalExplainer,
+from ._model_inspector import ModelInspector
+from .shap.sklearn import (
+ ClassifierShapCalculator,
+ LearnerShapCalculator,
+ RegressorShapCalculator,
)
-from ._shap_projection import ShapInteractionVectorProjector, ShapVectorProjector
log = logging.getLogger(__name__)
__all__ = [
"LearnerInspector",
- "ModelInspector",
]
-#
-# Type aliases
-#
-
-FloatArray = npt.NDArray[np.float_]
-FloatMatrix = Matrix[np.float_]
-
-
#
# Type variables
#
-T_LearnerInspector = TypeVar("T_LearnerInspector", bound="LearnerInspector[Any]")
-T_LearnerPipelineDF = TypeVar("T_LearnerPipelineDF", bound=LearnerPipelineDF[Any])
-T_SeriesOrDataFrame = TypeVar("T_SeriesOrDataFrame", pd.Series, pd.DataFrame)
-T_Number = TypeVar("T_Number", bound="np.number[Any]")
-
-
-#
-# Constants
-#
-
-ASSERTION__INSPECTOR_IS_FITTED = "inspector is fitted"
-ASSERTION__SHAP_INTERACTION_SUPPORTED = "SHAP interaction values are supported"
+T_SupervisedLearnerDF = TypeVar("T_SupervisedLearnerDF", bound=SupervisedLearnerDF)
#
@@ -96,64 +46,20 @@
#
-class ModelInspector(FittableMixin[Sample], metaclass=ABCMeta):
- """
- Base class of `inspectors` to explain different kinds of `models` based on SHAP
- values.
- """
-
- pass
-
-
-@inheritdoc(match="[see superclass]")
+@subsdoc(
+ pattern=(
+ r"(?m)" # match multiline
+ r"(^\s+)\.\. note::\s*" # .. note:: at start of line
+ r"(?:\1.*\n)+" # followed by one or more indented lines
+ r"(?:\1?\n)*" # followed by zero or more blank lines
+ ),
+ replacement="",
+)
+@inheritdoc(match="""[see superclass]""")
class LearnerInspector(
- ModelInspector, ParallelizableMixin, Generic[T_LearnerPipelineDF]
+ ModelInspector[T_SupervisedLearnerDF], Generic[T_SupervisedLearnerDF]
):
- """
- Explain regressors and classifiers based on SHAP values.
-
- Focus is on explaining the overall model, but the inspector also delivers
- SHAP explanations of the individual observations.
-
- Available inspection methods are:
-
- - SHAP values
- - SHAP interaction values
- - feature importance derived from SHAP values (either as mean absolute values
- or as the root of mean squares)
- - pairwise feature redundancy matrix (requires availability of SHAP interaction
- values)
- - pairwise feature synergy matrix (requires availability of SHAP interaction
- values)
- - pairwise feature association matrix (upper bound for redundancy but can be
- inflated by synergy; available if SHAP interaction values are unknown)
- - pairwise feature interaction matrix (direct feature interaction quantified by
- SHAP interaction values)
- - feature redundancy linkage (to visualize clusters of redundant features in a
- dendrogram; requires availability of SHAP interaction values)
- - feature synergy linkage (to visualize clusters of synergistic features in a
- dendrogram; requires availability of SHAP interaction values)
- - feature association linkage (to visualize clusters of associated features in a
- dendrogram)
-
- All inspections that aggregate across observations will respect sample weights, if
- specified in the underlying training sample.
- """
-
- # defined in superclass, repeated here for Sphinx
- n_jobs: Optional[int]
-
- # defined in superclass, repeated here for Sphinx
- shared_memory: Optional[bool]
-
- # defined in superclass, repeated here for Sphinx
- pre_dispatch: Optional[Union[str, int]]
-
- # defined in superclass, repeated here for Sphinx
- verbose: Optional[int]
-
- #: Name for feature importance series or column.
- COL_IMPORTANCE = "importance"
+ """[see superclass]"""
#: The default explainer factory used by this inspector.
#: This is a tree explainer using the tree_path_dependent method for
@@ -164,9 +70,9 @@ class LearnerInspector(
def __init__(
self,
+ model: SupervisedLearnerPipelineDF[T_SupervisedLearnerDF],
*,
- pipeline: T_LearnerPipelineDF,
- explainer_factory: Optional[ExplainerFactory] = None,
+ explainer_factory: Optional[ExplainerFactory[T_SupervisedLearnerDF]] = None,
shap_interaction: bool = True,
n_jobs: Optional[int] = None,
shared_memory: Optional[bool] = None,
@@ -174,29 +80,31 @@ def __init__(
verbose: Optional[int] = None,
) -> None:
"""
- :param pipeline: the learner pipeline to inspect
+ :param model: the learner pipeline to inspect
:param explainer_factory: optional function that creates a shap Explainer
(default: ``TreeExplainerFactory``)
- :param shap_interaction: if ``True``, calculate SHAP interaction values, else
- only calculate SHAP contribution values.
- SHAP interaction values are needed to determine feature synergy and
- redundancy.
- (default: ``True``)
"""
- super().__init__(
- n_jobs=n_jobs,
- shared_memory=shared_memory,
- pre_dispatch=pre_dispatch,
- verbose=verbose,
- )
- if not pipeline.is_fitted:
+ if not model.is_fitted:
-            raise ValueError("arg pipeline must be fitted")
+            raise ValueError("arg model must be fitted")
- if not isinstance(pipeline.final_estimator, (ClassifierDF, RegressorDF)):
+ final_estimator: T_SupervisedLearnerDF = model.final_estimator
+ if is_classifier(final_estimator):
+ try:
+ n_outputs = final_estimator.n_outputs_
+ except AttributeError:
+ pass
+ else:
+ if n_outputs > 1:
+ raise ValueError(
+ "only single-target classifiers (binary or multi-class) are "
+ "supported, but the given classifier has been fitted on "
+ f"multiple targets: {', '.join(final_estimator.output_names_)}"
+ )
+ elif not is_regressor(final_estimator):
raise TypeError(
"learner in arg pipeline must be a classifier or a regressor,"
- f"but is a {type(pipeline.final_estimator).__name__}"
+ f"but is a {type(final_estimator).__name__}"
)
if explainer_factory:
@@ -217,889 +125,65 @@ def __init__(
)
shap_interaction = False
- self.pipeline = pipeline
- self.explainer_factory = explainer_factory
- self.shap_interaction = shap_interaction
+ super().__init__(
+ model=model,
+ shap_interaction=shap_interaction,
+ n_jobs=n_jobs,
+ shared_memory=shared_memory,
+ pre_dispatch=pre_dispatch,
+ verbose=verbose,
+ )
- self._shap_calculator: Optional[ShapCalculator[T_LearnerPipelineDF]] = None
- self._shap_global_decomposer: Optional[ShapGlobalExplainer] = None
- self._shap_global_projector: Optional[ShapGlobalExplainer] = None
- self._sample: Optional[Sample] = None
+ self.explainer_factory = explainer_factory
+ self._shap_calculator: Optional[LearnerShapCalculator[Any]] = None
- __init__.__doc__ = cast(str, __init__.__doc__) + cast(
- str, ParallelizableMixin.__init__.__doc__
+ __init__.__doc__ = str(__init__.__doc__) + re.sub(
+ r"(?m)^\s*:param model:\s+.*$", "", str(ModelInspector.__init__.__doc__)
)
- def fit( # type: ignore[override]
- # todo: remove 'type: ignore' once mypy correctly infers return type
- self: T_LearnerInspector,
- sample: Sample,
- **fit_params: Any,
- ) -> T_LearnerInspector:
- """
- Fit the inspector with the given sample, creating global explanations including
- feature redundancy and synergy.
-
- This will calculate SHAP values and, if enabled in the underlying SHAP
- explainer, also SHAP interaction values.
-
- :param sample: the background sample to be used for the global explanation
- of this model
- :param fit_params: additional keyword arguments (ignored; accepted for
- compatibility with :class:`.FittableMixin`)
- :return: ``self``
- """
-
- learner: LearnerDF = self.pipeline.final_estimator
-
- _is_classifier = is_classifier(learner)
- if _is_classifier and isinstance(sample.target_name, list):
- raise ValueError(
- "only single-output classifiers (binary or multi-class) are supported, "
- "but the given classifier has been fitted on multiple columns "
- f"{sample.target_name}"
- )
-
- shap_global_projector: Union[
- ShapVectorProjector, ShapInteractionVectorProjector, None
- ]
-
- shap_calculator_type: Type[ShapCalculator[T_LearnerPipelineDF]]
- shap_calculator: ShapCalculator[T_LearnerPipelineDF]
-
- if self.shap_interaction:
- if _is_classifier:
- shap_calculator_type = ClassifierShapInteractionValuesCalculator
- else:
- shap_calculator_type = RegressorShapInteractionValuesCalculator
-
- shap_calculator = shap_calculator_type(
- pipeline=self.pipeline,
- explainer_factory=self.explainer_factory,
- n_jobs=self.n_jobs,
- shared_memory=self.shared_memory,
- pre_dispatch=self.pre_dispatch,
- verbose=self.verbose,
- )
-
- shap_global_projector = ShapInteractionVectorProjector()
-
- else:
- if _is_classifier:
- shap_calculator_type = ClassifierShapValuesCalculator
- else:
- shap_calculator_type = RegressorShapValuesCalculator
-
- shap_calculator = shap_calculator_type(
- pipeline=self.pipeline,
- explainer_factory=self.explainer_factory,
- n_jobs=self.n_jobs,
- shared_memory=self.shared_memory,
- pre_dispatch=self.pre_dispatch,
- verbose=self.verbose,
- )
-
- shap_global_projector = ShapVectorProjector()
-
- shap_calculator.fit(sample)
- shap_global_projector.fit(shap_calculator=shap_calculator)
-
- self._sample = sample
- self._shap_calculator = shap_calculator
- self._shap_global_projector = shap_global_projector
-
- return self
-
@property
- def _shap_global_explainer(self) -> ShapGlobalExplainer:
- assert self._shap_global_projector is not None, ASSERTION__INSPECTOR_IS_FITTED
- return self._shap_global_projector
-
- @property
- def is_fitted(self) -> bool:
+ def feature_names(self) -> List[str]:
"""[see superclass]"""
- return self._sample is not None
-
- @property
- @fitted_only
- def sample_(self) -> Sample:
- """
- The background sample used to fit this inspector.
- """
-
- assert self._sample is not None, ASSERTION__INSPECTOR_IS_FITTED
- return self._sample
-
- @property
- @fitted_only
- def output_names_(self) -> Sequence[str]:
- """
- The names of the outputs explained by this inspector.
-
- For regressors, these are the names of the target columns.
-
- For binary classifiers, this is a list of length 1 with the name of a single
- class, since the SHAP values of the second class can be trivially derived as
- the negation of the SHAP values of the first class.
-
- For non-binary classifiers, this is the list of all classes.
- """
-
- assert (
- self._shap_calculator is not None
- and self._shap_calculator.output_names_ is not None
- ), ASSERTION__INSPECTOR_IS_FITTED
- return self._shap_calculator.output_names_
-
- @property
- @fitted_only
- def features_(self) -> List[str]:
- """
- The names of the features used to fit the learner pipeline explained by this
- inspector.
- """
- return cast(List[str], self.pipeline.feature_names_out_.to_list())
-
- @fitted_only
- def shap_values(self) -> Union[pd.DataFrame, List[pd.DataFrame]]:
- """
- Calculate the SHAP values for all observations and features.
-
- Returns a data frame of SHAP values where each row corresponds to an
- observation, and each column corresponds to a feature.
-
- :return: a data frame with SHAP values
- """
-
- assert self._shap_calculator is not None, ASSERTION__INSPECTOR_IS_FITTED
- return self.__split_multi_output_df(self._shap_calculator.get_shap_values())
-
- @fitted_only
- def shap_interaction_values(self) -> Union[pd.DataFrame, List[pd.DataFrame]]:
- """
- Calculate the SHAP interaction values for all observations and pairs of
- features.
-
- Returns a data frame of SHAP interaction values where each row corresponds to an
- observation and a feature (identified by a hierarchical index with two levels),
- and each column corresponds to a feature.
-
- :return: a data frame with SHAP interaction values
- """
- return self.__split_multi_output_df(
- self.__shap_interaction_values_calculator.get_shap_interaction_values()
- )
-
- @fitted_only
- def feature_importance(
- self, *, method: str = "rms"
- ) -> Union[pd.Series, pd.DataFrame]:
- """
- Calculate the relative importance of each feature based on SHAP values.
-
- The importance values of all features always add up to `1.0`.
-
- The calculation applies sample weights if specified in the underlying
- :class:`.Sample`.
-
- :param method: method for calculating feature importance. Supported methods
- are ``rms`` (root of mean squares, default), ``mav`` (mean absolute
- values)
- :return: a series of length `n_features` for single-output models, or a
- data frame of shape (n_features, n_outputs) for multi-output models
- """
-
- methods = {"rms", "mav"}
- if method not in methods:
- raise ValueError(f'arg method="{method}" must be one of {methods}')
-
- assert self._shap_calculator is not None
- shap_matrix: pd.DataFrame = self._shap_calculator.get_shap_values()
- weight: Optional[pd.Series] = self.sample_.weight
-
- abs_importance: pd.Series
- if method == "rms":
- if weight is None:
- abs_importance = shap_matrix.pow(2).mean().pow(0.5)
- else:
- abs_importance = shap_matrix.pow(2).mul(weight, axis=0).mean().pow(0.5)
- else:
- assert method == "mav", f"method is in {methods}"
- if weight is None:
- abs_importance = shap_matrix.abs().mean()
- else:
- abs_importance = shap_matrix.abs().mul(weight, axis=0).mean()
-
- def _normalize_importance(
- _importance: T_SeriesOrDataFrame,
- ) -> T_SeriesOrDataFrame:
- return _importance.divide(_importance.sum())
-
- if len(self.output_names_) == 1:
- return _normalize_importance(abs_importance).rename(self.output_names_[0])
-
- else:
- assert (
- abs_importance.index.nlevels == 2
- ), "2 index levels in place for multi-output models"
-
- return _normalize_importance(abs_importance.unstack(level=0))
-
- @fitted_only
- def feature_synergy_matrix(
- self,
- *,
- absolute: bool = False,
- symmetrical: bool = False,
- clustered: bool = True,
- ) -> Union[FloatMatrix, List[FloatMatrix]]:
- """
- Calculate the feature synergy matrix.
-
- This yields an asymmetric matrix where each row and column represents one
- feature, and the values at the intersections are the pairwise feature synergies,
- ranging from `0.0` (no synergy - both features contribute to predictions fully
- autonomously of each other) to `1.0` (full synergy, both features rely on
- combining all of their information to achieve any contribution to predictions).
-
- The synergy of a feature with itself is defined as `1.0`.
-
- Feature synergy calculations require SHAP interaction values; if only SHAP
- values are available consider calculating feature associations instead
- (see :meth:`.feature_association_matrix`).
-
- In the case of multi-target regression and non-binary classification, returns
- a list of data frames with one matrix per output.
-
- :param absolute: if ``False``, return relative synergy as a percentage of
- total feature importance;
- if ``True``, return absolute synergy as a portion of feature importance
- :param symmetrical: if ``True``, return a symmetrical matrix quantifying
- mutual synergy; if ``False``, return an asymmetrical matrix quantifying
- unilateral synergy of the features represented by rows with the
- features represented by columns (default: ``False``)
- :param clustered: if ``True``, reorder the rows and columns of the matrix
- such that synergy between adjacent rows and columns is maximised; if
- ``False``, keep rows and columns in the original features order
- (default: ``True``)
- :return: feature synergy matrix as a data frame of shape
- `(n_features, n_features)`, or a list of data frames for multiple outputs
- """
-
- return self.__feature_affinity_matrix(
- explainer_fn=self.__interaction_explainer.synergy,
- absolute=absolute,
- symmetrical=symmetrical,
- clustered=clustered,
- )
-
- @fitted_only
- def feature_redundancy_matrix(
- self,
- *,
- absolute: bool = False,
- symmetrical: bool = False,
- clustered: bool = True,
- ) -> Union[FloatMatrix, List[FloatMatrix]]:
- """
- Calculate the feature redundancy matrix.
-
- This yields an asymmetric matrix where each row and column represents one
- feature, and the values at the intersections are the pairwise feature
- redundancies, ranging from `0.0` (no redundancy - both features contribute to
- predictions fully independently of each other) to `1.0` (full redundancy, either
- feature can replace the other feature without loss of predictive power).
-
- The redundancy of a feature with itself is defined as `1.0`.
-
- Feature redundancy calculations require SHAP interaction values; if only SHAP
- values are available consider calculating feature associations instead
- (see :meth:`.feature_association_matrix`).
-
- In the case of multi-target regression and non-binary classification, returns
- a list of data frames with one matrix per output.
-
- :param absolute: if ``False``, return relative redundancy as a percentage of
- total feature importance;
- if ``True``, return absolute redundancy as a portion of feature importance
- :param symmetrical: if ``True``, return a symmetrical matrix quantifying
- mutual redundancy; if ``False``, return an asymmetrical matrix quantifying
- unilateral redundancy of the features represented by rows with the
- features represented by columns (default: ``False``)
- :param clustered: if ``True``, reorder the rows and columns of the matrix
- such that redundancy between adjacent rows and columns is maximised; if
- ``False``, keep rows and columns in the original features order
- (default: ``True``)
- :return: feature redundancy matrix as a data frame of shape
- `(n_features, n_features)`, or a list of data frames for multiple outputs
- """
-
- return self.__feature_affinity_matrix(
- explainer_fn=self.__interaction_explainer.redundancy,
- absolute=absolute,
- symmetrical=symmetrical,
- clustered=clustered,
- )
-
- @fitted_only
- def feature_association_matrix(
- self,
- *,
- absolute: bool = False,
- symmetrical: bool = False,
- clustered: bool = True,
- ) -> Union[FloatMatrix, List[FloatMatrix]]:
- """
- Calculate the feature association matrix.
-
- This yields an asymmetric matrix where each row and column represents one
- feature, and the values at the intersections are the pairwise feature
- associations, ranging from `0.0` (no association) to `1.0` (full association).
-
- The association of a feature with itself is defined as `1.0`.
-
- Feature association provides an upper bound for feature redundancy but might be
- inflated by feature synergy.
-
- While it is preferable to assess redundancy and synergy separately, association
- can be calculated using only SHAP values, and thus can be used as a fallback
- if no SHAP interaction values are available.
-
- In the case of multi-target regression and non-binary classification, returns
- a list of data frames with one matrix per output.
-
- :param absolute: if ``False``, return relative association as a percentage of
- total feature importance;
- if ``True``, return absolute association as a portion of feature importance
- :param symmetrical: if ``False``, return an asymmetrical matrix
- quantifying unilateral association of the features represented by rows
- with the features represented by columns;
- if ``True``, return a symmetrical matrix quantifying mutual association
- (default: ``False``)
- :param clustered: if ``True``, reorder the rows and columns of the matrix
- such that association between adjacent rows and columns is maximised; if
- ``False``, keep rows and columns in the original features order
- (default: ``True``)
- :return: feature association matrix as a data frame of shape
- `(n_features, n_features)`, or a list of data frames for multiple outputs
- """
-
- return self.__feature_affinity_matrix(
- explainer_fn=self._shap_global_explainer.association,
- absolute=absolute,
- symmetrical=symmetrical,
- clustered=clustered,
- )
-
- @fitted_only
- def feature_synergy_linkage(self) -> Union[LinkageTree, List[LinkageTree]]:
- """
- Calculate a linkage tree based on the :meth:`.feature_synergy_matrix`.
-
- The linkage tree can be used to render a dendrogram indicating clusters of
- synergistic features.
-
- In the case of multi-target regression and non-binary classification, returns
- a list of linkage trees per target or class.
-
- :return: linkage tree of feature synergies; list of linkage trees
- for multi-target regressors or non-binary classifiers
- """
-
- feature_affinity_matrix = self.__interaction_explainer.synergy(
- symmetrical=True, absolute=False
- )
- assert (
- feature_affinity_matrix is not None
- ), ASSERTION__SHAP_INTERACTION_SUPPORTED
-
- return self.__linkages_from_affinity_matrices(
- feature_affinity_matrix=feature_affinity_matrix
- )
-
- @fitted_only
- def feature_redundancy_linkage(self) -> Union[LinkageTree, List[LinkageTree]]:
- """
- Calculate a linkage tree based on the :meth:`.feature_redundancy_matrix`.
-
- The linkage tree can be used to render a dendrogram indicating clusters of
- redundant features.
-
- In the case of multi-target regression and non-binary classification, returns
- a list of linkage trees per target or class.
-
- :return: linkage tree of feature redundancies; list of linkage trees
- for multi-target regressors or non-binary classifiers
- """
-
- feature_affinity_matrix = self.__interaction_explainer.redundancy(
- symmetrical=True, absolute=False
- )
- assert (
- feature_affinity_matrix is not None
- ), ASSERTION__SHAP_INTERACTION_SUPPORTED
-
- return self.__linkages_from_affinity_matrices(
- feature_affinity_matrix=feature_affinity_matrix
- )
-
- @fitted_only
- def feature_association_linkage(self) -> Union[LinkageTree, List[LinkageTree]]:
- """
- Calculate a linkage tree based on the :meth:`.feature_association_matrix`.
-
- The linkage tree can be used to render a dendrogram indicating clusters of
- associated features.
-
- In the case of multi-target regression and non-binary classification, returns
- a list of linkage trees per target or class.
-
- :return: linkage tree of feature associations; list of linkage trees
- for multi-target regressors or non-binary classifiers
- """
-
- feature_affinity_matrix = self._shap_global_explainer.association(
- absolute=False, symmetrical=True
- )
- assert (
- feature_affinity_matrix is not None
- ), ASSERTION__SHAP_INTERACTION_SUPPORTED
-
- return self.__linkages_from_affinity_matrices(
- feature_affinity_matrix=feature_affinity_matrix
- )
-
- @fitted_only
- def feature_interaction_matrix(self) -> Union[FloatMatrix, List[FloatMatrix]]:
- """
- Calculate relative shap interaction values for all feature pairings.
-
- Shap interactions quantify direct interactions between pairs of features.
- For a quantification of overall interaction (including indirect interactions
- across more than two features), see :meth:`.feature_synergy_matrix`.
-
- The relative values are normalised to add up to `1.0`, and each value ranges
- between `0.0` and `1.0`.
-
- For features :math:`f_i` and :math:`f_j`, relative feature interaction
- :math:`I` is calculated as
-
- .. math::
- I_{ij} = \\frac
- {\\sigma(\\vec{\\phi}_{ij})}
- {\\sum_{a=1}^n \\sum_{b=1}^n \\sigma(\\vec{\\phi}_{ab})}
-
- where :math:`\\sigma(\\vec v)` is the standard deviation of all elements of
- vector :math:`\\vec v`.
-
- The total average interaction of features
- :math:`f_i` and :math:`f_j` is
- :math:`I_{ij} \
- + I_{ji} \
- = 2 I_{ij}`.
-
- :math:`I_{ii}` is the residual, non-synergistic contribution
- of feature :math:`f_i`
-
- The matrix returned by this method is a lower-triangular matrix
-
- .. math::
-
- \\newcommand\\nan{\\mathit{nan}}
- I_{} = \\begin{pmatrix}
- I_{11} & \\nan & \\nan & \\dots & \\nan \\\\
- 2I_{21} & I_{22} & \\nan & \\dots & \\nan \\\\
- 2I_{31} & 2I_{32} & I_{33} & \\dots & \\nan \\\\
- \\vdots & \\vdots & \\vdots & \\ddots & \\vdots \\\\
- 2I_{n1} & 2I_{n2} & 2I_{n3} & \\dots & I_{nn} \\\\
- \\end{pmatrix}
-
- with :math:`\\sum_{a=1}^n \\sum_{b=a}^n I_{ab} = 1`
-
- In the case of multi-target regression and non-binary classification, returns
- a list with one matrix per output.
-
- :return: relative shap interaction values as a data frame of shape
- `(n_features, n_features)`; or a list of such data frames
- """
-
- n_features = len(self.features_)
- n_outputs = len(self.output_names_)
-
- # get a feature interaction array with shape
- # (n_observations, n_outputs, n_features, n_features)
- # where the innermost feature x feature arrays are symmetrical
- im_matrix_per_observation_and_output: FloatArray = (
- # TODO missing proper handling for list of data frames
- self.shap_interaction_values() # type: ignore
- .values.reshape((-1, n_features, n_outputs, n_features))
- .swapaxes(1, 2)
- )
-
- # get the observation weights with shape
- # (n_observations, n_outputs, n_features, n_features)
- weight: Optional[FloatArray]
- _weight_sr = self.sample_.weight
- if _weight_sr is not None:
- # if sample weights are defined, convert them to an array
- # and align the array with the dimensions of the feature interaction array
- weight = _weight_sr.values.reshape((-1, 1, 1, 1))
- else:
- weight = None
-
- # calculate the average interactions for each output and feature/feature
- # interaction, based on the standard deviation assuming a mean of 0.0.
- # The resulting matrix has shape (n_outputs, n_features, n_features)
- _interaction_squared = im_matrix_per_observation_and_output**2
- if weight is not None:
- _interaction_squared *= weight
- interaction_matrix = np.sqrt(_interaction_squared.mean(axis=0))
- assert interaction_matrix.shape == (n_outputs, n_features, n_features)
-
- # we normalise the synergy matrix for each output to a total of 1.0
- interaction_matrix /= interaction_matrix.sum()
-
- # the total interaction effect for features i and j is the total of matrix
- # cells (i,j) and (j,i); theoretically both should be the same but to minimize
- # numerical errors we total both in the lower matrix triangle (but excluding the
- # matrix diagonal, hence k=1)
- interaction_matrix += np.triu(interaction_matrix, k=1).swapaxes(1, 2)
-
- # discard the upper matrix triangle by setting it to nan
- interaction_matrix += np.triu(
- np.full(shape=(n_features, n_features), fill_value=np.nan), k=1
- )[np.newaxis, :, :]
-
- # create a data frame from the feature matrix
- return self.__arrays_to_matrix(
- interaction_matrix, value_label="relative shap interaction"
- )
-
- @fitted_only
- def shap_plot_data(self) -> ShapPlotData:
- """
- Consolidate SHAP values and corresponding feature values from this inspector
- for use in SHAP plots offered by the
- `shap `__ package.
-
- The `shap` package provides functions for creating various SHAP plots.
- Most of these functions require
-
- - one or more SHAP value matrices as a single `numpy` array,
- or a list of `numpy` arrays of shape `(n_observations, n_features)`
- - a feature matrix of shape `(n_observations, n_features)`, which can be
- provided as a data frame to preserve feature names
-
- This method provides this data inside a :class:`.ShapPlotData` object, plus
-
- - the names of all outputs (i.e., the target names in case of regression,
- or the class names in case of classification)
- - corresponding target values as a series, or as a data frame in the case of
- multiple targets
-
- This method also ensures that the rows of all arrays, frames, and series are
- aligned, even if only a subset of the observations in the original sample was
- used to calculate SHAP values.
-
- Calculates mean shap values for each observation and feature, across all
- splits for which SHAP values were calculated.
-
- :return: consolidated SHAP and feature values for use shap plots
- """
-
- shap_values: Union[pd.DataFrame, List[pd.DataFrame]] = self.shap_values()
-
- output_names: Sequence[str] = self.output_names_
- shap_values_numpy: Union[FloatArray, List[FloatArray]]
- included_observations: pd.Index
-
- if len(output_names) > 1:
- shap_values_numpy = [s.values for s in shap_values]
- included_observations = shap_values[0].index
- else:
- shap_values = cast(pd.DataFrame, shap_values)
- shap_values_numpy = shap_values.values
- included_observations = shap_values.index
-
- sample: Sample = self.sample_.subsample(loc=included_observations)
-
- return ShapPlotData(
- shap_values=shap_values_numpy,
- sample=sample,
+ return cast(
+ List[str],
+ self.model.final_estimator.feature_names_in_.to_list(),
)
- def __arrays_to_matrix(
- self, matrix: FloatArray, value_label: str
- ) -> Union[FloatMatrix, List[FloatMatrix]]:
- # transform a matrix of shape (n_outputs, n_features, n_features)
- # to a data frame
-
- feature_index = self.pipeline.feature_names_out_.rename(Sample.IDX_FEATURE)
-
- n_features = len(feature_index)
- assert matrix.shape == (len(self.output_names_), n_features, n_features)
-
- # convert array to data frame(s) with features as row and column indices
- if len(matrix) == 1:
- return self.__array_to_matrix(
- matrix[0],
- feature_importance=self.feature_importance(),
- value_label=value_label,
- )
- else:
- return [
- self.__array_to_matrix(
- m,
- feature_importance=feature_importance,
- value_label=f"{value_label} ({output_name})",
- )
- for m, (_, feature_importance), output_name in zip(
- matrix, self.feature_importance().items(), self.output_names_
- )
- ]
-
- def __feature_affinity_matrix(
- self,
- *,
- explainer_fn: Callable[..., FloatArray],
- absolute: bool,
- symmetrical: bool,
- clustered: bool,
- ) -> Union[FloatMatrix, List[FloatMatrix]]:
- affinity_matrices = explainer_fn(symmetrical=symmetrical, absolute=absolute)
+ def preprocess_features(
+ self, features: Union[pd.DataFrame, pd.Series]
+ ) -> pd.DataFrame:
+ """[see superclass]"""
+ return self.model.preprocess(features)
- explainer: ShapGlobalExplainer = cast(
- ShapGlobalExplainer, cast(MethodType, explainer_fn).__self__
- )
- affinity_matrices_df: List[pd.DataFrame] = explainer.to_frames(
- affinity_matrices
- )
+ @property
+ def shap_calculator(self) -> LearnerShapCalculator[Any]:
+ """[see superclass]"""
- if clustered:
- affinity_symmetrical = explainer_fn(symmetrical=True, absolute=False)
- assert (
- affinity_symmetrical is not None
- ), ASSERTION__SHAP_INTERACTION_SUPPORTED
+ if self._shap_calculator is not None:
+ return self._shap_calculator
- affinity_matrices_df = self.__sort_affinity_matrices(
- affinity_matrices=affinity_matrices_df,
- symmetrical_affinity_matrices=affinity_symmetrical,
- )
+ learner: SupervisedLearnerDF = self.model.final_estimator
- return self.__isolate_single_frame(
- affinity_matrices_df, affinity_metric=explainer_fn.__name__
+ shap_calculator_params: Dict[str, Any] = dict(
+ model=self.model.final_estimator.native_estimator,
+ interaction_values=self.shap_interaction,
+ explainer_factory=self.explainer_factory,
+ n_jobs=self.n_jobs,
+ shared_memory=self.shared_memory,
+ pre_dispatch=self.pre_dispatch,
+ verbose=self.verbose,
)
- @staticmethod
- def __sort_affinity_matrices(
- affinity_matrices: List[pd.DataFrame],
- symmetrical_affinity_matrices: FloatArray,
- ) -> List[pd.DataFrame]:
- # abbreviate a very long function name to stay within the permitted line length
- fn_linkage = LearnerInspector.__linkage_matrix_from_affinity_matrix_for_output
-
- return [
- (lambda feature_order: affinity_matrix.iloc[feature_order, feature_order])(
- feature_order=hierarchy.leaves_list(
- Z=fn_linkage(feature_affinity_matrix=symmetrical_affinity_matrix)
- )
- )
- for affinity_matrix, symmetrical_affinity_matrix in zip(
- affinity_matrices, symmetrical_affinity_matrices
- )
- ]
-
- @staticmethod
- def __split_multi_output_df(
- multi_output_df: pd.DataFrame,
- ) -> Union[pd.DataFrame, List[pd.DataFrame]]:
- # Split a multi-output data frame into a list of single-output data frames.
- # Return single-output data frames as is.
- # Multi-output data frames are grouped by level 0 in the column index.
- if multi_output_df.columns.nlevels == 1:
- return multi_output_df
- else:
- return [
- multi_output_df.xs(key=output_name, axis=1, level=0, drop_level=True)
- for output_name in (
- cast(pd.MultiIndex, multi_output_df.columns).levels[0]
- )
- ]
-
- def __linkages_from_affinity_matrices(
- self, feature_affinity_matrix: FloatArray
- ) -> Union[LinkageTree, List[LinkageTree]]:
- # calculate the linkage trees for all outputs in a feature distance matrix;
- # matrix has shape (n_outputs, n_features, n_features) with values ranging from
- # (1 = closest, 0 = most distant)
- # return a linkage tree if there is only one output, else return a list of
- # linkage trees
-
- feature_importance = self.feature_importance(method="rms")
-
- if len(feature_affinity_matrix) == 1:
- # we have only a single output
- # feature importance is already a series
- return self.__linkage_tree_from_affinity_matrix_for_output(
- feature_affinity_matrix[0], feature_importance
- )
-
+ shap_calculator: LearnerShapCalculator[Any]
+ if is_classifier(learner):
+ shap_calculator = ClassifierShapCalculator(**shap_calculator_params)
else:
- feature_importance_iter: (
- Iterable[Tuple[Any, pd.Series]]
- ) = feature_importance.items()
-
- return [
- self.__linkage_tree_from_affinity_matrix_for_output(
- feature_affinity_for_output,
- feature_importance_for_output,
- )
- for feature_affinity_for_output, (
- _,
- feature_importance_for_output,
- ) in zip(feature_affinity_matrix, feature_importance_iter)
- ]
-
- @staticmethod
- def __linkage_tree_from_affinity_matrix_for_output(
- feature_affinity_matrix: FloatArray, feature_importance: pd.Series
- ) -> LinkageTree:
- # calculate the linkage tree from the a given output in a feature distance
- # matrix;
- # matrix has shape (n_features, n_features) with values ranging from
- # (1 = closest, 0 = most distant)
-
- linkage_matrix: FloatArray = (
- LearnerInspector.__linkage_matrix_from_affinity_matrix_for_output(
- feature_affinity_matrix
- )
- )
-
- # Feature labels and weights will be used as the leaves of the linkage tree.
- # Select only the features that appear in the distance matrix, and in the
- # correct order
-
- # build and return the linkage tree
- return LinkageTree(
- scipy_linkage_matrix=linkage_matrix,
- leaf_names=feature_importance.index,
- leaf_weights=feature_importance.values,
- max_distance=1.0,
- distance_label="feature distance",
- leaf_label="feature",
- weight_label="feature importance",
- )
-
- @staticmethod
- def __linkage_matrix_from_affinity_matrix_for_output(
- feature_affinity_matrix: FloatArray,
- ) -> FloatArray:
- # calculate the linkage matrix from the a given output in a feature distance
- # matrix;
- # matrix has shape (n_features, n_features) with values ranging from
- # (1 = closest, 0 = most distant)
-
- # compress the distance matrix (required by SciPy)
- distance_matrix = 1.0 - abs(feature_affinity_matrix)
- np.fill_diagonal(distance_matrix, 0.0)
- compressed_distance_matrix: FloatArray = distance.squareform(distance_matrix)
-
- # calculate the linkage matrix
- leaf_ordering: FloatArray = hierarchy.optimal_leaf_ordering(
- Z=hierarchy.linkage(y=compressed_distance_matrix, method="single"),
- y=compressed_distance_matrix,
- )
-
- # reverse the leaf ordering, so that larger values tend to end up on top
- leaf_ordering[:, [1, 0]] = leaf_ordering[:, [0, 1]]
-
- return leaf_ordering
-
- def _ensure_shap_interaction(self) -> None:
- if not self.shap_interaction:
- raise RuntimeError(
- "SHAP interaction values have not been calculated. "
- "Create an inspector with parameter 'shap_interaction=True' to "
- "enable calculations involving SHAP interaction values."
- )
-
- def __isolate_single_frame(
- self,
- frames: List[pd.DataFrame],
- affinity_metric: str,
- ) -> Union[FloatMatrix, List[FloatMatrix]]:
- feature_importance = self.feature_importance()
-
- if len(frames) == 1:
- assert isinstance(feature_importance, pd.Series)
- return self.__frame_to_matrix(
- frames[0],
- affinity_metric=affinity_metric,
- feature_importance=feature_importance,
+ shap_calculator = RegressorShapCalculator(
+ **shap_calculator_params, output_names=learner.output_names_
)
- else:
- return [
- self.__frame_to_matrix(
- frame,
- affinity_metric=affinity_metric,
- feature_importance=frame_importance,
- feature_importance_category=str(frame_name),
- )
- for frame, (frame_name, frame_importance) in zip(
- frames, feature_importance.items()
- )
- ]
-
- @staticmethod
- def __array_to_matrix(
- a: npt.NDArray[T_Number],
- *,
- feature_importance: pd.Series,
- value_label: str,
- ) -> Matrix[T_Number]:
- return Matrix(
- a,
- names=(feature_importance.index, feature_importance.index),
- weights=(feature_importance, feature_importance),
- value_label=value_label,
- name_labels=("feature", "feature"),
- )
-
- @staticmethod
- def __frame_to_matrix(
- frame: pd.DataFrame,
- *,
- affinity_metric: str,
- feature_importance: pd.Series,
- feature_importance_category: Optional[str] = None,
- ) -> FloatMatrix:
- return Matrix.from_frame(
- frame,
- weights=(
- feature_importance.reindex(frame.index),
- feature_importance.reindex(frame.columns),
- ),
- value_label=(
- f"{affinity_metric} ({feature_importance_category})"
- if feature_importance_category
- else affinity_metric
- ),
- name_labels=("primary feature", "associated feature"),
- )
- @property
- def __shap_interaction_values_calculator(
- self,
- ) -> ShapInteractionValuesCalculator[T_LearnerPipelineDF]:
- self._ensure_shap_interaction()
- return cast(
- ShapInteractionValuesCalculator[T_LearnerPipelineDF], self._shap_calculator
- )
-
- @property
- def __interaction_explainer(self) -> ShapInteractionGlobalExplainer:
- self._ensure_shap_interaction()
- return cast(ShapInteractionGlobalExplainer, self._shap_global_explainer)
+ self._shap_calculator = shap_calculator
+ return shap_calculator
__tracker.validate()
diff --git a/src/facet/inspection/_model_inspector.py b/src/facet/inspection/_model_inspector.py
new file mode 100644
index 00000000..0abfa5fb
--- /dev/null
+++ b/src/facet/inspection/_model_inspector.py
@@ -0,0 +1,1015 @@
+"""
+Implementation of :class:`.ModelInspector`.
+"""
+import logging
+from abc import ABCMeta, abstractmethod
+from types import MethodType
+from typing import (
+ Any,
+ Callable,
+ Generic,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ TypeVar,
+ Union,
+ cast,
+)
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+from scipy.cluster import hierarchy
+from scipy.spatial import distance
+from typing_extensions import TypeAlias
+
+from pytools.api import AllTracker
+from pytools.data import LinkageTree, Matrix
+from pytools.fit import FittableMixin, fitted_only
+from pytools.parallelization import ParallelizableMixin
+
+from ..data import Sample
+from ._inspection import ShapPlotData
+from ._shap_projection import (
+ ShapInteractionVectorProjector,
+ ShapProjector,
+ ShapVectorProjector,
+)
+from .shap import ShapCalculator
+
+log = logging.getLogger(__name__)
+
+__all__ = [
+ "ModelInspector",
+]
+
+
+#
+# Type aliases
+#
+
+FloatArray: TypeAlias = npt.NDArray[np.float_]
+FloatMatrix: TypeAlias = Matrix[np.float_]
+
+
+#
+# Type variables
+#
+
+T_Model = TypeVar("T_Model")
+T_ModelInspector = TypeVar("T_ModelInspector", bound="ModelInspector[Any]")
+T_Number = TypeVar("T_Number", bound="np.number[Any]")
+T_SeriesOrDataFrame = TypeVar("T_SeriesOrDataFrame", pd.Series, pd.DataFrame)
+
+
+#
+# Constants
+#
+
+ASSERTION__INSPECTOR_IS_FITTED = "inspector is fitted"
+ASSERTION__SHAP_INTERACTION_SUPPORTED = "SHAP interaction values are supported"
+
+
+#
+# Ensure all symbols introduced below are included in __all__
+#
+
+__tracker = AllTracker(globals())
+
+
+#
+# Class definitions
+#
+
+
+class ModelInspector(
+ ParallelizableMixin, FittableMixin[Sample], Generic[T_Model], metaclass=ABCMeta
+):
+ """
+ .. note::
+ This is an abstract base class for inspectors explaining different kinds of
+ models based on SHAP values.
+ It is not intended to be used directly.
+
+ Explain regressors and classifiers based on SHAP values.
+
+ Focus is on explaining the overall model, but the inspector also delivers
+ SHAP explanations of the individual observations.
+
+ Available inspection methods are:
+
+ - SHAP values
+ - SHAP interaction values
+ - feature importance derived from SHAP values (either as mean absolute values
+ or as the root of mean squares)
+ - pairwise feature redundancy matrix (requires availability of SHAP interaction
+ values)
+ - pairwise feature synergy matrix (requires availability of SHAP interaction
+ values)
+ - pairwise feature association matrix (upper bound for redundancy but can be
+ inflated by synergy; available if SHAP interaction values are unknown)
+ - pairwise feature interaction matrix (direct feature interaction quantified by
+ SHAP interaction values)
+ - feature redundancy linkage (to visualize clusters of redundant features in a
+ dendrogram; requires availability of SHAP interaction values)
+ - feature synergy linkage (to visualize clusters of synergistic features in a
+ dendrogram; requires availability of SHAP interaction values)
+ - feature association linkage (to visualize clusters of associated features in a
+ dendrogram)
+
+ All inspections that aggregate across observations will respect sample weights, if
+ specified in the underlying training sample.
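+
+ A typical workflow (illustrative sketch only; ``SomeInspector`` stands in for
+ any concrete subclass, and ``model`` and ``sample`` are assumed to be a
+ fitted model and a :class:`.Sample` of training observations):
+
+ .. code-block:: python
+
+ # fit the inspector on a sample, then query global explanations
+ inspector = SomeInspector(model, shap_interaction=True, n_jobs=-3)
+ inspector.fit(sample)
+ synergy = inspector.feature_synergy_matrix()
+ redundancy = inspector.feature_redundancy_matrix()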
+ """
+
+ # defined in superclass, repeated here for Sphinx
+ n_jobs: Optional[int]
+
+ # defined in superclass, repeated here for Sphinx
+ shared_memory: Optional[bool]
+
+ # defined in superclass, repeated here for Sphinx
+ pre_dispatch: Optional[Union[str, int]]
+
+ # defined in superclass, repeated here for Sphinx
+ verbose: Optional[int]
+
+ #: The model to inspect.
+ model: T_Model
+
+ #: If ``True``, calculate SHAP interaction values, else only calculate SHAP
+ #: contribution values.
+ #: SHAP interaction values are needed to determine feature synergy and redundancy.
+ shap_interaction: bool
+
+ #: Name for feature importance series or column.
+ COL_IMPORTANCE = "importance"
+
+ def __init__(
+ self,
+ model: T_Model,
+ *,
+ shap_interaction: bool = True,
+ n_jobs: Optional[int] = None,
+ shared_memory: Optional[bool] = None,
+ pre_dispatch: Optional[Union[str, int]] = None,
+ verbose: Optional[int] = None,
+ ) -> None:
+ """
+ :param model: the model to inspect
+ :param shap_interaction: if ``True``, calculate SHAP interaction values, else
+ only calculate SHAP contribution values;
+ SHAP interaction values are needed to determine feature synergy and
+ redundancy
+ (default: ``True``)
+ """
+ super().__init__(
+ n_jobs=n_jobs,
+ shared_memory=shared_memory,
+ pre_dispatch=pre_dispatch,
+ verbose=verbose,
+ )
+
+ self.model = model
+ self.shap_interaction = shap_interaction
+
+ self._shap_projector: Optional[ShapProjector] = None
+ self._sample: Optional[Sample] = None
+
+ __init__.__doc__ = cast(str, __init__.__doc__) + cast(
+ str, ParallelizableMixin.__init__.__doc__
+ )
+
+ def preprocess_features(
+ self, features: Union[pd.DataFrame, pd.Series]
+ ) -> pd.DataFrame:
+ """
+ Preprocess the features prior to calculating SHAP values.
+
+ This method is called by :meth:`.fit` before fitting the inspector.
+ By default, returns the features unchanged.
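+
+ For example (illustrative sketch), a subclass wrapping a pipeline with a
+ preprocessing step might override this method along these lines:
+
+ .. code-block:: python
+
+ def preprocess_features(self, features):
+ # hypothetical: delegate to the pipeline's preprocessing step
+ return self.model.preprocessing.transform(features)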
+
+ :param features: features to preprocess
+ :return: preprocessed features
+ """
+ return features
+
+ def fit(
+ self: T_ModelInspector, __sample: Sample, **fit_params: Any
+ ) -> T_ModelInspector:
+ """
+ Fit the inspector with the given sample, creating global explanations including
+ feature redundancy and synergy.
+
+ This will calculate SHAP values and, if enabled in the underlying SHAP
+ explainer, also SHAP interaction values.
+
+ :param __sample: the background sample to be used for the global explanation
+ of this model
+ :param fit_params: additional keyword arguments (ignored; accepted for
+ compatibility with :class:`.FittableMixin`)
+ :return: ``self``
+ """
+
+ shap_calculator: ShapCalculator[Any] = self.shap_calculator
+
+ shap_calculator.fit(self.preprocess_features(__sample.features))
+
+ shap_global_projector: ShapProjector = (
+ ShapInteractionVectorProjector()
+ if self.shap_interaction
+ else ShapVectorProjector()
+ )
+
+ shap_global_projector.fit(shap_calculator, sample_weight=__sample.weight)
+
+ self._sample = __sample
+ self._shap_projector = shap_global_projector
+
+ return self
+
+ @property
+ def is_fitted(self) -> bool:
+ """[see superclass]"""
+ return self._sample is not None
+
+ @property
+ @fitted_only
+ def sample_(self) -> Sample:
+ """
+ The background sample used to fit this inspector.
+ """
+
+ assert self._sample is not None, ASSERTION__INSPECTOR_IS_FITTED
+ return self._sample
+
+ @property
+ @abstractmethod
+ def feature_names(self) -> List[str]:
+ """
+ The feature names of the model being inspected.
+
+ These names may differ from the feature names of the sample set if the
+ sample set is preprocessed before calculating SHAP values.
+
+ :return: the feature names
+ """
+ pass
+
+ @property
+ def output_names(self) -> List[str]:
+ """
+ The names of the outputs explained by this inspector.
+
+ For regressors, these are the names of the target columns.
+
+ For binary classifiers, this is a list of length 1 with the name of a single
+ class, since the SHAP values of the second class can be trivially derived as
+ the negation of the SHAP values of the first class.
+
+ For non-binary classifiers, this is the list of all classes.
+ """
+ return self.shap_calculator.output_names
+
+ @fitted_only
+ def shap_values(self) -> Union[pd.DataFrame, List[pd.DataFrame]]:
+ """
+ Calculate the SHAP values for all observations and features.
+
+ Returns a data frame of SHAP values where each row corresponds to an
+ observation, and each column corresponds to a feature.
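+
+ For instance (illustrative sketch, assuming a fitted inspector
+ ``inspector``):
+
+ .. code-block:: python
+
+ # SHAP values per observation (rows) and feature (columns)
+ shap_df = inspector.shap_values()
+ # mean absolute SHAP value per feature
+ print(shap_df.abs().mean())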
+
+ :return: a data frame with SHAP values, or a list of data frames in the
+ case of multi-target regression or non-binary classification
+ """
+
+ return self.__split_multi_output_df(self.shap_calculator.shap_values)
+
+ @fitted_only
+ def shap_interaction_values(self) -> Union[pd.DataFrame, List[pd.DataFrame]]:
+ """
+ Calculate the SHAP interaction values for all observations and pairs of
+ features.
+
+ Returns a data frame of SHAP interaction values where each row corresponds to an
+ observation and a feature (identified by a hierarchical index with two levels),
+ and each column corresponds to a feature.
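+
+ For instance (illustrative sketch, assuming a fitted inspector
+ ``inspector``):
+
+ .. code-block:: python
+
+ # rows are indexed by (observation, feature); columns by feature
+ interactions = inspector.shap_interaction_values()
+ # summing over the interaction level recovers SHAP values per observation
+ shap_df = interactions.groupby(level=0).sum()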
+
+ :return: a data frame with SHAP interaction values, or a list of data
+ frames in the case of multi-target regression or non-binary classification
+ """
+ return self.__split_multi_output_df(
+ self.shap_calculator.shap_interaction_values
+ )
+
+ @fitted_only
+ def feature_importance(
+ self, *, method: str = "rms"
+ ) -> Union[pd.Series, pd.DataFrame]:
+ # noinspection GrazieInspection
+ """
+ Calculate the relative importance of each feature based on SHAP values.
+
+ The importance values of all features always add up to `1.0`.
+
+ The calculation applies sample weights if specified in the underlying
+ :class:`.Sample`.
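+
+ For instance (illustrative sketch, assuming a fitted inspector
+ ``inspector``):
+
+ .. code-block:: python
+
+ # rank features by RMS importance; importances add up to 1.0
+ importance = inspector.feature_importance(method="rms")
+ print(importance.sort_values(ascending=False))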
+
+ :param method: method for calculating feature importance. Supported methods
+ are ``rms`` (root of mean squares, default), ``mav`` (mean absolute
+ values)
+ :return: a series of length `n_features` for single-output models, or a
+ data frame of shape `(n_features, n_outputs)` for multi-output models
+ """
+
+ methods = {"rms", "mav"}
+ if method not in methods:
+ raise ValueError(f'arg method="{method}" must be one of {methods}')
+
+ shap_matrix: pd.DataFrame = self.shap_calculator.shap_values
+ weight: Optional[pd.Series] = self.sample_.weight
+
+ abs_importance: pd.Series
+ if method == "rms":
+ if weight is None:
+ abs_importance = shap_matrix.pow(2).mean().pow(0.5)
+ else:
+ abs_importance = shap_matrix.pow(2).mul(weight, axis=0).mean().pow(0.5)
+ else:
+ assert method == "mav", f"method is in {methods}"
+ if weight is None:
+ abs_importance = shap_matrix.abs().mean()
+ else:
+ abs_importance = shap_matrix.abs().mul(weight, axis=0).mean()
+
+ def _normalize_importance(
+ _importance: T_SeriesOrDataFrame,
+ ) -> T_SeriesOrDataFrame:
+ return _importance.divide(_importance.sum())
+
+ if len(self.output_names) == 1:
+ return _normalize_importance(abs_importance).rename(self.output_names[0])
+
+ else:
+ assert (
+ abs_importance.index.nlevels == 2
+ ), "2 index levels in place for multi-output models"
+
+ return _normalize_importance(abs_importance.unstack(level=0))
+
+ @fitted_only
+ def feature_synergy_matrix(
+ self,
+ *,
+ absolute: bool = False,
+ symmetrical: bool = False,
+ clustered: bool = True,
+ ) -> Union[FloatMatrix, List[FloatMatrix]]:
+ """
+ Calculate the feature synergy matrix.
+
+ This yields an asymmetric matrix where each row and column represents one
+ feature, and the values at the intersections are the pairwise feature synergies,
+ ranging from `0.0` (no synergy - both features contribute to predictions fully
+ autonomously of each other) to `1.0` (full synergy, both features rely on
+ combining all of their information to achieve any contribution to predictions).
+
+ The synergy of a feature with itself is defined as `1.0`.
+
+ Feature synergy calculations require SHAP interaction values; if only SHAP
+ values are available consider calculating feature associations instead
+ (see :meth:`.feature_association_matrix`).
+
+ In the case of multi-target regression and non-binary classification, returns
+ a list of data frames with one matrix per output.
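+
+ For example (illustrative sketch, assuming a fitted inspector ``inspector``
+ created with ``shap_interaction=True``):
+
+ .. code-block:: python
+
+ # pairwise synergy, clustered so that related features are adjacent
+ synergy_matrix = inspector.feature_synergy_matrix(clustered=True)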
+
+ :param absolute: if ``False``, return relative synergy as a percentage of
+ total feature importance;
+ if ``True``, return absolute synergy as a portion of feature importance
+ :param symmetrical: if ``True``, return a symmetrical matrix quantifying
+ mutual synergy; if ``False``, return an asymmetrical matrix quantifying
+ unilateral synergy of the features represented by rows with the
+ features represented by columns (default: ``False``)
+ :param clustered: if ``True``, reorder the rows and columns of the matrix
+ such that synergy between adjacent rows and columns is maximised; if
+ ``False``, keep rows and columns in the original features order
+ (default: ``True``)
+ :return: feature synergy matrix as a data frame of shape
+ `(n_features, n_features)`, or a list of data frames for multiple outputs
+ """
+
+ return self.__feature_affinity_matrix(
+ explainer_fn=self.__interaction_projector.synergy,
+ absolute=absolute,
+ symmetrical=symmetrical,
+ clustered=clustered,
+ )
+
+ @fitted_only
+ def feature_redundancy_matrix(
+ self,
+ *,
+ absolute: bool = False,
+ symmetrical: bool = False,
+ clustered: bool = True,
+ ) -> Union[FloatMatrix, List[FloatMatrix]]:
+ """
+ Calculate the feature redundancy matrix.
+
+ This yields an asymmetric matrix where each row and column represents one
+ feature, and the values at the intersections are the pairwise feature
+ redundancies, ranging from `0.0` (no redundancy - both features contribute to
+ predictions fully independently of each other) to `1.0` (full redundancy, either
+ feature can replace the other feature without loss of predictive power).
+
+ The redundancy of a feature with itself is defined as `1.0`.
+
+ Feature redundancy calculations require SHAP interaction values; if only SHAP
+ values are available consider calculating feature associations instead
+ (see :meth:`.feature_association_matrix`).
+
+ In the case of multi-target regression and non-binary classification, returns
+ a list of data frames with one matrix per output.
+
+ :param absolute: if ``False``, return relative redundancy as a percentage of
+ total feature importance;
+ if ``True``, return absolute redundancy as a portion of feature importance
+ :param symmetrical: if ``True``, return a symmetrical matrix quantifying
+ mutual redundancy; if ``False``, return an asymmetrical matrix quantifying
+ unilateral redundancy of the features represented by rows with the
+ features represented by columns (default: ``False``)
+ :param clustered: if ``True``, reorder the rows and columns of the matrix
+ such that redundancy between adjacent rows and columns is maximised; if
+ ``False``, keep rows and columns in the original features order
+ (default: ``True``)
+ :return: feature redundancy matrix as a data frame of shape
+ `(n_features, n_features)`, or a list of data frames for multiple outputs
+ """
+
+ return self.__feature_affinity_matrix(
+ explainer_fn=self.__interaction_projector.redundancy,
+ absolute=absolute,
+ symmetrical=symmetrical,
+ clustered=clustered,
+ )
+
+ @fitted_only
+ def feature_association_matrix(
+ self,
+ *,
+ absolute: bool = False,
+ symmetrical: bool = False,
+ clustered: bool = True,
+ ) -> Union[FloatMatrix, List[FloatMatrix]]:
+ """
+ Calculate the feature association matrix.
+
+ This yields an asymmetric matrix where each row and column represents one
+ feature, and the values at the intersections are the pairwise feature
+ associations, ranging from `0.0` (no association) to `1.0` (full association).
+
+ The association of a feature with itself is defined as `1.0`.
+
+ Feature association provides an upper bound for feature redundancy but might be
+ inflated by feature synergy.
+
+ While it is preferable to assess redundancy and synergy separately, association
+ can be calculated using only SHAP values, and thus can be used as a fallback
+ if no SHAP interaction values are available.
+
+ In the case of multi-target regression and non-binary classification, returns
+ a list of data frames with one matrix per output.
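+
+ For example (illustrative sketch; unlike synergy and redundancy, association
+ is also available for inspectors created with ``shap_interaction=False``):
+
+ .. code-block:: python
+
+ # mutual association as a fallback when interaction values are unavailable
+ association_matrix = inspector.feature_association_matrix(symmetrical=True)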
+
+ :param absolute: if ``False``, return relative association as a percentage of
+ total feature importance;
+ if ``True``, return absolute association as a portion of feature importance
+ :param symmetrical: if ``False``, return an asymmetrical matrix
+ quantifying unilateral association of the features represented by rows
+ with the features represented by columns;
+ if ``True``, return a symmetrical matrix quantifying mutual association
+ (default: ``False``)
+ :param clustered: if ``True``, reorder the rows and columns of the matrix
+ such that association between adjacent rows and columns is maximised; if
+ ``False``, keep rows and columns in the original features order
+ (default: ``True``)
+ :return: feature association matrix as a data frame of shape
+ `(n_features, n_features)`, or a list of data frames for multiple outputs
+ """
+
+ assert self._shap_projector is not None, ASSERTION__INSPECTOR_IS_FITTED
+ return self.__feature_affinity_matrix(
+ explainer_fn=self._shap_projector.association,
+ absolute=absolute,
+ symmetrical=symmetrical,
+ clustered=clustered,
+ )
+
+ @fitted_only
+ def feature_synergy_linkage(self) -> Union[LinkageTree, List[LinkageTree]]:
+ """
+ Calculate a linkage tree based on the :meth:`.feature_synergy_matrix`.
+
+ The linkage tree can be used to render a dendrogram indicating clusters of
+ synergistic features.
+
+ In the case of multi-target regression and non-binary classification, returns
+ a list of linkage trees per target or class.
+
+ :return: linkage tree of feature synergies; list of linkage trees
+ for multi-target regressors or non-binary classifiers
+ """
+
+ feature_affinity_matrix = self.__interaction_projector.synergy(
+ symmetrical=True, absolute=False
+ )
+ assert (
+ feature_affinity_matrix is not None
+ ), ASSERTION__SHAP_INTERACTION_SUPPORTED
+
+ return self.__linkages_from_affinity_matrices(
+ feature_affinity_matrix=feature_affinity_matrix
+ )
+
+ @fitted_only
+ def feature_redundancy_linkage(self) -> Union[LinkageTree, List[LinkageTree]]:
+ """
+ Calculate a linkage tree based on the :meth:`.feature_redundancy_matrix`.
+
+ The linkage tree can be used to render a dendrogram indicating clusters of
+ redundant features.
+
+ In the case of multi-target regression and non-binary classification, returns
+ a list of linkage trees per target or class.
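+
+ The tree can be rendered as a dendrogram, for example with the drawer from
+ the ``pytools`` visualization package (illustrative sketch, assuming a fitted
+ inspector ``inspector`` created with ``shap_interaction=True``):
+
+ .. code-block:: python
+
+ from pytools.viz.dendrogram import DendrogramDrawer
+
+ # visualize clusters of redundant features
+ DendrogramDrawer().draw(
+ data=inspector.feature_redundancy_linkage(), title="Feature redundancy"
+ )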
+
+ :return: linkage tree of feature redundancies; list of linkage trees
+ for multi-target regressors or non-binary classifiers
+ """
+
+ feature_affinity_matrix = self.__interaction_projector.redundancy(
+ symmetrical=True, absolute=False
+ )
+ assert (
+ feature_affinity_matrix is not None
+ ), ASSERTION__SHAP_INTERACTION_SUPPORTED
+
+ return self.__linkages_from_affinity_matrices(
+ feature_affinity_matrix=feature_affinity_matrix
+ )
+
+ @fitted_only
+ def feature_association_linkage(self) -> Union[LinkageTree, List[LinkageTree]]:
+ """
+ Calculate a linkage tree based on the :meth:`.feature_association_matrix`.
+
+ The linkage tree can be used to render a dendrogram indicating clusters of
+ associated features.
+
+ In the case of multi-target regression and non-binary classification, returns
+ a list of linkage trees per target or class.
+
+ :return: linkage tree of feature associations; list of linkage trees
+ for multi-target regressors or non-binary classifiers
+ """
+
+ assert self._shap_projector is not None, ASSERTION__INSPECTOR_IS_FITTED
+ feature_affinity_matrix = self._shap_projector.association(
+ absolute=False, symmetrical=True
+ )
+ assert (
+ feature_affinity_matrix is not None
+ ), ASSERTION__SHAP_INTERACTION_SUPPORTED
+
+ return self.__linkages_from_affinity_matrices(
+ feature_affinity_matrix=feature_affinity_matrix
+ )
+
+ @fitted_only
+ def feature_interaction_matrix(self) -> Union[FloatMatrix, List[FloatMatrix]]:
+ """
+ Calculate relative SHAP interaction values for all feature pairings.
+
+ SHAP interactions quantify direct interactions between pairs of features.
+ For a quantification of overall interaction (including indirect interactions
+ across more than two features), see :meth:`.feature_synergy_matrix`.
+
+ The relative values are normalised to add up to `1.0`, and each value ranges
+ between `0.0` and `1.0`.
+
+ For features :math:`f_i` and :math:`f_j`, relative feature interaction
+ :math:`I` is calculated as
+
+ .. math::
+ I_{ij} = \\frac
+ {\\sigma(\\vec{\\phi}_{ij})}
+ {\\sum_{a=1}^n \\sum_{b=1}^n \\sigma(\\vec{\\phi}_{ab})}
+
+ where :math:`\\sigma(\\vec v)` is the standard deviation of all elements of
+ vector :math:`\\vec v`.
+
+ The total average interaction of features :math:`f_i` and :math:`f_j` is
+ :math:`I_{ij} + I_{ji} = 2 I_{ij}`.
+
+ :math:`I_{ii}` is the residual, non-synergistic contribution
+ of feature :math:`f_i`.
+
+ The matrix returned by this method is a lower-triangular matrix
+
+ .. math::
+
+ \\newcommand\\nan{\\mathit{nan}}
+ I_{} = \\begin{pmatrix}
+ I_{11} & \\nan & \\nan & \\dots & \\nan \\\\
+ 2I_{21} & I_{22} & \\nan & \\dots & \\nan \\\\
+ 2I_{31} & 2I_{32} & I_{33} & \\dots & \\nan \\\\
+ \\vdots & \\vdots & \\vdots & \\ddots & \\vdots \\\\
+ 2I_{n1} & 2I_{n2} & 2I_{n3} & \\dots & I_{nn} \\\\
+ \\end{pmatrix}
+
+ with :math:`\\sum_{a=1}^n \\sum_{b=a}^n I_{ab} = 1`
+
+ In the case of multi-target regression and non-binary classification, returns
+ a list with one matrix per output.
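+
+ For instance (illustrative sketch, assuming a fitted inspector
+ ``inspector``):
+
+ .. code-block:: python
+
+ # lower-triangular matrix of relative pairwise interactions;
+ # cell (i, j) for i > j holds the total interaction 2 * I_ij
+ interaction_matrix = inspector.feature_interaction_matrix()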
+
+ :return: relative SHAP interaction values as a data frame of shape
+ `(n_features, n_features)`; or a list of such data frames
+ """
+
+ n_features = len(self.feature_names)
+ n_outputs = len(self.output_names)
+
+ # get a feature interaction array with shape
+ # (n_observations, n_outputs, n_features, n_features)
+ # where the innermost feature x feature arrays are symmetrical
+ im_matrix_per_observation_and_output: FloatArray = (
+ # TODO missing proper handling for list of data frames
+ self.shap_interaction_values() # type: ignore
+ .values.reshape((-1, n_features, n_outputs, n_features))
+ .swapaxes(1, 2)
+ )
+
+ # get the observation weights with shape
+ # (n_observations, n_outputs, n_features, n_features)
+ weight: Optional[FloatArray]
+ _weight_sr = self.sample_.weight
+ if _weight_sr is not None:
+ # if sample weights are defined, convert them to an array
+ # and align the array with the dimensions of the feature interaction array
+ weight = _weight_sr.values.reshape((-1, 1, 1, 1))
+ else:
+ weight = None
+
+ # calculate the average interactions for each output and feature/feature
+ # interaction, based on the standard deviation assuming a mean of 0.0.
+ # The resulting matrix has shape (n_outputs, n_features, n_features)
+ _interaction_squared = im_matrix_per_observation_and_output**2
+ if weight is not None:
+ _interaction_squared *= weight
+ interaction_matrix = np.sqrt(_interaction_squared.mean(axis=0))
+ assert interaction_matrix.shape == (n_outputs, n_features, n_features)
+
+ # we normalise the interaction matrix to a total of 1.0
+ interaction_matrix /= interaction_matrix.sum()
+
+ # the total interaction effect for features i and j is the total of matrix
+ # cells (i,j) and (j,i); theoretically both should be the same but to minimize
+ # numerical errors we total both in the lower matrix triangle (but excluding the
+ # matrix diagonal, hence k=1)
+ interaction_matrix += np.triu(interaction_matrix, k=1).swapaxes(1, 2)
+
+ # discard the upper matrix triangle by setting it to nan
+ interaction_matrix += np.triu(
+ np.full(shape=(n_features, n_features), fill_value=np.nan), k=1
+ )[np.newaxis, :, :]
+
+ # create a data frame from the feature matrix
+ return self.__arrays_to_matrix(
+ interaction_matrix, value_label="relative shap interaction"
+ )
+
+ @fitted_only
+ def shap_plot_data(self) -> ShapPlotData:
+ """
+ Consolidate SHAP values and corresponding feature values from this inspector
+ for use in SHAP plots offered by the
+ `shap <https://shap.readthedocs.io/en/stable/>`__ package.
+
+ The `shap` package provides functions for creating various SHAP plots.
+ Most of these functions require:
+
+ - one or more SHAP value matrices as a single `numpy` array,
+ or a list of `numpy` arrays of shape `(n_observations, n_features)`
+ - a feature matrix of shape `(n_observations, n_features)`, which can be
+ provided as a data frame to preserve feature names
+
+ This method provides this data inside a :class:`.ShapPlotData` object, plus:
+
+ - the names of all outputs (i.e., the target names in case of regression,
+ or the class names in case of classification)
+ - corresponding target values as a series, or as a data frame in the case of
+ multiple targets
+
+ This method also ensures that the rows of all arrays, frames, and series are
+ aligned, even if only a subset of the observations in the original sample was
+ used to calculate SHAP values.
+
+ Calculates mean SHAP values for each observation and feature, across all
+ splits for which SHAP values were calculated.
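+
+ For example (illustrative sketch; assumes a fitted inspector ``inspector``
+ and that :class:`.ShapPlotData` exposes ``shap_values`` and ``features``):
+
+ .. code-block:: python
+
+ import shap
+
+ # feed the consolidated data into a shap summary plot
+ plot_data = inspector.shap_plot_data()
+ shap.summary_plot(
+ shap_values=plot_data.shap_values, features=plot_data.features
+ )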
+
+ :return: consolidated SHAP and feature values for use in SHAP plots
+ """
+
+ shap_values: Union[pd.DataFrame, List[pd.DataFrame]] = self.shap_values()
+
+ output_names: List[str] = self.output_names
+ shap_values_numpy: Union[FloatArray, List[FloatArray]]
+ included_observations: pd.Index
+
+ if len(output_names) > 1:
+ shap_values_numpy = [s.values for s in shap_values]
+ included_observations = shap_values[0].index
+ else:
+ shap_values = cast(pd.DataFrame, shap_values)
+ shap_values_numpy = shap_values.values
+ included_observations = shap_values.index
+
+ sample: Sample = self.sample_.subsample(loc=included_observations)
+
+ return ShapPlotData(
+ shap_values=shap_values_numpy,
+ sample=sample,
+ )
+
+ @property
+ @abstractmethod
+ def shap_calculator(self) -> ShapCalculator[Any]:
+ """
+ The SHAP calculator used by this inspector.
+ """
+
+ def __arrays_to_matrix(
+ self, matrix: FloatArray, value_label: str
+ ) -> Union[FloatMatrix, List[FloatMatrix]]:
+ # transform a matrix of shape (n_outputs, n_features, n_features)
+ # to a data frame
+
+ feature_index = self.feature_names
+
+ n_features = len(feature_index)
+ assert matrix.shape == (len(self.output_names), n_features, n_features)
+
+ # convert array to data frame(s) with features as row and column indices
+ if len(matrix) == 1:
+ return self.__array_to_matrix(
+ matrix[0],
+ feature_importance=self.feature_importance(),
+ value_label=value_label,
+ )
+ else:
+ return [
+ self.__array_to_matrix(
+ m,
+ feature_importance=feature_importance,
+ value_label=f"{value_label} ({output_name})",
+ )
+ for m, (_, feature_importance), output_name in zip(
+ matrix, self.feature_importance().items(), self.output_names
+ )
+ ]
+
+ def __feature_affinity_matrix(
+ self,
+ *,
+ explainer_fn: Callable[..., FloatArray],
+ absolute: bool,
+ symmetrical: bool,
+ clustered: bool,
+ ) -> Union[FloatMatrix, List[FloatMatrix]]:
+ affinity_matrices = explainer_fn(symmetrical=symmetrical, absolute=absolute)
+
+ explainer: ShapProjector = cast(
+ ShapProjector, cast(MethodType, explainer_fn).__self__
+ )
+ affinity_matrices_df: List[pd.DataFrame] = explainer.to_frames(
+ affinity_matrices
+ )
+
+ if clustered:
+ affinity_symmetrical = explainer_fn(symmetrical=True, absolute=False)
+ assert (
+ affinity_symmetrical is not None
+ ), ASSERTION__SHAP_INTERACTION_SUPPORTED
+
+ affinity_matrices_df = self.__sort_affinity_matrices(
+ affinity_matrices=affinity_matrices_df,
+ symmetrical_affinity_matrices=affinity_symmetrical,
+ )
+
+ return self.__isolate_single_frame(
+ affinity_matrices_df, affinity_metric=explainer_fn.__name__
+ )
+
+ @staticmethod
+ def __sort_affinity_matrices(
+ affinity_matrices: List[pd.DataFrame],
+ symmetrical_affinity_matrices: FloatArray,
+ ) -> List[pd.DataFrame]:
+ # abbreviate a very long function name to stay within the permitted line length
+ fn_linkage = ModelInspector.__linkage_matrix_from_affinity_matrix_for_output
+
+ return [
+ (lambda feature_order: affinity_matrix.iloc[feature_order, feature_order])(
+ feature_order=hierarchy.leaves_list(
+ Z=fn_linkage(feature_affinity_matrix=symmetrical_affinity_matrix)
+ )
+ )
+ for affinity_matrix, symmetrical_affinity_matrix in zip(
+ affinity_matrices, symmetrical_affinity_matrices
+ )
+ ]
+
+ @staticmethod
+ def __split_multi_output_df(
+ multi_output_df: pd.DataFrame,
+ ) -> Union[pd.DataFrame, List[pd.DataFrame]]:
+ # Split a multi-output data frame into a list of single-output data frames.
+ # Return single-output data frames as is.
+ # Multi-output data frames are grouped by level 0 in the column index.
+ if multi_output_df.columns.nlevels == 1:
+ return multi_output_df
+ else:
+ return [
+ multi_output_df.xs(key=output_name, axis=1, level=0, drop_level=True)
+ for output_name in (
+ cast(pd.MultiIndex, multi_output_df.columns).levels[0]
+ )
+ ]
+
+ def __linkages_from_affinity_matrices(
+ self, feature_affinity_matrix: FloatArray
+ ) -> Union[LinkageTree, List[LinkageTree]]:
+ # calculate the linkage trees for all outputs in a feature distance matrix;
+ # the matrix has shape (n_outputs, n_features, n_features) with values ranging
+ # between 0 and 1 (1 = closest, 0 = most distant)
+ # return a linkage tree if there is only one output, else return a list of
+ # linkage trees
+
+ feature_importance = self.feature_importance(method="rms")
+
+ if len(feature_affinity_matrix) == 1:
+ # we have only a single output
+ # feature importance is already a series
+ return self.__linkage_tree_from_affinity_matrix_for_output(
+ feature_affinity_matrix[0], feature_importance
+ )
+
+ else:
+ feature_importance_iter: (
+ Iterable[Tuple[Any, pd.Series]]
+ ) = feature_importance.items()
+
+ return [
+ self.__linkage_tree_from_affinity_matrix_for_output(
+ feature_affinity_for_output,
+ feature_importance_for_output,
+ )
+ for feature_affinity_for_output, (
+ _,
+ feature_importance_for_output,
+ ) in zip(feature_affinity_matrix, feature_importance_iter)
+ ]
+
+ @staticmethod
+ def __linkage_tree_from_affinity_matrix_for_output(
+ feature_affinity_matrix: FloatArray, feature_importance: pd.Series
+ ) -> LinkageTree:
+ # calculate the linkage tree from the given output in a feature distance
+ # matrix;
+ # the matrix has shape (n_features, n_features) with values ranging
+ # between 0 and 1 (1 = closest, 0 = most distant)
+
+ linkage_matrix: FloatArray = (
+ ModelInspector.__linkage_matrix_from_affinity_matrix_for_output(
+ feature_affinity_matrix
+ )
+ )
+
+ # Feature labels and weights will be used as the leaves of the linkage tree.
+ # Select only the features that appear in the distance matrix, and in the
+ # correct order.
+
+ # build and return the linkage tree
+ return LinkageTree(
+ scipy_linkage_matrix=linkage_matrix,
+ leaf_names=feature_importance.index,
+ leaf_weights=feature_importance.values,
+ max_distance=1.0,
+ distance_label="feature distance",
+ leaf_label="feature",
+ weight_label="feature importance",
+ )
+
+ @staticmethod
+ def __linkage_matrix_from_affinity_matrix_for_output(
+ feature_affinity_matrix: FloatArray,
+ ) -> FloatArray:
+ # calculate the linkage matrix from the given output in a feature distance
+ # matrix;
+ # the matrix has shape (n_features, n_features) with values ranging
+ # between 0 and 1 (1 = closest, 0 = most distant)
+
+ # compress the distance matrix (required by SciPy)
+ distance_matrix = 1.0 - abs(feature_affinity_matrix)
+ np.fill_diagonal(distance_matrix, 0.0)
+ compressed_distance_matrix: FloatArray = distance.squareform(distance_matrix)
+
+ # calculate the linkage matrix
+ leaf_ordering: FloatArray = hierarchy.optimal_leaf_ordering(
+ Z=hierarchy.linkage(y=compressed_distance_matrix, method="single"),
+ y=compressed_distance_matrix,
+ )
+
+ # reverse the leaf ordering, so that larger values tend to end up on top
+ leaf_ordering[:, [1, 0]] = leaf_ordering[:, [0, 1]]
+
+ return leaf_ordering
+
+ def _ensure_shap_interaction(self) -> None:
+ if not self.shap_interaction:
+ raise RuntimeError(
+ "SHAP interaction values have not been calculated. "
+ "Create an inspector with parameter 'shap_interaction=True' to "
+ "enable calculations involving SHAP interaction values."
+ )
+
+ def __isolate_single_frame(
+ self,
+ frames: List[pd.DataFrame],
+ affinity_metric: str,
+ ) -> Union[FloatMatrix, List[FloatMatrix]]:
+ feature_importance = self.feature_importance()
+
+ if len(frames) == 1:
+ assert isinstance(feature_importance, pd.Series)
+ return self.__frame_to_matrix(
+ frames[0],
+ affinity_metric=affinity_metric,
+ feature_importance=feature_importance,
+ )
+ else:
+ return [
+ self.__frame_to_matrix(
+ frame,
+ affinity_metric=affinity_metric,
+ feature_importance=frame_importance,
+ feature_importance_category=str(frame_name),
+ )
+ for frame, (frame_name, frame_importance) in zip(
+ frames, feature_importance.items()
+ )
+ ]
+
+ @staticmethod
+ def __array_to_matrix(
+ a: npt.NDArray[T_Number],
+ *,
+ feature_importance: pd.Series,
+ value_label: str,
+ ) -> Matrix[T_Number]:
+ return Matrix(
+ a,
+ names=(feature_importance.index, feature_importance.index),
+ weights=(feature_importance, feature_importance),
+ value_label=value_label,
+ name_labels=("feature", "feature"),
+ )
+
+ @staticmethod
+ def __frame_to_matrix(
+ frame: pd.DataFrame,
+ *,
+ affinity_metric: str,
+ feature_importance: pd.Series,
+ feature_importance_category: Optional[str] = None,
+ ) -> FloatMatrix:
+ return Matrix.from_frame(
+ frame,
+ weights=(
+ feature_importance.reindex(frame.index),
+ feature_importance.reindex(frame.columns),
+ ),
+ value_label=(
+ f"{affinity_metric} ({feature_importance_category})"
+ if feature_importance_category
+ else affinity_metric
+ ),
+ name_labels=("primary feature", "associated feature"),
+ )
+
+ @property
+ def __interaction_projector(self) -> ShapInteractionVectorProjector:
+ self._ensure_shap_interaction()
+ return cast(ShapInteractionVectorProjector, self._shap_projector)
+
+
+__tracker.validate()
diff --git a/src/facet/inspection/_shap.py b/src/facet/inspection/_shap.py
deleted file mode 100644
index aa29c2cb..00000000
--- a/src/facet/inspection/_shap.py
+++ /dev/null
@@ -1,758 +0,0 @@
-"""
-Helper classes for SHAP calculations.
-"""
-
-import logging
-from abc import ABCMeta, abstractmethod
-from typing import Any, Generic, List, Optional, Sequence, TypeVar, Union, cast
-
-import numpy as np
-import numpy.typing as npt
-import pandas as pd
-
-from pytools.api import AllTracker, inheritdoc
-from pytools.fit import FittableMixin, fitted_only
-from pytools.parallelization import ParallelizableMixin
-from sklearndf.pipeline import (
- ClassifierPipelineDF,
- LearnerPipelineDF,
- RegressorPipelineDF,
-)
-
-from ..data import Sample
-from ._explainer import BaseExplainer, ExplainerFactory, ParallelExplainer
-
-log = logging.getLogger(__name__)
-
-__all__ = [
- "ShapCalculator",
- "ShapValuesCalculator",
- "ShapInteractionValuesCalculator",
- "RegressorShapCalculator",
- "RegressorShapValuesCalculator",
- "RegressorShapInteractionValuesCalculator",
- "ClassifierShapCalculator",
- "ClassifierShapValuesCalculator",
- "ClassifierShapInteractionValuesCalculator",
-]
-
-#
-# Type variables
-#
-
-T_ShapCalculator = TypeVar("T_ShapCalculator", bound="ShapCalculator[Any]")
-T_LearnerPipelineDF = TypeVar("T_LearnerPipelineDF", bound=LearnerPipelineDF[Any])
-
-
-#
-# Constants
-#
-
-ASSERTION__CALCULATOR_IS_FITTED = "calculator is fitted"
-
-
-#
-# Ensure all symbols introduced below are included in __all__
-#
-
-__tracker = AllTracker(globals())
-
-
-#
-# Class definitions
-#
-
-
-@inheritdoc(match="[see superclass]")
-class ShapCalculator(
- FittableMixin[Sample],
- ParallelizableMixin,
- Generic[T_LearnerPipelineDF],
- metaclass=ABCMeta,
-):
- """
- Base class for all SHAP calculators.
-
- A SHAP calculator uses the ``shap`` package to calculate SHAP tensors for all
- observations in a given sample, then consolidates and aggregates results
- in a data frame.
- """
-
- #: constant for "mean" aggregation method, to be passed as arg ``aggregation``
- #: to :class:`.ShapCalculator` methods that implement it
- AGG_MEAN = "mean"
-
- #: constant for "std" aggregation method, to be passed as arg ``aggregation``
- #: to :class:`.ShapCalculator` methods that implement it
- AGG_STD = "std"
-
- #: name of index level indicating the split ID
- IDX_SPLIT = "split"
-
- def __init__(
- self,
- pipeline: T_LearnerPipelineDF,
- explainer_factory: ExplainerFactory,
- *,
- n_jobs: Optional[int] = None,
- shared_memory: Optional[bool] = None,
- pre_dispatch: Optional[Union[str, int]] = None,
- verbose: Optional[int] = None,
- ) -> None:
- super().__init__(
- n_jobs=n_jobs,
- shared_memory=shared_memory,
- pre_dispatch=pre_dispatch,
- verbose=verbose,
- )
- self.pipeline = pipeline
- self._explainer_factory = explainer_factory
- self.shap_: Optional[pd.DataFrame] = None
- self.feature_index_: Optional[pd.Index] = None
- self.output_names_: Optional[Sequence[str]] = None
- self.sample_: Optional[Sample] = None
-
- @property
- def is_fitted(self) -> bool:
- """[see superclass]"""
- return self.shap_ is not None
-
- def fit( # type: ignore[override]
- self: T_ShapCalculator,
- sample: Sample,
- **fit_params: Any,
- ) -> T_ShapCalculator:
- """
- Calculate the SHAP values.
-
- :param sample: the observations for which to calculate SHAP values
- :param fit_params: additional fit parameters (unused)
- :return: self
- """
-
- # reset fit in case we get an exception along the way
- self.shap_ = None
-
- self.feature_index_ = self.pipeline.final_estimator.feature_names_in_
- self.output_names_ = self._get_output_names(sample)
- self.sample_ = sample
-
- # calculate shap values and re-order the observation index to match the
- # sequence in the original training sample
- shap_df: pd.DataFrame = self._get_shap(sample)
-
- n_levels = shap_df.index.nlevels
- assert 1 <= n_levels <= 2
- assert shap_df.index.names[0] == sample.index.name
-
- self.shap_ = shap_df.reindex(
- index=sample.index.intersection(
- (
- shap_df.index
- if n_levels == 1
- else cast(pd.MultiIndex, shap_df.index).levels[0]
- ),
- sort=False,
- ),
- level=0,
- copy=False,
- )
-
- return self
-
- @abstractmethod
- def get_shap_values(self) -> pd.DataFrame:
- """
- The resulting shap values, per observation and feature, as a data frame.
-
- :return: SHAP contribution values with shape
- (n_observations, n_outputs * n_features)
- """
-
- @abstractmethod
- def get_shap_interaction_values(self) -> pd.DataFrame:
- """
- Get the resulting shap interaction values as a data frame.
-
- :return: SHAP contribution values with shape
- (n_observations * n_features, n_outputs * n_features)
- :raise TypeError: this SHAP calculator does not support interaction values
- """
-
- @staticmethod
- @abstractmethod
- def get_multi_output_type() -> str:
- """
- :return: a category name for the dimensions represented by multiple outputs
- """
-
- @abstractmethod
- def get_multi_output_names(self, sample: Sample) -> List[str]:
- """
- :return: a name for each of the outputs
- """
- pass
-
- def _get_shap(self, sample: Sample) -> pd.DataFrame:
-
- pipeline = self.pipeline
-
- # prepare the background dataset
-
- background_dataset: Optional[pd.DataFrame]
-
- if self._explainer_factory.uses_background_dataset:
- background_dataset = sample.features
- if pipeline.preprocessing:
- background_dataset = pipeline.preprocessing.transform(
- X=background_dataset
- )
-
- background_dataset_not_na = background_dataset.dropna()
-
- if len(background_dataset_not_na) != len(background_dataset):
- n_original = len(background_dataset)
- n_dropped = n_original - len(background_dataset_not_na)
- log.warning(
- f"{n_dropped} out of {n_original} observations in the sample "
- "contain NaN values after pre-processing and will not be included "
- "in the background dataset"
- )
-
- background_dataset = background_dataset_not_na
-
- else:
- background_dataset = None
-
- explainer = self._explainer_factory.make_explainer(
- model=pipeline.final_estimator,
- # we re-index the columns of the background dataset to match
- # the column sequence of the model (in case feature order
- # was shuffled, or train split pre-processing removed columns)
- data=(
- None
- if background_dataset is None
- else background_dataset.reindex(
- columns=pipeline.final_estimator.feature_names_in_,
- copy=False,
- )
- ),
- )
-
- if self.n_jobs != 1:
- explainer = ParallelExplainer(
- explainer,
- n_jobs=self.n_jobs,
- shared_memory=self.shared_memory,
- pre_dispatch=self.pre_dispatch,
- verbose=self.verbose,
- )
-
- # we explain the full sample using the model fitted on the full sample
- # so the result is a list with a single data frame of shap values
- return self._calculate_shap(sample=sample, explainer=explainer)
-
- @abstractmethod
- def _calculate_shap(
- self, *, sample: Sample, explainer: BaseExplainer
- ) -> pd.DataFrame:
- pass
-
- def _convert_shap_tensors_to_list(
- self,
- *,
- shap_tensors: Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]],
- n_outputs: int,
- ) -> List[npt.NDArray[np.float_]]:
- def _validate_shap_tensor(_t: npt.NDArray[np.float_]) -> None:
- if np.isnan(np.sum(_t)):
- raise AssertionError(
- "Output of SHAP explainer includes NaN values. "
- "This should not happen; consider initialising the "
- "LearnerInspector with an ExplainerFactory that has a different "
- "configuration, or that makes SHAP explainers of a different type."
- )
-
- if isinstance(shap_tensors, List):
- for shap_tensor in shap_tensors:
- _validate_shap_tensor(shap_tensor)
- else:
- _validate_shap_tensor(shap_tensors)
- shap_tensors = [shap_tensors]
-
- if n_outputs != len(shap_tensors):
- raise AssertionError(
- f"count of SHAP tensors (n={len(shap_tensors)}) "
- f"should match number of outputs (n={n_outputs})"
- )
-
- return shap_tensors
-
- def _preprocess_features(self, sample: Sample) -> pd.DataFrame:
-
- # get the model
- pipeline = self.pipeline
-
- # get the features of all out-of-bag observations
- x = sample.features
-
- # pre-process the features
- if pipeline.preprocessing is not None:
- x = pipeline.preprocessing.transform(x)
-
- # re-index the features to fit the sequence that was used to fit the learner
- return x.reindex(columns=pipeline.final_estimator.feature_names_in_, copy=False)
-
- @staticmethod
- @abstractmethod
- def _convert_raw_shap_to_df(
- raw_shap_tensors: List[npt.NDArray[np.float_]],
- observations: pd.Index,
- features_in_split: pd.Index,
- ) -> List[pd.DataFrame]:
- """
- Convert the SHAP tensors for a single split to a data frame.
-
- :param raw_shap_tensors: the raw values returned by the SHAP explainer
- :param observations: the ids used for indexing the explained observations
- :param features_in_split: the features in the current split,
- explained by the SHAP explainer
- :return: SHAP values of a single split as data frame
- """
- pass
-
- @abstractmethod
- def _get_output_names(self, sample: Sample) -> Sequence[str]:
- pass
-
-
-@inheritdoc(match="[see superclass]")
-class ShapValuesCalculator(
- ShapCalculator[T_LearnerPipelineDF], Generic[T_LearnerPipelineDF], metaclass=ABCMeta
-):
- """
- Base class for calculating SHAP contribution values.
- """
-
- @fitted_only
- def get_shap_values(self) -> pd.DataFrame:
- """[see superclass]"""
- return self.shap_
-
- def get_shap_interaction_values(self) -> pd.DataFrame:
- """
- Not implemented.
-
- :return: (never returns anything)
- :raise TypeError: SHAP interaction values are not supported - always raised
- """
- raise TypeError(
- f"{type(self).__name__}"
- f".{ShapValuesCalculator.get_shap_interaction_values.__name__}() "
- "is not defined"
- )
-
- def _calculate_shap(
- self, *, sample: Sample, explainer: BaseExplainer
- ) -> pd.DataFrame:
- x = self._preprocess_features(sample=sample)
-
- if x.isna().values.any():
- log.warning(
- "preprocessed sample passed to SHAP explainer contains NaN values; "
- "try to change preprocessing to impute all NaN values"
- )
-
- multi_output_type = self.get_multi_output_type()
- multi_output_names = self.get_multi_output_names(sample=sample)
- assert self.feature_index_ is not None, ASSERTION__CALCULATOR_IS_FITTED
- features_out = self.feature_index_
-
- # calculate the shap values, and ensure the result is a list of arrays
- shap_values: List[npt.NDArray[np.float_]] = self._convert_shap_tensors_to_list(
- shap_tensors=explainer.shap_values(x), n_outputs=len(multi_output_names)
- )
-
- # convert to a data frame per output (different logic depending on whether
- # we have a regressor or a classifier, implemented by method
- # shap_matrix_for_split_to_df_fn)
- shap_values_df_per_output: List[pd.DataFrame] = [
- shap.reindex(columns=features_out, copy=False, fill_value=0.0)
- for shap in self._convert_raw_shap_to_df(shap_values, x.index, x.columns)
- ]
-
- # if we have a single output, return the data frame for that output;
- # else, add a top level to the column index indicating each output
-
- if len(shap_values_df_per_output) == 1:
- return shap_values_df_per_output[0]
- else:
- return pd.concat(
- shap_values_df_per_output,
- axis=1,
- keys=multi_output_names,
- names=[multi_output_type, features_out.name],
- )
-
-
-@inheritdoc(match="[see superclass]")
-class ShapInteractionValuesCalculator(
- ShapCalculator[T_LearnerPipelineDF], Generic[T_LearnerPipelineDF], metaclass=ABCMeta
-):
- """
- Base class for calculating SHAP interaction values.
- """
-
- @fitted_only
- def get_shap_values(self) -> pd.DataFrame:
- """[see superclass]"""
-
- assert self.shap_ is not None, ASSERTION__CALCULATOR_IS_FITTED
- return self.shap_.groupby(level=0).sum()
-
- @fitted_only
- def get_shap_interaction_values(self) -> pd.DataFrame:
- """[see superclass]"""
-
- assert self.shap_ is not None, ASSERTION__CALCULATOR_IS_FITTED
- return self.shap_
-
- @fitted_only
- def get_diagonals(self) -> pd.DataFrame:
- """
-        The diagonals of all SHAP interaction matrices, of shape
-        (n_observations, n_outputs * n_features).
-
-        :return: the diagonals of the SHAP interaction matrices, with shape
-            (n_observations, n_outputs * n_features), i.e., for each observation
-            and output, the main-effect value of each feature.
- """
-
- assert (
- self.shap_ is not None
- and self.sample_ is not None
- and self.feature_index_ is not None
- ), ASSERTION__CALCULATOR_IS_FITTED
-
- n_observations = len(self.sample_)
- n_features = len(self.feature_index_)
- interaction_matrix = self.shap_
-
- return pd.DataFrame(
- np.diagonal(
- interaction_matrix.values.reshape(
- (n_observations, n_features, -1, n_features)
- # observations x features x outputs x features
- ),
- axis1=1,
- axis2=3,
- ).reshape((n_observations, -1)),
- # observations x (outputs * features)
- index=cast(pd.MultiIndex, interaction_matrix.index).levels[0],
- columns=interaction_matrix.columns,
- )
-
- def _calculate_shap(
- self, *, sample: Sample, explainer: BaseExplainer
- ) -> pd.DataFrame:
- x = self._preprocess_features(sample=sample)
-
- multi_output_type = self.get_multi_output_type()
- multi_output_names = self.get_multi_output_names(sample)
- assert self.feature_index_ is not None, ASSERTION__CALCULATOR_IS_FITTED
- features_out = self.feature_index_
-
- # calculate the shap interaction values; ensure the result is a list of arrays
- shap_interaction_tensors: List[
- npt.NDArray[np.float_]
- ] = self._convert_shap_tensors_to_list(
- shap_tensors=explainer.shap_interaction_values(x),
- n_outputs=len(multi_output_names),
- )
-
- interaction_matrix_per_output: List[pd.DataFrame] = [
- im.reindex(
- index=pd.MultiIndex.from_product(
- iterables=(x.index, features_out),
- names=(x.index.name, features_out.name),
- ),
- columns=features_out,
- copy=False,
- fill_value=0.0,
- )
- for im in self._convert_raw_shap_to_df(
- shap_interaction_tensors, x.index, x.columns
- )
- ]
-
- # if we have a single output, use the data frame for that output;
- # else, concatenate the values data frame for all outputs horizontally
- # and add a top level to the column index indicating each output
- if len(interaction_matrix_per_output) == 1:
- return interaction_matrix_per_output[0]
- else:
- return pd.concat(
- interaction_matrix_per_output,
- axis=1,
- keys=multi_output_names,
- names=[multi_output_type, features_out.name],
- )
-
-
-@inheritdoc(match="[see superclass]")
-class RegressorShapCalculator(
- ShapCalculator[RegressorPipelineDF[Any]], metaclass=ABCMeta
-):
- """
- Calculates SHAP (interaction) values for regression models.
- """
-
- def _get_output_names(self, sample: Sample) -> List[str]:
- # noinspection PyProtectedMember
- return sample._target_names
-
- @staticmethod
- def get_multi_output_type() -> str:
- """[see superclass]"""
- return Sample.IDX_TARGET
-
- def get_multi_output_names(self, sample: Sample) -> List[str]:
- """[see superclass]"""
- # noinspection PyProtectedMember
- return sample._target_names
-
-
-class RegressorShapValuesCalculator(
- RegressorShapCalculator, ShapValuesCalculator[RegressorPipelineDF[Any]]
-):
- """
- Calculates SHAP values for regression models.
- """
-
- @staticmethod
- def _convert_raw_shap_to_df(
- raw_shap_tensors: List[npt.NDArray[np.float_]],
- observations: pd.Index,
- features_in_split: pd.Index,
- ) -> List[pd.DataFrame]:
- return [
- pd.DataFrame(
- data=raw_shap_matrix, index=observations, columns=features_in_split
- )
- for raw_shap_matrix in raw_shap_tensors
- ]
-
-
-class RegressorShapInteractionValuesCalculator(
- RegressorShapCalculator, ShapInteractionValuesCalculator[RegressorPipelineDF[Any]]
-):
- """
- Calculates SHAP interaction matrices for regression models.
- """
-
- @staticmethod
- def _convert_raw_shap_to_df(
- raw_shap_tensors: List[npt.NDArray[np.float_]],
- observations: pd.Index,
- features_in_split: pd.Index,
- ) -> List[pd.DataFrame]:
- row_index = pd.MultiIndex.from_product(
- iterables=(observations, features_in_split),
- names=(observations.name, features_in_split.name),
- )
-
- return [
- pd.DataFrame(
- data=raw_interaction_tensor.reshape(
- (-1, raw_interaction_tensor.shape[2])
- ),
- index=row_index,
- columns=features_in_split,
- )
- for raw_interaction_tensor in raw_shap_tensors
- ]
-
-
-@inheritdoc(match="[see superclass]")
-class ClassifierShapCalculator(
- ShapCalculator[ClassifierPipelineDF[Any]], metaclass=ABCMeta
-):
- """
- Calculates SHAP (interaction) values for classification models.
- """
-
- COL_CLASS = "class"
-
- def _convert_shap_tensors_to_list(
- self,
- *,
- shap_tensors: Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]],
- n_outputs: int,
- ) -> List[npt.NDArray[np.float_]]:
-
- if n_outputs == 2 and isinstance(shap_tensors, np.ndarray):
- # if we have a single output *and* binary classification, the explainer
- # will have returned a single tensor for the positive class;
- # the SHAP values for the negative class will have the opposite sign
- (shap_tensors,) = super()._convert_shap_tensors_to_list(
- shap_tensors=shap_tensors, n_outputs=1
- )
- return [-shap_tensors, shap_tensors]
- else:
- return super()._convert_shap_tensors_to_list(
- shap_tensors=shap_tensors, n_outputs=n_outputs
- )
-
- def _get_output_names(
- self,
- sample: Sample,
- ) -> Sequence[str]:
- assert not isinstance(
- sample.target_name, list
- ), "classification model is single-output"
- classifier_df = self.pipeline.final_estimator
- assert classifier_df.is_fitted, "classifier must be fitted"
-
- try:
- output_names: List[str] = classifier_df.classes_.tolist()
-
- except Exception as cause:
- raise AssertionError("classifier must define classes_ attribute") from cause
-
- n_outputs = len(output_names)
-
- if n_outputs == 1:
- raise RuntimeError(
- "cannot explain a (sub)sample with one single category "
- f"{repr(output_names[0])}: "
- "consider using a stratified cross-validation strategy"
- )
-
- elif n_outputs == 2:
- # for binary classifiers, we will generate only output for the first class
- # as the probabilities for the second class are trivially linked to class 1
- return output_names[:1]
-
- else:
- return output_names
-
- @staticmethod
- def get_multi_output_type() -> str:
- """[see superclass]"""
- return ClassifierShapCalculator.COL_CLASS
-
- def get_multi_output_names(self, sample: Sample) -> List[str]:
- """[see superclass]"""
- assert isinstance(
- sample.target, pd.Series
- ), "only single-output classifiers are currently supported"
- root_classifier = self.pipeline.final_estimator.native_estimator
- # noinspection PyUnresolvedReferences
- return list(map(str, root_classifier.classes_))
-
-
-class ClassifierShapValuesCalculator(
- ClassifierShapCalculator, ShapValuesCalculator[ClassifierPipelineDF[Any]]
-):
- """
- Calculates SHAP matrices for classification models.
- """
-
- # noinspection DuplicatedCode
- @staticmethod
- def _convert_raw_shap_to_df(
- raw_shap_tensors: List[npt.NDArray[np.float_]],
- observations: pd.Index,
- features_in_split: pd.Index,
- ) -> List[pd.DataFrame]:
- # return a list of data frame [obs x features], one for each of the outputs
-
- n_arrays = len(raw_shap_tensors)
-
- if n_arrays == 2:
- # in the binary classification case, we will proceed with SHAP values
- # for class 0 only, since values for class 1 will just be the same
- # values times (*-1) (the opposite delta probability)
-
- # to ensure the values are returned as expected above,
- # and no information of class 1 is discarded, assert the
- # following:
- if not np.allclose(raw_shap_tensors[0], -raw_shap_tensors[1]):
- _raw_shap_tensor_totals = raw_shap_tensors[0] + raw_shap_tensors[1]
- log.warning(
- "shap values of binary classifiers should add up to 0.0 "
- "for each observation and feature, but total shap values range "
- f"from {_raw_shap_tensor_totals.min():g} "
- f"to {_raw_shap_tensor_totals.max():g}"
- )
-
- # all good: proceed with SHAP values for class 1 (positive class):
- raw_shap_tensors = raw_shap_tensors[1:]
-
- return [
- pd.DataFrame(
- data=raw_shap_matrix, index=observations, columns=features_in_split
- )
- for raw_shap_matrix in raw_shap_tensors
- ]
-
-
-class ClassifierShapInteractionValuesCalculator(
- ClassifierShapCalculator, ShapInteractionValuesCalculator[ClassifierPipelineDF[Any]]
-):
- """
- Calculates SHAP interaction matrices for classification models.
- """
-
- # noinspection DuplicatedCode
- @staticmethod
- def _convert_raw_shap_to_df(
- raw_shap_tensors: List[npt.NDArray[np.float_]],
- observations: pd.Index,
- features_in_split: pd.Index,
- ) -> List[pd.DataFrame]:
- # return a list of data frame [(obs x features) x features],
- # one for each of the outputs
-
- n_arrays = len(raw_shap_tensors)
-
- if n_arrays == 2:
- # in the binary classification case, we will proceed with SHAP values
- # for class 0 only, since values for class 1 will just be the same
- # values times (*-1) (the opposite delta probability)
-
- # to ensure the values are returned as expected above,
- # and no information of class 1 is discarded, assert the
- # following:
- if not np.allclose(raw_shap_tensors[0], -raw_shap_tensors[1]):
- _raw_shap_tensor_totals = raw_shap_tensors[0] + raw_shap_tensors[1]
- log.warning(
- "shap interaction values of binary classifiers must add up to 0.0 "
- "for each observation and feature pair, but total shap values "
- f"range from {_raw_shap_tensor_totals.min():g} "
- f"to {_raw_shap_tensor_totals.max():g}"
- )
-
- # all good: proceed with SHAP values for class 1 (positive class):
- raw_shap_tensors = raw_shap_tensors[1:]
-
- # each row is indexed by an observation and a feature
- row_index = pd.MultiIndex.from_product(
- iterables=(observations, features_in_split),
- names=(observations.name, features_in_split.name),
- )
-
- return [
- pd.DataFrame(
- data=raw_shap_interaction_matrix.reshape(
- (-1, raw_shap_interaction_matrix.shape[2])
- ),
- index=row_index,
- columns=features_in_split,
- )
- for raw_shap_interaction_matrix in raw_shap_tensors
- ]
-
-
-__tracker.validate()
diff --git a/src/facet/inspection/_shap_global_explanation.py b/src/facet/inspection/_shap_context.py
similarity index 72%
rename from src/facet/inspection/_shap_global_explanation.py
rename to src/facet/inspection/_shap_context.py
index fd14ad20..6cbf36bd 100644
--- a/src/facet/inspection/_shap_global_explanation.py
+++ b/src/facet/inspection/_shap_context.py
@@ -6,25 +6,21 @@
from __future__ import annotations
import logging
-from abc import ABCMeta, abstractmethod
-from typing import Any, List, Optional, TypeVar, Union, cast
+from typing import Any, Optional, Union, cast
import numpy as np
import numpy.typing as npt
import pandas as pd
-from pytools.api import AllTracker, inheritdoc
-from pytools.fit import FittableMixin
+from pytools.api import AllTracker
-from ._shap import ShapCalculator
+from .shap import ShapCalculator
log = logging.getLogger(__name__)
__all__ = [
"AffinityMatrix",
"ShapContext",
- "ShapGlobalExplainer",
- "ShapInteractionGlobalExplainer",
"ShapInteractionValueContext",
"ShapValueContext",
"cov",
@@ -43,13 +39,6 @@
#: (sequential) summation will be negligible in practice
_PAIRWISE_PARTIAL_SUMMATION = False
-#
-# Type variables
-#
-
-T_ShapGlobalExplainer = TypeVar("T_ShapGlobalExplainer", bound="ShapGlobalExplainer")
-T_ShapCalculator = TypeVar("T_ShapCalculator", bound=ShapCalculator[Any])
-
#
# Constants
@@ -148,141 +137,6 @@ def get_values(self, symmetrical: bool, absolute: bool) -> npt.NDArray[np.float_
)
-@inheritdoc(match="""[see superclass]""")
-class ShapGlobalExplainer(FittableMixin[ShapCalculator[Any]], metaclass=ABCMeta):
- """
- Derives feature association as a global metric of SHAP values for multiple
- observations.
- """
-
- def __init__(self) -> None:
- super().__init__()
- self.feature_index_: Optional[pd.Index] = None
-
- @property
- def is_fitted(self) -> bool:
- """[see superclass]"""
- return self.feature_index_ is not None
-
- def fit( # type: ignore[override]
- self: T_ShapGlobalExplainer,
- shap_calculator: ShapCalculator[Any],
- **fit_params: Any,
- ) -> T_ShapGlobalExplainer:
- """
- Calculate the SHAP decomposition for the shap values produced by the
- given SHAP calculator.
-
- :param shap_calculator: the fitted calculator from which to get the shap values
- """
-
- try:
- if len(fit_params) > 0:
- raise ValueError(
- f'unsupported fit parameters: {", ".join(fit_params.values())}'
- )
-
- self._fit(shap_calculator=shap_calculator)
-
- self.feature_index_ = shap_calculator.feature_index_
-
- except Exception:
- # reset fit in case we get an exception along the way
- self._reset_fit()
- raise
-
- return self
-
- @abstractmethod
- def association(self, absolute: bool, symmetrical: bool) -> npt.NDArray[np.float_]:
- """
- The association matrix for all feature pairs.
-
- Raises an error if this global explainer has not been fitted.
-
- :param absolute: if ``False``, return relative association as a percentage of
- total feature importance;
- if ``True``, return absolute association as a portion of feature importance
- :param symmetrical: if ``False``, return an asymmetrical matrix
- quantifying unilateral association of the features represented by rows
- with the features represented by columns;
- if ``True``, return a symmetrical matrix quantifying mutual association
- :returns: the matrix as an array of shape (n_outputs, n_features, n_features)
- """
-
- def to_frames(self, matrix: npt.NDArray[np.float_]) -> List[pd.DataFrame]:
- """
- Transforms one or more affinity matrices into a list of data frames.
-
- :param matrix: an array of shape `(n_outputs, n_features, n_features)`,
- representing one or more affinity matrices
- :return: a list of `n_outputs` data frames of shape `(n_features, n_features)`
- """
- assert self.feature_index_ is not None, "explainer is fitted"
- index = self.feature_index_
-
- n_features = len(index)
- assert matrix.ndim == 3
- assert matrix.shape[1:] == (n_features, n_features)
-
- return [
- pd.DataFrame(
- m,
- index=index,
- columns=index,
- )
- for m in matrix
- ]
-
- @abstractmethod
- def _fit(self, shap_calculator: ShapCalculator[Any]) -> None:
- pass
-
- def _reset_fit(self) -> None:
- self.feature_index_ = None
-
-
-class ShapInteractionGlobalExplainer(ShapGlobalExplainer, metaclass=ABCMeta):
- """
- Derives feature association, synergy, and redundancy as a global metric of SHAP
- interaction values for multiple observations.
- """
-
- @abstractmethod
- def synergy(self, symmetrical: bool, absolute: bool) -> npt.NDArray[np.float_]:
- """
- The synergy matrix for all feature pairs.
-
- Raises an error if this global explainer has not been fitted.
-
- :param absolute: if ``False``, return relative synergy as a percentage of
- total feature importance;
- if ``True``, return absolute synergy as a portion of feature importance
- :param symmetrical: if ``False``, return an asymmetrical matrix
- quantifying unilateral synergy of the features represented by rows
- with the features represented by columns;
- if ``True``, return a symmetrical matrix quantifying mutual synergy
- :returns: the matrix as an array of shape (n_outputs, n_features, n_features)
- """
-
- @abstractmethod
- def redundancy(self, symmetrical: bool, absolute: bool) -> npt.NDArray[np.float_]:
- """
- The redundancy matrix for all feature pairs.
-
- Raises an error if this global explainer has not been fitted.
-
- :param absolute: if ``False``, return relative redundancy as a percentage of
- total feature importance;
- if ``True``, return absolute redundancy as a portion of feature importance
- :param symmetrical: if ``False``, return an asymmetrical matrix
- quantifying unilateral redundancy of the features represented by rows
- with the features represented by columns;
- if ``True``, return a symmetrical matrix quantifying mutual redundancy
- :returns: the matrix as an array of shape (n_outputs, n_features, n_features)
- """
-
-
#
# Utility functions
#
@@ -462,7 +316,7 @@ def cov_broadcast(
)
-class ShapContext(metaclass=ABCMeta):
+class ShapContext:
"""
Contextual data for global SHAP calculations.
"""
@@ -518,15 +372,16 @@ class ShapValueContext(ShapContext):
Contextual data for global SHAP calculations based on SHAP values.
"""
- def __init__(self, shap_calculator: ShapCalculator[Any]) -> None:
- shap_values: pd.DataFrame = shap_calculator.get_shap_values()
+ def __init__(
+ self, shap_calculator: ShapCalculator[Any], sample_weight: Optional[pd.Series]
+ ) -> None:
+ shap_values: pd.DataFrame = shap_calculator.shap_values
def _p_i() -> npt.NDArray[np.float_]:
assert (
- shap_calculator.output_names_ is not None
- and shap_calculator.feature_index_ is not None
+ shap_calculator.feature_index_ is not None
), ASSERTION__CALCULATOR_IS_FITTED
- n_outputs: int = len(shap_calculator.output_names_)
+ n_outputs: int = len(shap_calculator.output_names)
n_features: int = len(shap_calculator.feature_index_)
n_observations: int = len(shap_values)
@@ -545,14 +400,11 @@ def _weight() -> Optional[npt.NDArray[np.float_]]:
# shape: (n_observations)
# return a 1d array of weights that aligns with the observations axis of the
# SHAP values tensor (axis 1)
- assert (
- shap_calculator.sample_ is not None and ASSERTION__CALCULATOR_IS_FITTED
- )
- _weight_sr = shap_calculator.sample_.weight
- if _weight_sr is not None:
+
+ if sample_weight is not None:
return cast(
npt.NDArray[np.float_],
- _weight_sr.loc[shap_values.index.get_level_values(-1)].values,
+ sample_weight.loc[shap_values.index.get_level_values(-1)].values,
)
else:
return None
@@ -565,15 +417,16 @@ class ShapInteractionValueContext(ShapContext):
Contextual data for global SHAP calculations based on SHAP interaction values.
"""
- def __init__(self, shap_calculator: ShapCalculator[Any]) -> None:
- shap_values: pd.DataFrame = shap_calculator.get_shap_interaction_values()
+ def __init__(
+ self, shap_calculator: ShapCalculator[Any], sample_weight: Optional[pd.Series]
+ ) -> None:
+ shap_values: pd.DataFrame = shap_calculator.shap_interaction_values
assert (
- shap_calculator.output_names_ is not None
- and shap_calculator.feature_index_ is not None
+ shap_calculator.feature_index_ is not None
), ASSERTION__CALCULATOR_IS_FITTED
n_features: int = len(shap_calculator.feature_index_)
- n_outputs: int = len(shap_calculator.output_names_)
+ n_outputs: int = len(shap_calculator.output_names)
n_observations: int = len(shap_values) // n_features
assert shap_values.shape == (
@@ -588,14 +441,13 @@ def __init__(self, shap_calculator: ShapCalculator[Any]) -> None:
# return a 1d array of weights that aligns with the observations axis of the
# SHAP values tensor (axis 1)
weight: Optional[npt.NDArray[np.float_]]
- assert shap_calculator.sample_ is not None and ASSERTION__CALCULATOR_IS_FITTED
- _weight_sr = shap_calculator.sample_.weight
- if _weight_sr is not None:
+
+ if sample_weight is not None:
_observation_indices = shap_values.index.get_level_values(
-2
).values.reshape((n_observations, n_features))[:, 0]
weight = ensure_last_axis_is_fast(
- _weight_sr.loc[_observation_indices].values
+ sample_weight.loc[_observation_indices].values
)
else:
weight = None
diff --git a/src/facet/inspection/_shap_projection.py b/src/facet/inspection/_shap_projection.py
index 0fe02c7f..2d82bef9 100644
--- a/src/facet/inspection/_shap_projection.py
+++ b/src/facet/inspection/_shap_projection.py
@@ -5,20 +5,18 @@
"""
import logging
from abc import ABCMeta, abstractmethod
-from typing import Any, Optional, Tuple, TypeVar
+from typing import Any, List, Optional, Tuple, TypeVar
import numpy as np
import numpy.typing as npt
+import pandas as pd
from pytools.api import AllTracker, inheritdoc
-from pytools.fit import fitted_only
+from pytools.fit import FittableMixin, fitted_only
-from ._shap import ShapCalculator
-from ._shap_global_explanation import (
+from ._shap_context import (
AffinityMatrix,
ShapContext,
- ShapGlobalExplainer,
- ShapInteractionGlobalExplainer,
ShapInteractionValueContext,
ShapValueContext,
cov,
@@ -28,6 +26,7 @@
sqrt,
transpose,
)
+from .shap import ShapCalculator
log = logging.getLogger(__name__)
@@ -44,6 +43,7 @@
#
T_Self = TypeVar("T_Self")
+T_Projector = TypeVar("T_Projector", bound="ShapProjector")
#
# Ensure all symbols introduced below are included in __all__
@@ -58,33 +58,111 @@
@inheritdoc(match="""[see superclass]""")
-class ShapProjector(ShapGlobalExplainer, metaclass=ABCMeta):
+class ShapProjector(FittableMixin[ShapCalculator[Any]], metaclass=ABCMeta):
"""
Base class for global pairwise model explanations based on SHAP vector projection.
+
+ Derives feature association as a global metric of SHAP values for multiple
+ observations.
"""
def __init__(self) -> None:
super().__init__()
+ self.feature_index_: Optional[pd.Index] = None
self.association_: Optional[AffinityMatrix] = None
+ @property
+ def is_fitted(self) -> bool:
+ """[see superclass]"""
+ return self.feature_index_ is not None
+
+ def fit( # type: ignore[override]
+ self: T_Projector,
+ shap_calculator: ShapCalculator[Any],
+ *,
+ sample_weight: Optional[pd.Series] = None,
+ **fit_params: Any,
+ ) -> T_Projector:
+ """
+        Calculate the SHAP decomposition for the SHAP values produced by the
+        given SHAP calculator.
+
+        :param shap_calculator: the fitted calculator from which to get the SHAP
+            values
+ :param sample_weight: optional sample weights to apply for the global
+ explanations; the index must match the index of the features used to
+ fit the SHAP calculator
+ """
+
+ self._reset_fit()
+
+ if len(fit_params) > 0:
+ raise ValueError(
+                f'unsupported fit parameters: {", ".join(fit_params)}'
+ )
+
+ self._calculate(
+ self._get_context(
+ shap_calculator=shap_calculator, sample_weight=sample_weight
+ )
+ )
+
+ self.feature_index_ = shap_calculator.feature_index_
+
+ return self
+
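+    # Usage sketch (names assumed): given a fitted SHAP calculator ``calc``
+    # and optional observation weights ``weights`` aligned with the fitted
+    # sample:
+    #
+    #     projector = ShapVectorProjector().fit(calc, sample_weight=weights)
+    #     association = projector.association(absolute=False, symmetrical=True)
+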
@fitted_only
def association(self, absolute: bool, symmetrical: bool) -> npt.NDArray[np.float_]:
- """[see superclass]"""
+ """
+ The association matrix for all feature pairs.
+
+ Raises an error if this global explainer has not been fitted.
+
+ :param absolute: if ``False``, return relative association as a percentage of
+ total feature importance;
+ if ``True``, return absolute association as a portion of feature importance
+ :param symmetrical: if ``False``, return an asymmetrical matrix
+ quantifying unilateral association of the features represented by rows
+ with the features represented by columns;
+ if ``True``, return a symmetrical matrix quantifying mutual association
+ :returns: the matrix as an array of shape (n_outputs, n_features, n_features)
+ """
assert self.association_ is not None
return self.association_.get_values(symmetrical=symmetrical, absolute=absolute)
- def _fit(self, shap_calculator: ShapCalculator[Any]) -> None:
- self._reset_fit()
- self._calculate(self._get_context(shap_calculator=shap_calculator))
+ def to_frames(self, matrix: npt.NDArray[np.float_]) -> List[pd.DataFrame]:
+ """
+ Transforms one or more affinity matrices into a list of data frames.
+
+ :param matrix: an array of shape `(n_outputs, n_features, n_features)`,
+ representing one or more affinity matrices
+ :return: a list of `n_outputs` data frames of shape `(n_features, n_features)`
+ """
+ assert self.feature_index_ is not None, "explainer is fitted"
+ index = self.feature_index_
+
+ n_features = len(index)
+ assert matrix.ndim == 3
+ assert matrix.shape[1:] == (n_features, n_features)
+
+ return [
+ pd.DataFrame(
+ m,
+ index=index,
+ columns=index,
+ )
+ for m in matrix
+ ]
def _reset_fit(self) -> None:
# revert status of this object to not fitted
- super()._reset_fit()
+ self.feature_index_ = None
self.association_ = None
@abstractmethod
- def _get_context(self, shap_calculator: ShapCalculator[Any]) -> ShapContext:
+ def _get_context(
+ self, shap_calculator: ShapCalculator[Any], sample_weight: Optional[pd.Series]
+ ) -> ShapContext:
pass
@abstractmethod
@@ -129,17 +207,23 @@ class ShapVectorProjector(ShapProjector):
onto a feature's main SHAP vector.
"""
- def _get_context(self, shap_calculator: ShapCalculator[Any]) -> ShapContext:
- return ShapValueContext(shap_calculator=shap_calculator)
+ def _get_context(
+ self, shap_calculator: ShapCalculator[Any], sample_weight: Optional[pd.Series]
+ ) -> ShapContext:
+ return ShapValueContext(
+ shap_calculator=shap_calculator, sample_weight=sample_weight
+ )
def _calculate(self, context: ShapContext) -> None:
# calculate association matrices for each SHAP context, then aggregate
self.association_ = self._calculate_association(context)
-@inheritdoc(match="""[see superclass]""")
-class ShapInteractionVectorProjector(ShapProjector, ShapInteractionGlobalExplainer):
+class ShapInteractionVectorProjector(ShapProjector):
"""
+ Derives feature association, synergy, and redundancy as a global metric of SHAP
+ interaction values for multiple observations.
+
Decomposes SHAP interaction scores (i.e, SHAP importance) of all possible pairings
of features into additive components for synergy, redundancy, and independence.
This is achieved through scalar projection of redundancy and synergy vectors
@@ -158,20 +242,50 @@ def __init__(self) -> None:
@fitted_only
def synergy(self, symmetrical: bool, absolute: bool) -> npt.NDArray[np.float_]:
- """[see superclass]"""
+ """
+ The synergy matrix for all feature pairs.
+
+ Raises an error if this global explainer has not been fitted.
+
+ :param absolute: if ``False``, return relative synergy as a percentage of
+ total feature importance;
+ if ``True``, return absolute synergy as a portion of feature importance
+ :param symmetrical: if ``False``, return an asymmetrical matrix
+ quantifying unilateral synergy of the features represented by rows
+ with the features represented by columns;
+ if ``True``, return a symmetrical matrix quantifying mutual synergy
+ :returns: the matrix as an array of shape (n_outputs, n_features, n_features)
+ """
assert self.synergy_ is not None, "Projector is fitted"
return self.synergy_.get_values(symmetrical=symmetrical, absolute=absolute)
@fitted_only
def redundancy(self, symmetrical: bool, absolute: bool) -> npt.NDArray[np.float_]:
- """[see superclass]"""
+ """
+ The redundancy matrix for all feature pairs.
+
+ Raises an error if this global explainer has not been fitted.
+
+ :param absolute: if ``False``, return relative redundancy as a percentage of
+ total feature importance;
+ if ``True``, return absolute redundancy as a portion of feature importance
+ :param symmetrical: if ``False``, return an asymmetrical matrix
+ quantifying unilateral redundancy of the features represented by rows
+ with the features represented by columns;
+ if ``True``, return a symmetrical matrix quantifying mutual redundancy
+ :returns: the matrix as an array of shape (n_outputs, n_features, n_features)
+ """
assert self.redundancy_ is not None, "Projector is fitted"
return self.redundancy_.get_values(symmetrical=symmetrical, absolute=absolute)
- def _get_context(self, shap_calculator: ShapCalculator[Any]) -> ShapContext:
- return ShapInteractionValueContext(shap_calculator=shap_calculator)
+ def _get_context(
+ self, shap_calculator: ShapCalculator[Any], sample_weight: Optional[pd.Series]
+ ) -> ShapContext:
+ return ShapInteractionValueContext(
+ shap_calculator=shap_calculator, sample_weight=sample_weight
+ )
def _calculate(self, context: ShapContext) -> None:
# calculate association, synergy, and redundancy matrices for the SHAP context
@@ -285,10 +399,12 @@ def _calculate_synergy_redundancy(
# Calculate relative synergy and redundancy (ranging from 0.0 to 1.0),
# as a symmetric and an asymmetric measure.
+ #
# For the symmetric case, we ensure perfect symmetry by removing potential
- # round-off errors
+ # round-off errors.
+ #
# NOTE: we do not store independence, so technically it could be removed from
- # the code above
+ # the code above.
std_p_i = sqrt(var_p_i)
return (
diff --git a/src/facet/inspection/_types.py b/src/facet/inspection/_types.py
new file mode 100644
index 00000000..cbdd2dbb
--- /dev/null
+++ b/src/facet/inspection/_types.py
@@ -0,0 +1,15 @@
+"""
+Type aliases for common use in the inspection package
+"""
+
+from typing import Callable, Union
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+
+# a function representing a model to be inspected
+ModelFunction = Callable[
+ [Union[pd.Series, pd.DataFrame, npt.NDArray[np.float_]]],
+ Union[pd.Series, npt.NDArray[np.float_], float],
+]
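+
+# For illustration (not part of the module): any plain function with a
+# matching signature satisfies the ``ModelFunction`` alias; the function name
+# below is hypothetical.
+#
+#     def mean_score(X: pd.DataFrame) -> npt.NDArray[np.float_]:
+#         # score each observation as the mean of its feature values
+#         return X.to_numpy().mean(axis=1)
+#
+#     model_fn: ModelFunction = mean_score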
diff --git a/src/facet/inspection/shap/__init__.py b/src/facet/inspection/shap/__init__.py
new file mode 100644
index 00000000..768499b7
--- /dev/null
+++ b/src/facet/inspection/shap/__init__.py
@@ -0,0 +1,6 @@
+"""
+Helper classes for SHAP calculations.
+"""
+
+from ._function import *
+from ._shap import *
diff --git a/src/facet/inspection/shap/_function.py b/src/facet/inspection/shap/_function.py
new file mode 100644
index 00000000..cd278074
--- /dev/null
+++ b/src/facet/inspection/shap/_function.py
@@ -0,0 +1,71 @@
+"""
+SHAP calculations for functions.
+"""
+
+import logging
+from typing import Generic, List, TypeVar
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+
+from pytools.api import AllTracker, inheritdoc
+
+from .._types import ModelFunction
+from ._shap import ShapCalculator
+
+log = logging.getLogger(__name__)
+
+__all__ = [
+ "FunctionShapCalculator",
+]
+
+
+#
+# Type variables
+#
+
+T_ModelFunction = TypeVar("T_ModelFunction", bound=ModelFunction)
+
+#
+# Ensure all symbols introduced below are included in __all__
+#
+
+__tracker = AllTracker(globals())
+
+
+#
+# Class definitions
+#
+
+
+@inheritdoc(match="""[see superclass]""")
+class FunctionShapCalculator(ShapCalculator[T_ModelFunction], Generic[T_ModelFunction]):
+ """
+ Calculate SHAP values for a function.
+ """
+
+ @property
+ def input_names(self) -> None:
+ """Always ``None``, since functions require no fixed names for their inputs."""
+ return None
+
+ @property
+ def output_names(self) -> List[str]:
+ """[see superclass]"""
+ try:
+ return [self.model.__name__]
+ except AttributeError:
+ return ["output"]
+
+ def _convert_shap_to_df(
+ self,
+ raw_shap_tensors: List[npt.NDArray[np.float_]],
+ observation_idx: pd.Index,
+ feature_idx: pd.Index,
+ ) -> List[pd.DataFrame]:
+ return self._convert_raw_shap_to_df(
+ raw_shap_tensors=raw_shap_tensors,
+ observation_idx=observation_idx,
+ feature_idx=feature_idx,
+ )
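+
+
+# A minimal usage sketch, with assumptions flagged: ``my_model_fn`` is a
+# hypothetical ``ModelFunction``, and ``function_explainer_factory`` stands in
+# for an explainer factory that supports plain functions (e.g. an exact or
+# kernel explainer factory).
+#
+#     calculator = FunctionShapCalculator(
+#         model=my_model_fn,
+#         explainer_factory=function_explainer_factory,
+#         interaction_values=False,
+#     ).fit(features_df)
+#     shap_df = calculator.shap_values  # observations x features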
diff --git a/src/facet/inspection/shap/_shap.py b/src/facet/inspection/shap/_shap.py
new file mode 100644
index 00000000..927f19eb
--- /dev/null
+++ b/src/facet/inspection/shap/_shap.py
@@ -0,0 +1,457 @@
+"""
+Implementation of package ``facet.inspection.shap``.
+"""
+
+import logging
+from abc import ABCMeta, abstractmethod
+from typing import Any, Generic, List, Optional, TypeVar, Union, cast
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+
+from pytools.api import AllTracker
+from pytools.fit import FittableMixin, fitted_only
+from pytools.parallelization import ParallelizableMixin
+
+from facet.inspection._explainer import (
+ BaseExplainer,
+ ExplainerFactory,
+ ParallelExplainer,
+)
+
+log = logging.getLogger(__name__)
+
+__all__ = [
+ "ShapCalculator",
+]
+
+#
+# Type variables
+#
+
+T_Model = TypeVar("T_Model")
+T_ShapCalculator = TypeVar("T_ShapCalculator", bound="ShapCalculator[Any]")
+
+
+#
+# Constants
+#
+
+ASSERTION__CALCULATOR_IS_FITTED = "calculator is fitted"
+
+
+#
+# Ensure all symbols introduced below are included in __all__
+#
+
+__tracker = AllTracker(globals())
+
+
+#
+# Class definitions
+#
+
+
+class ShapCalculator(
+ FittableMixin[pd.DataFrame],
+ ParallelizableMixin,
+ Generic[T_Model],
+ metaclass=ABCMeta,
+):
+ """
+ Base class for all SHAP calculators.
+
+ A SHAP calculator uses the ``shap`` package to calculate SHAP tensors for all
+    observations in a given sample of feature values, then consolidates and
+    aggregates the results into a data frame.
+ """
+
+ #: Name for the feature index (= column index) of the resulting SHAP data frame.
+ IDX_FEATURE = "feature"
+
+ #: Name of the index that is used to identify multiple outputs for which SHAP
+ #: values are calculated. To be overloaded by subclasses.
+ MULTI_OUTPUT_INDEX_NAME = "output"
+
+ #: The model for which to calculate SHAP values.
+ model: T_Model
+
+ #: The explainer factory used to create the SHAP explainer for this calculator.
+ explainer_factory: ExplainerFactory[T_Model]
+
+ #: The SHAP values for all observations this calculator has been fitted to.
+ shap_: Optional[pd.DataFrame]
+
+ #: The names of the features for which SHAP values were calculated.
+ feature_index_: Optional[pd.Index]
+
+ def __init__(
+ self,
+ model: T_Model,
+ *,
+ explainer_factory: ExplainerFactory[T_Model],
+ interaction_values: bool,
+ n_jobs: Optional[int] = None,
+ shared_memory: Optional[bool] = None,
+ pre_dispatch: Optional[Union[str, int]] = None,
+ verbose: Optional[int] = None,
+ ) -> None:
+ """
+ :param model: the model for which to calculate SHAP values
+ :param explainer_factory: the explainer factory used to create the SHAP
+ explainer for this calculator
+ :param interaction_values: if ``True``, calculate SHAP interaction values,
+ otherwise calculate SHAP values
+ """
+ super().__init__(
+ n_jobs=n_jobs,
+ shared_memory=shared_memory,
+ pre_dispatch=pre_dispatch,
+ verbose=verbose,
+ )
+ self.model = model
+ self.explainer_factory = explainer_factory
+ self.interaction_values = interaction_values
+
+ # the following attributes are set in fit()
+ self.shap_: Optional[pd.DataFrame] = None
+ self.feature_index_: Optional[pd.Index] = None
+
+ __init__.__doc__ = cast(str, __init__.__doc__) + cast(
+ str, ParallelizableMixin.__init__.__doc__
+ )
+
+ @property
+ @abstractmethod
+ def input_names(self) -> Optional[List[str]]:
+ """
+ The names of the inputs explained by this SHAP calculator, or ``None`` if
+ no names are defined.
+ """
+
+ @property
+ @abstractmethod
+ def output_names(self) -> List[str]:
+ """
+ The names of the outputs explained by this SHAP calculator.
+ """
+
+ @property
+ def is_fitted(self) -> bool:
+ """[see superclass]"""
+ return self.shap_ is not None
+
+ # noinspection PyPep8Naming
+ def fit(
+ self: T_ShapCalculator, __X: pd.DataFrame, **fit_params: Any
+ ) -> T_ShapCalculator:
+ """
+ Calculate the SHAP values.
+
+ :param __X: the observations for which to calculate SHAP values
+ :param fit_params: additional fit parameters (unused)
+ :return: self
+ :raises ValueError: if the observations are not a valid feature matrix
+ for this calculator
+ """
+
+ # reset fit in case we get an exception along the way
+ self._reset_fit()
+
+ # validate the feature matrix
+ self.validate_features(__X)
+
+ self.feature_index_ = __X.columns.rename(ShapCalculator.IDX_FEATURE)
+
+ # explain all observations using the model, resulting in a matrix of
+ # SHAP values for each observation and feature
+ shap_df: pd.DataFrame = self._calculate_shap(
+ features=__X, explainer=self._make_explainer(__X)
+ )
+
+ # re-order the observation index to match the sequence in the original
+ # training sample
+
+ n_levels = shap_df.index.nlevels
+ assert 1 <= n_levels <= 2
+ assert shap_df.index.names[0] == __X.index.name
+
+ self.shap_ = shap_df.reindex(
+ index=__X.index.intersection(
+ (
+ shap_df.index
+ if n_levels == 1
+ else cast(pd.MultiIndex, shap_df.index).levels[0]
+ ),
+ sort=False,
+ ),
+ level=0,
+ copy=False,
+ )
+
+ return self
+
+ @property
+ @fitted_only
+ def shap_values(self) -> pd.DataFrame:
+ r"""
+ The SHAP values per observation and feature, with shape
+        :math:`(\mathit{n\_observations}, \mathit{n\_outputs} * \mathit{n\_features})`
+ """
+
+ assert self.shap_ is not None, ASSERTION__CALCULATOR_IS_FITTED
+ if self.interaction_values:
+ return self.shap_.groupby(level=0).sum()
+ else:
+ return self.shap_
+
+ @property
+ @fitted_only
+ def shap_interaction_values(self) -> pd.DataFrame:
+ r"""
+ The SHAP interaction values per observation and feature pair, with shape
+        :math:`(\mathit{n\_observations} * \mathit{n\_features},
+        \mathit{n\_outputs} * \mathit{n\_features})`
+
+ :raise AttributeError: this SHAP calculator does not support interaction values
+ """
+ if self.interaction_values:
+ assert self.shap_ is not None, ASSERTION__CALCULATOR_IS_FITTED
+ return self.shap_
+ else:
+ raise AttributeError("interaction values are not supported")
+
+ @property
+ @fitted_only
+ def main_effects(self) -> pd.DataFrame:
+ r"""
+        The main effects per observation and feature (i.e., the diagonals of the
+        interaction matrices), with shape
+        :math:`(\mathit{n\_observations}, \mathit{n\_outputs} * \mathit{n\_features})`.
+
+ :raise AttributeError: this SHAP calculator does not support interaction values
+ """
+
+ if not self.interaction_values:
+ raise AttributeError("main effects are only defined for interaction values")
+
+ assert (
+ self.shap_ is not None and self.feature_index_ is not None
+ ), ASSERTION__CALCULATOR_IS_FITTED
+
+ n_observations = len(self.shap_)
+ n_features = len(self.feature_index_)
+ interaction_matrix = self.shap_
+
+ return pd.DataFrame(
+ np.diagonal(
+ interaction_matrix.values.reshape(
+ (n_observations, n_features, -1, n_features)
+ # observations x features x outputs x features
+ ),
+ axis1=1,
+ axis2=3,
+ ).reshape((n_observations, -1)),
+ # observations x (outputs * features)
+ index=cast(pd.MultiIndex, interaction_matrix.index).levels[0],
+ columns=interaction_matrix.columns,
+ )
+
+ def validate_features(self, features: pd.DataFrame) -> None:
+ """
+ Check that the given feature matrix is valid for this calculator.
+
+ :param features: the feature matrix to validate
+ :raise ValueError: if the feature matrix is not compatible with this
+ calculator
+ """
+
+ features_expected = self.input_names
+ if features_expected is None:
+ # no input names defined, so we cannot validate the features
+ return
+
+ diff = features.columns.symmetric_difference(features_expected)
+ if not diff.empty:
+ raise ValueError(
+                f"Features to be explained do not match the features used to fit the "
+ f"learner: expected {features_expected}, got "
+ f"{features.columns.tolist()}."
+ )
+
+ def _reset_fit(self) -> None:
+ # set this calculator to its initial unfitted state
+ self.shap_ = None
+ self.feature_index_ = None
+
+ def _make_explainer(self, features: pd.DataFrame) -> BaseExplainer:
+
+ # prepare the background dataset
+
+ background_dataset: Optional[pd.DataFrame]
+
+ if self.explainer_factory.uses_background_dataset:
+ background_dataset = features
+
+ background_dataset_not_na = background_dataset.dropna()
+
+ if len(background_dataset_not_na) != len(background_dataset):
+ n_original = len(background_dataset)
+ n_dropped = n_original - len(background_dataset_not_na)
+ log.warning(
+ f"{n_dropped} out of {n_original} observations in the background "
+ f"dataset have missing values after pre-processing and will be "
+ f"dropped."
+ )
+
+ background_dataset = background_dataset_not_na
+
+ else:
+ background_dataset = None
+
+ model = self.model
+ explainer = self.explainer_factory.make_explainer(
+ model=model, data=background_dataset
+ )
+
+ if self.n_jobs != 1:
+ explainer = ParallelExplainer(
+ explainer,
+ n_jobs=self.n_jobs,
+ shared_memory=self.shared_memory,
+ pre_dispatch=self.pre_dispatch,
+ verbose=self.verbose,
+ )
+
+ return explainer
+
+ def _calculate_shap(
+ self, *, features: pd.DataFrame, explainer: BaseExplainer
+ ) -> pd.DataFrame:
+ if features.isna().values.any():
+ log.warning(
+ "preprocessed features passed to SHAP explainer include NaN values; "
+ "try to change preprocessing to impute all NaN values"
+ )
+
+ multi_output_index_name = self.MULTI_OUTPUT_INDEX_NAME
+ multi_output_names = self.output_names
+ assert self.feature_index_ is not None, ASSERTION__CALCULATOR_IS_FITTED
+ feature_names = self.feature_index_
+
+ # calculate the shap values, and ensure the result is a list of arrays
+ shap_values: List[npt.NDArray[np.float_]] = self._convert_shap_tensors_to_list(
+ shap_tensors=(
+ explainer.shap_interaction_values(X=features)
+ if self.interaction_values
+ else explainer.shap_values(X=features)
+ ),
+ n_outputs=len(multi_output_names),
+ )
+
+ # convert to a data frame per output (different logic depending on whether
+ # we have a regressor or a classifier, implemented by method
+        # _convert_shap_to_df)
+
+ shap_values_df_per_output: List[pd.DataFrame] = self._convert_shap_to_df(
+ raw_shap_tensors=shap_values,
+ observation_idx=features.index,
+ feature_idx=feature_names,
+ )
+
+ # if we have a single output, return the data frame for that output;
+ # else, add a top level to the column index indicating each output
+
+ if len(shap_values_df_per_output) == 1:
+ return shap_values_df_per_output[0]
+ else:
+ return pd.concat(
+ shap_values_df_per_output,
+ axis=1,
+ keys=multi_output_names,
+ names=[multi_output_index_name, feature_names.name],
+ )
+
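+    # For a model with multiple outputs, the resulting frame has a two-level
+    # column index; e.g. (hypothetical labels):
+    #     ("class_A", "feature_1"), ("class_A", "feature_2"),
+    #     ("class_B", "feature_1"), ...
+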
+ def _convert_shap_tensors_to_list(
+ self,
+ *,
+ shap_tensors: Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]],
+ n_outputs: int,
+ ) -> List[npt.NDArray[np.float_]]:
+ def _validate_shap_tensor(_t: npt.NDArray[np.float_]) -> None:
+ if np.isnan(np.sum(_t)):
+ raise AssertionError(
+ "Output of SHAP explainer includes NaN values. "
+ "This should not happen; consider initialising the "
+ "LearnerInspector with an ExplainerFactory that has a different "
+ "configuration, or that makes SHAP explainers of a different type."
+ )
+
+ if isinstance(shap_tensors, list):
+ for shap_tensor in shap_tensors:
+ _validate_shap_tensor(shap_tensor)
+ else:
+ _validate_shap_tensor(shap_tensors)
+ shap_tensors = [shap_tensors]
+
+ if n_outputs != len(shap_tensors):
+ raise AssertionError(
+ f"count of SHAP tensors (n={len(shap_tensors)}) "
+ f"should match number of outputs (n={n_outputs})"
+ )
+
+ return shap_tensors
+
+ @abstractmethod
+ def _convert_shap_to_df(
+ self,
+ raw_shap_tensors: List[npt.NDArray[np.float_]],
+ observation_idx: pd.Index,
+ feature_idx: pd.Index,
+ ) -> List[pd.DataFrame]:
+ """
+ Convert the SHAP tensors for a single split to a data frame.
+
+ :param raw_shap_tensors: the raw values returned by the SHAP explainer
+ :param observation_idx: the ids used for indexing the explained observations
+ :param feature_idx: the feature names
+ :return: SHAP values of a single split as data frame
+ """
+ pass
+
+ def _convert_raw_shap_to_df(
+ self,
+ raw_shap_tensors: List[npt.NDArray[np.float_]],
+ observation_idx: pd.Index,
+ feature_idx: pd.Index,
+ ) -> List[pd.DataFrame]:
+ # Convert "raw output" shap tensors to data frames.
+ # This is typically the output obtained for regressors, or generic functions.
+ if self.interaction_values:
+ row_index = pd.MultiIndex.from_product(
+ iterables=(observation_idx, feature_idx),
+ names=(observation_idx.name, feature_idx.name),
+ )
+
+ return [
+ pd.DataFrame(
+ data=raw_interaction_tensor.reshape(
+ (-1, raw_interaction_tensor.shape[2])
+ ),
+ index=row_index,
+ columns=feature_idx,
+ )
+ for raw_interaction_tensor in raw_shap_tensors
+ ]
+ else:
+ return [
+ pd.DataFrame(
+ data=raw_shap_matrix, index=observation_idx, columns=feature_idx
+ )
+ for raw_shap_matrix in raw_shap_tensors
+ ]
+
+
+__tracker.validate()
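+
+
+# A worked sketch of the diagonal extraction behind ``main_effects`` above,
+# using made-up dimensions (2 observations, 3 features, 1 output):
+#
+#     import numpy as np
+#
+#     tensor = np.arange(2 * 3 * 1 * 3, dtype=float).reshape((2, 3, 1, 3))
+#     # observations x features x outputs x features
+#     diag = np.diagonal(tensor, axis1=1, axis2=3).reshape((2, -1))
+#     assert diag.shape == (2, 3)  # observations x (outputs * features)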
diff --git a/src/facet/inspection/shap/sklearn/__init__.py b/src/facet/inspection/shap/sklearn/__init__.py
new file mode 100644
index 00000000..98c5fc19
--- /dev/null
+++ b/src/facet/inspection/shap/sklearn/__init__.py
@@ -0,0 +1,5 @@
+"""
+SHAP calculators for *scikit-learn* regressors and classifiers.
+"""
+
+from ._sklearn import *
diff --git a/src/facet/inspection/shap/sklearn/_sklearn.py b/src/facet/inspection/shap/sklearn/_sklearn.py
new file mode 100644
index 00000000..a1d9539c
--- /dev/null
+++ b/src/facet/inspection/shap/sklearn/_sklearn.py
@@ -0,0 +1,324 @@
+"""
+Implementation of package ``facet.inspection.shap.sklearn``.
+"""
+
+import logging
+from abc import ABCMeta
+from typing import Generic, List, Optional, TypeVar, Union, cast
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+from sklearn.base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor
+
+from pytools.api import AllTracker, inheritdoc, subsdoc
+
+from facet.inspection._explainer import ExplainerFactory
+from facet.inspection.shap import ShapCalculator
+
+log = logging.getLogger(__name__)
+
+__all__ = [
+ "ClassifierShapCalculator",
+ "LearnerShapCalculator",
+ "RegressorShapCalculator",
+]
+
+#
+# Type variables
+#
+
+T_Classifier = TypeVar("T_Classifier", bound=ClassifierMixin)
+T_Learner = TypeVar("T_Learner", bound=Union[RegressorMixin, ClassifierMixin])
+T_Regressor = TypeVar("T_Regressor", bound=RegressorMixin)
+
+
+#
+# Ensure all symbols introduced below are included in __all__
+#
+
+__tracker = AllTracker(globals())
+
+
+#
+# Class definitions
+#
+
+
+class LearnerShapCalculator(
+ ShapCalculator[T_Learner], Generic[T_Learner], metaclass=ABCMeta
+):
+ """
+ Base class for SHAP calculators based on :mod:`sklearndf` learners.
+ """
+
+ @property
+ def input_names(self) -> Optional[List[str]]:
+ try:
+ return cast(List[str], self.model.feature_names_in_.tolist())
+ except AttributeError:
+ # the learner does not have a feature_names_in_ attribute,
+ return None
+
+
+@inheritdoc(match="""[see superclass]""")
+class RegressorShapCalculator(
+ LearnerShapCalculator[T_Regressor], Generic[T_Regressor], metaclass=ABCMeta
+):
+ """
+ Calculates SHAP (interaction) values for regression models.
+ """
+
+ @subsdoc(
+ pattern=r"(?m)(^\s*)(:param model: .*$)",
+ replacement=r"\1\2\n"
+ r"\1:param output_names: the names of the outputs of the regressor",
+ using=LearnerShapCalculator.__init__,
+ )
+ def __init__(
+ self,
+ model: T_Regressor,
+ *,
+ output_names: List[str],
+        explainer_factory: ExplainerFactory[T_Regressor],
+ interaction_values: bool,
+ n_jobs: Optional[int] = None,
+ shared_memory: Optional[bool] = None,
+ pre_dispatch: Optional[Union[str, int]] = None,
+ verbose: Optional[int] = None,
+ ) -> None:
+ """[see superclass]"""
+
+ super().__init__(
+ model=model,
+ explainer_factory=explainer_factory,
+ interaction_values=interaction_values,
+ n_jobs=n_jobs,
+ shared_memory=shared_memory,
+ pre_dispatch=pre_dispatch,
+ verbose=verbose,
+ )
+
+ if not is_regressor(model):
+ raise ValueError(
+ f"regressor SHAP calculator requires a regressor, "
+ f"but got a {type(model)}"
+ )
+
+ try:
+ n_outputs = model.n_outputs_
+ except AttributeError:
+ # assume a single output if the learner lacks the n_outputs_ attribute
+ n_outputs = 1
+
+ if n_outputs != len(output_names):
+ raise ValueError(
+ f"Number of output names ({len(output_names)}) does not match the "
+ f"number of outputs of the regressor ({n_outputs})."
+ )
+
+ self._output_names = output_names
+
+ #: Multi-output SHAP values are determined by target.
+ MULTI_OUTPUT_INDEX_NAME = "target"
+
+ @property
+ def output_names(self) -> List[str]:
+ """[see superclass]"""
+ return self._output_names
+
+ def _convert_shap_to_df(
+ self,
+ raw_shap_tensors: List[npt.NDArray[np.float_]],
+ observation_idx: pd.Index,
+ feature_idx: pd.Index,
+ ) -> List[pd.DataFrame]:
+ # Convert shap tensors to data frames.
+
+ return self._convert_raw_shap_to_df(
+ raw_shap_tensors=raw_shap_tensors,
+ observation_idx=observation_idx,
+ feature_idx=feature_idx,
+ )
+
+
+@inheritdoc(match="""[see superclass]""")
+class ClassifierShapCalculator(
+ LearnerShapCalculator[T_Classifier], Generic[T_Classifier], metaclass=ABCMeta
+):
+ """
+ Calculates SHAP (interaction) values for classification models.
+ """
+
+ #: Multi-output SHAP values are determined by class.
+ MULTI_OUTPUT_INDEX_NAME = "class"
+
+ def __init__(
+ self,
+ model: T_Classifier,
+ *,
+        explainer_factory: ExplainerFactory[T_Classifier],
+ interaction_values: bool,
+ n_jobs: Optional[int] = None,
+ shared_memory: Optional[bool] = None,
+ pre_dispatch: Optional[Union[str, int]] = None,
+ verbose: Optional[int] = None,
+ ) -> None:
+ """[see superclass]"""
+ super().__init__(
+ model=model,
+ explainer_factory=explainer_factory,
+ interaction_values=interaction_values,
+ n_jobs=n_jobs,
+ shared_memory=shared_memory,
+ pre_dispatch=pre_dispatch,
+ verbose=verbose,
+ )
+ if not is_classifier(model):
+ raise ValueError(
+ f"classifier SHAP calculator requires a classifier, "
+ f"but got a {type(model)}"
+ )
+
+ self._output_names = classifier_shap_output_names(model)
+
+ @property
+ def output_names(self) -> List[str]:
+ """[see superclass]"""
+ return self._output_names
+
+ @staticmethod
+ def validate_learner(learner: T_Classifier) -> None:
+ """[see superclass]"""
+
+ try:
+ n_outputs_ = learner.n_outputs_
+ except AttributeError:
+ # no n_outputs_ defined; we assume the classifier is not multi-target
+ pass
+ else:
+ if n_outputs_ > 1:
+ raise ValueError(
+ "classifier SHAP calculator does not support multi-output "
+ "classifiers, but got a classifier with n_outputs_="
+ f"{n_outputs_}"
+ )
+
+ def _convert_shap_tensors_to_list(
+ self,
+ *,
+ shap_tensors: Union[npt.NDArray[np.float_], List[npt.NDArray[np.float_]]],
+ n_outputs: int,
+ ) -> List[npt.NDArray[np.float_]]:
+
+ if n_outputs == 1 and isinstance(shap_tensors, list) and len(shap_tensors) == 2:
+            # in the binary classification case, we proceed with the SHAP values
+            # for class 1 (the positive class) only, since the values for class 0
+            # are simply the same values with the opposite sign
+
+            # to make sure no information is lost by dropping the class 0 tensor,
+            # warn if the two tensors do not cancel each other out:
+ if not np.allclose(shap_tensors[0], -shap_tensors[1]):
+ _raw_shap_tensor_totals = shap_tensors[0] + shap_tensors[1]
+ log.warning(
+ "shap values of binary classifiers should add up to 0.0 "
+ "for each observation and feature, but total shap values range "
+ f"from {_raw_shap_tensor_totals.min():g} "
+ f"to {_raw_shap_tensor_totals.max():g}"
+ )
+
+ return super()._convert_shap_tensors_to_list(
+ shap_tensors=shap_tensors[1], n_outputs=1
+ )
+ else:
+ return super()._convert_shap_tensors_to_list(
+ shap_tensors=shap_tensors, n_outputs=n_outputs
+ )
+
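+    # Toy illustration of the check above (hypothetical numbers): for a
+    # well-behaved binary classifier the two tensors cancel out, so keeping
+    # only the positive-class tensor loses no information:
+    #
+    #     t_neg = np.array([[-0.25, 0.10]])
+    #     t_pos = np.array([[0.25, -0.10]])
+    #     np.allclose(t_neg, -t_pos)  # True
+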
+ def _convert_shap_to_df(
+ self,
+ raw_shap_tensors: List[npt.NDArray[np.float_]],
+ observation_idx: pd.Index,
+ feature_idx: pd.Index,
+ ) -> List[pd.DataFrame]:
+
+ if self.interaction_values:
+ # return a list of data frame [(obs x features) x features],
+ # one for each of the outputs
+
+ # each row is indexed by an observation and a feature
+ row_index = pd.MultiIndex.from_product(
+ iterables=(observation_idx, feature_idx),
+ names=(observation_idx.name, feature_idx.name),
+ )
+
+ return [
+ pd.DataFrame(
+ data=raw_shap_interaction_matrix.reshape(
+ (-1, raw_shap_interaction_matrix.shape[2])
+ ),
+ index=row_index,
+ columns=feature_idx,
+ )
+ for raw_shap_interaction_matrix in raw_shap_tensors
+ ]
+
+ else:
+ # return a list of data frame [obs x features], one for each of the outputs
+
+ return [
+ pd.DataFrame(
+ data=raw_shap_matrix, index=observation_idx, columns=feature_idx
+ )
+ for raw_shap_matrix in raw_shap_tensors
+ ]
+
+
+__tracker.validate()
+
+
+#
+# auxiliary methods
+#
+
+
+def classifier_shap_output_names(classifier: ClassifierMixin) -> List[str]:
+ """
+ Get the names of the SHAP outputs that will be generated for the given classifier.
+
+ For binary classifiers, the only output name is the name of the positive class.
+ For multi-class classifiers, the output names are the names of all classes.
+
+ The classifier must be fitted, and must have a ``classes_`` attribute.
+
+ :param classifier: a classifier
+ :return: the names of the SHAP outputs
+ :raises ValueError: if the classifier does not define the ``classes_`` attribute,
+ is multi-output, or has only a single class
+ """
+ try:
+ # noinspection PyUnresolvedReferences
+ classes = classifier.classes_
+ except AttributeError as cause:
+ raise ValueError("classifier must define classes_ attribute") from cause
+
+ if not isinstance(classes, np.ndarray):
+ raise ValueError(
+ "classifier must be single-output, with classes_ as a numpy array"
+ )
+
+ class_names: List[str] = list(map(str, classes))
+ n_classes = len(class_names)
+
+ if n_classes == 1:
+ raise ValueError(f"cannot explain a model with single class {class_names[0]!r}")
+
+ elif n_classes == 2:
+        # for binary classifiers, we generate output only for the second
+        # (positive) class, since the probabilities of the two classes are
+        # complementary
+        return class_names[1:]
+
+ else:
+ return class_names
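+
+
+# A quick sanity check of the binary case (a sketch; the data are made up):
+#
+#     from sklearn.linear_model import LogisticRegression
+#
+#     X = [[0.0], [1.0], [2.0], [3.0]]
+#     clf = LogisticRegression().fit(X, [0, 0, 1, 1])
+#     classifier_shap_output_names(clf)  # -> ['1'], the positive class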
diff --git a/src/facet/selection/_parameters.py b/src/facet/selection/_parameters.py
index a434cc2b..778d0e02 100644
--- a/src/facet/selection/_parameters.py
+++ b/src/facet/selection/_parameters.py
@@ -23,6 +23,7 @@
from scipy import stats
from sklearn.base import BaseEstimator
+from typing_extensions import TypeAlias
from pytools.api import AllTracker, inheritdoc, subsdoc, to_list, validate_element_types
from pytools.expression import Expression, make_expression
@@ -41,11 +42,11 @@
#
-# Type constants
+# Type aliases
#
-ParameterSet = Union[List[Any], stats.rv_continuous, stats.rv_discrete]
-ParameterDict = Dict[str, ParameterSet]
+ParameterSet: TypeAlias = Union[List[Any], stats.rv_continuous, stats.rv_discrete]
+ParameterDict: TypeAlias = Dict[str, ParameterSet]
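+
+# example of a ParameterDict, mapping parameter names to parameter sets
+# (lists of values, or scipy distributions such as stats.randint(1, 10)):
+# {"max_depth": [3, 5, 7], "min_samples_leaf": stats.randint(1, 10)}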
try:
rv_frozen = next(
diff --git a/src/facet/selection/base/_parameters.py b/src/facet/selection/base/_parameters.py
index 04a0959c..a06e5a76 100644
--- a/src/facet/selection/base/_parameters.py
+++ b/src/facet/selection/base/_parameters.py
@@ -4,7 +4,7 @@
import logging
from abc import ABCMeta, abstractmethod
-from typing import Any, Dict, Generic, List, Optional, TypeVar, Union
+from typing import Any, Dict, Generic, List, Optional, TypeVar, Union, cast
import numpy.typing as npt
import pandas as pd
@@ -151,39 +151,45 @@ def _get_candidate(self) -> Union[ClassifierDF, RegressorDF, TransformerDF]:
return self.candidate
def _get_classes(self) -> Union[npt.NDArray[Any], List[npt.NDArray[Any]]]:
- return self._get_candidate().classes_
+ return cast(
+ Union[npt.NDArray[Any], List[npt.NDArray[Any]]],
+ self._get_candidate()._get_classes(),
+ )
# noinspection PyPep8Naming
def predict_proba(
- self, X: pd.DataFrame, **predict_params: Any
+ self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
"""[see superclass]"""
return self._get_candidate().predict_proba(X, **predict_params)
# noinspection PyPep8Naming
def predict_log_proba(
- self, X: pd.DataFrame, **predict_params: Any
+ self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
"""[see superclass]"""
return self._get_candidate().predict_log_proba(X, **predict_params)
# noinspection PyPep8Naming
def decision_function(
- self, X: pd.DataFrame, **predict_params: Any
+ self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
) -> Union[pd.Series, pd.DataFrame]:
"""[see superclass]"""
return self._get_candidate().decision_function(X, **predict_params)
# noinspection PyPep8Naming
def score(
- self, X: pd.DataFrame, y: pd.Series, sample_weight: Optional[pd.Series] = None
+ self,
+ X: Union[pd.Series, pd.DataFrame],
+ y: pd.Series,
+ sample_weight: Optional[pd.Series] = None,
) -> float:
"""[see superclass]"""
return self._get_candidate().score(X, y, sample_weight)
# noinspection PyPep8Naming
def predict(
- self, X: pd.DataFrame, **predict_params: Any
+ self, X: Union[pd.Series, pd.DataFrame], **predict_params: Any
) -> Union[pd.Series, pd.DataFrame]:
"""[see superclass]"""
return self._get_candidate().predict(X, **predict_params)
@@ -191,7 +197,7 @@ def predict(
# noinspection PyPep8Naming
def fit(
self: T_CandidateEstimatorDF,
- X: pd.DataFrame,
+ X: Union[pd.Series, pd.DataFrame],
y: Optional[Union[pd.Series, pd.DataFrame]] = None,
**fit_params: Any,
) -> T_CandidateEstimatorDF:
@@ -205,12 +211,12 @@ def is_fitted(self) -> bool:
return self.candidate is not None and self.candidate.is_fitted
# noinspection PyPep8Naming
- def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
+ def inverse_transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
"""[see superclass]"""
return self._get_candidate().inverse_transform(X)
# noinspection PyPep8Naming
- def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+ def transform(self, X: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
"""[see superclass]"""
return self._get_candidate().transform(X)
diff --git a/src/facet/simulation/_result.py b/src/facet/simulation/_result.py
index 7c1b68f0..fa4de644 100644
--- a/src/facet/simulation/_result.py
+++ b/src/facet/simulation/_result.py
@@ -113,7 +113,7 @@ def __init__(
:param baseline: the average observed actual output, acting as the baseline
of the simulation
:param confidence_level: the width of the confidence interval determined by
- bootstrapping, ranging between 0.0 and 1.0 (exclusive)
+ the standard error of the mean, ranging between 0.0 and 1.0 (exclusive)
"""
super().__init__()
diff --git a/src/facet/simulation/base/_base.py b/src/facet/simulation/base/_base.py
index bf9e671a..a10d2c80 100644
--- a/src/facet/simulation/base/_base.py
+++ b/src/facet/simulation/base/_base.py
@@ -22,7 +22,7 @@
from pytools.api import AllTracker
from pytools.parallelization import Job, JobRunner, ParallelizableMixin
-from sklearndf import LearnerDF, RegressorDF
+from sklearndf import RegressorDF, SupervisedLearnerDF
from facet.data import Sample
from facet.data.partition import Partitioner
@@ -41,7 +41,7 @@
#
T_Value = TypeVar("T_Value", bound=np.generic)
-T_LearnerDF = TypeVar("T_LearnerDF", bound=LearnerDF)
+T_SupervisedLearnerDF = TypeVar("T_SupervisedLearnerDF", bound=SupervisedLearnerDF)
#
@@ -52,7 +52,7 @@
class BaseUnivariateSimulator(
- ParallelizableMixin, Generic[T_LearnerDF], metaclass=ABCMeta
+ ParallelizableMixin, Generic[T_SupervisedLearnerDF], metaclass=ABCMeta
):
"""
Base class for univariate simulations.
@@ -71,7 +71,7 @@ class BaseUnivariateSimulator(
verbose: Optional[int]
#: The learner pipeline used to conduct simulations
- model: T_LearnerDF
+ model: T_SupervisedLearnerDF
#: The sample to be used in baseline calculations and simulations
sample: Sample
@@ -82,7 +82,7 @@ class BaseUnivariateSimulator(
def __init__(
self,
- model: T_LearnerDF,
+ model: T_SupervisedLearnerDF,
sample: Sample,
*,
confidence_level: float = 0.95,
@@ -199,13 +199,13 @@ def expected_output(self) -> float:
@staticmethod
@abstractmethod
- def _expected_learner_type() -> Type[T_LearnerDF]:
+ def _expected_learner_type() -> Type[T_SupervisedLearnerDF]:
pass
@staticmethod
@abstractmethod
def _simulate(
- model: T_LearnerDF, x: pd.DataFrame, name: str, value: Any
+ model: T_SupervisedLearnerDF, x: pd.DataFrame, name: str, value: Any
) -> Tuple[float, float]:
pass
diff --git a/test/test/conftest.py b/test/test/conftest.py
index 6288faff..bbfcb15b 100644
--- a/test/test/conftest.py
+++ b/test/test/conftest.py
@@ -1,5 +1,5 @@
import logging
-from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, cast
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, cast
import numpy as np
import numpy.typing as npt
@@ -31,7 +31,7 @@
import facet
from facet.data import Sample
-from facet.inspection import LearnerInspector, TreeExplainerFactory
+from facet.inspection import LearnerInspector
from facet.selection import LearnerSelector, ParameterSpace
from facet.validation import BootstrapCV, StratifiedBootstrapCV
@@ -211,31 +211,6 @@ def best_lgbm_model(
)
-@pytest.fixture # type: ignore
-def preprocessed_feature_names(
- best_lgbm_model: RegressorPipelineDF[LGBMRegressorDF],
-) -> Set[str]:
- """
- Names of all features after preprocessing
- """
- return set(best_lgbm_model.final_estimator.feature_names_in_)
-
-
-@pytest.fixture # type: ignore
-def regressor_inspector(
- best_lgbm_model: RegressorPipelineDF[LGBMRegressorDF], sample: Sample, n_jobs: int
-) -> LearnerInspector[RegressorPipelineDF[LGBMRegressorDF]]:
- inspector = LearnerInspector(
- pipeline=best_lgbm_model,
- explainer_factory=TreeExplainerFactory(
- feature_perturbation="tree_path_dependent", uses_background_dataset=True
- ),
- n_jobs=n_jobs,
- ).fit(sample=sample)
-
- return inspector
-
-
@pytest.fixture # type: ignore
def simple_preprocessor(sample: Sample) -> TransformerDF:
features = sample.features
@@ -257,7 +232,7 @@ def simple_preprocessor(sample: Sample) -> TransformerDF:
column_transforms.append(
(
STEP_ONE_HOT_ENCODE,
- OneHotEncoderDF(sparse=False, handle_unknown="ignore"),
+ OneHotEncoderDF(handle_unknown="ignore"),
list(map(str, category_columns)),
)
)
@@ -461,8 +436,8 @@ def iris_inspector_multi_class(
n_jobs: int,
) -> LearnerInspector[ClassifierPipelineDF[RandomForestClassifierDF]]:
return LearnerInspector(
- pipeline=iris_classifier_multi_class, shap_interaction=True, n_jobs=n_jobs
- ).fit(sample=iris_sample_multi_class)
+ model=iris_classifier_multi_class, shap_interaction=True, n_jobs=n_jobs
+ ).fit(iris_sample_multi_class)
#
diff --git a/test/test/facet/test_inspection.py b/test/test/facet/test_inspection.py
index 457a8232..f6a1f332 100644
--- a/test/test/facet/test_inspection.py
+++ b/test/test/facet/test_inspection.py
@@ -4,7 +4,7 @@
import logging
import platform
import warnings
-from typing import List, Optional, Set, TypeVar, cast
+from typing import Any, Dict, List, Optional, Type, TypeVar, cast
import numpy as np
import pandas as pd
@@ -16,7 +16,7 @@
from pytools.data import LinkageTree, Matrix
from pytools.viz.dendrogram import DendrogramDrawer, DendrogramReportStyle
-from sklearndf import __sklearn_1_1__, __sklearn_version__
+from sklearndf import ClassifierDF, __sklearn_1_1__, __sklearn_version__
from sklearndf.classification import (
GradientBoostingClassifierDF,
RandomForestClassifierDF,
@@ -27,8 +27,12 @@
from ..conftest import check_ranking
from facet.data import Sample
from facet.inspection import (
+ ExactExplainerFactory,
+ ExplainerFactory,
+ FunctionInspector,
KernelExplainerFactory,
LearnerInspector,
+ PermutationExplainerFactory,
TreeExplainerFactory,
)
from facet.selection import LearnerSelector
@@ -63,14 +67,36 @@ def test_regressor_selector(
)
+@pytest.mark.parametrize( # type: ignore
+ argnames=("explainer_factory_cls", "explainer_factory_args"),
+ argvalues=[
+ (
+ TreeExplainerFactory,
+ dict(
+ feature_perturbation="tree_path_dependent", uses_background_dataset=True
+ ),
+ ),
+ (KernelExplainerFactory, dict(link="identity", data_size_limit=8)),
+ (ExactExplainerFactory, {}),
+ (PermutationExplainerFactory, {}),
+ ],
+)
def test_model_inspection(
+ explainer_factory_cls: Type[ExplainerFactory[LGBMRegressorDF]],
+ explainer_factory_args: Dict[str, Any],
best_lgbm_model: RegressorPipelineDF[LGBMRegressorDF],
- preprocessed_feature_names: Set[str],
- regressor_inspector: LearnerInspector[RegressorPipelineDF[LGBMRegressorDF]],
sample: Sample,
n_jobs: int,
) -> None:
- shap_values: pd.DataFrame = regressor_inspector.shap_values()
+ # test the ModelInspector with the given explainer factory:
+
+ inspector = LearnerInspector(
+ model=best_lgbm_model,
+ explainer_factory=explainer_factory_cls(**explainer_factory_args),
+ n_jobs=n_jobs,
+ ).fit(sample)
+
+ shap_values: pd.DataFrame = inspector.shap_values()
# the length of rows in shap_values should be equal to the unique observation
# indices we have had in the predictions_df
@@ -81,7 +107,9 @@ def test_model_inspection(
assert shap_values.columns.names == [Sample.IDX_FEATURE]
# column index
- assert set(shap_values.columns) == preprocessed_feature_names
+ assert set(shap_values.columns) == set(
+ inspector.model.final_estimator.feature_names_in_
+ )
# check that the SHAP values add up to the predictions
shap_totals = shap_values.sum(axis=1)
@@ -90,24 +118,43 @@ def test_model_inspection(
# for every observation. This is always the same constant value,
# therefore the mean absolute deviation is zero.
- shap_minus_pred = shap_totals - best_lgbm_model.predict(X=sample.features)
+ shap_minus_pred = shap_totals - inspector.model.predict(X=sample.features)
assert (
round((shap_minus_pred - shap_minus_pred.mean()).abs().mean(), 12) == 0.0
), "predictions matching total SHAP"
+ # validate the linkage tree of the resulting inspector
- # test the ModelInspector with a KernelExplainer:
+ # if the inspector supports interaction values, test the redundancy linkage
+ # otherwise test the association linkage
+ if inspector.shap_interaction:
+ linkage = inspector.feature_redundancy_linkage()
+ mode = "Redundancy"
+ else:
+ linkage = inspector.feature_association_linkage()
+ mode = "Association"
- inspector_2 = LearnerInspector(
- pipeline=best_lgbm_model,
- explainer_factory=KernelExplainerFactory(link="identity", data_size_limit=20),
- n_jobs=n_jobs,
- ).fit(sample=sample)
- inspector_2.shap_values()
+ # validate the linkage tree
- linkage_tree = cast(LinkageTree, inspector_2.feature_association_linkage())
+ assert isinstance(linkage, LinkageTree)
+ # get the node whose children's distance is < 0.7, and confirm it is the only one
+ cluster_nodes = [
+ node
+ for node in linkage.iter_nodes()
+ if not node.is_leaf and node.children_distance < 0.7
+ ]
+ assert len(cluster_nodes) == 1, "only two features form a cluster"
+ # check the child nodes are Longitude and Latitude
+ children = linkage.children(cluster_nodes[0])
+ assert children is not None, "a cluster node has children"
+ assert {child.name.split("__")[-1] for child in children} == (
+ {"Longitude", "Latitude"}
+ ), "the cluster is Longitude and Latitude features"
print()
- DendrogramDrawer(style="text").draw(data=linkage_tree, title="Association")
+ DendrogramDrawer(style="text").draw(
+ data=linkage,
+ title=f"{inspector.explainer_factory.__class__.__name__} ({mode})",
+ )
def test_binary_classifier_ranking(
@@ -141,10 +188,10 @@ def test_model_inspection_classifier_binary(
) -> None:
model_inspector = LearnerInspector(
- pipeline=iris_classifier_binary,
+ model=iris_classifier_binary,
shap_interaction=False,
n_jobs=n_jobs,
- ).fit(sample=iris_sample_binary)
+ ).fit(iris_sample_binary)
# calculate the shap value matrix, without any consolidation
shap_values = model_inspector.shap_values()
@@ -191,7 +238,7 @@ def test_model_inspection_classifier_binary(
)
-def test_model_inspection_classifier_binary_single_shap_output() -> None:
+def test_model_inspection_classifier_binary_single_shap_output(n_jobs: int) -> None:
# simulate some data
x, y = make_classification(
n_samples=200, n_features=5, n_informative=5, n_redundant=0, random_state=42
@@ -210,16 +257,14 @@ def test_model_inspection_classifier_binary_single_shap_output() -> None:
).fit(sample_df.features, sample_df.target)
# fit the inspector
- LearnerInspector(pipeline=pipeline, n_jobs=-3).fit(sample=sample_df)
+ LearnerInspector(model=pipeline, n_jobs=n_jobs).fit(sample_df)
# noinspection DuplicatedCode
def test_model_inspection_classifier_multi_class(
- iris_inspector_multi_class: LearnerInspector[
- ClassifierPipelineDF[RandomForestClassifierDF]
- ],
+ iris_inspector_multi_class: LearnerInspector[RandomForestClassifierDF],
) -> None:
- iris_classifier = iris_inspector_multi_class.pipeline
+ iris_classifier = iris_inspector_multi_class.model
iris_sample = iris_inspector_multi_class.sample_
# calculate the shap value matrix, without any consolidation
@@ -237,7 +282,7 @@ def test_model_inspection_classifier_multi_class(
pd.Index(iris_sample.feature_names, name="feature")
)
assert feature_importance.columns.equals(
- pd.Index(iris_inspector_multi_class.output_names_, name="class")
+ pd.Index(iris_inspector_multi_class.output_names, name="class")
)
assert_allclose(
feature_importance.values,
@@ -327,7 +372,7 @@ def test_model_inspection_classifier_multi_class(
)
for output, linkage_tree in zip(
- iris_inspector_multi_class.output_names_, linkage_trees
+ iris_inspector_multi_class.output_names, linkage_trees
):
print()
DendrogramDrawer(style=DendrogramReportStyle()).draw(
@@ -336,9 +381,7 @@ def test_model_inspection_classifier_multi_class(
def _validate_shap_values_against_predictions(
- shap_values: pd.DataFrame,
- model: ClassifierPipelineDF[RandomForestClassifierDF],
- sample: Sample,
+ shap_values: pd.DataFrame, model: ClassifierDF, sample: Sample
) -> None:
# calculate the matching predictions, so we can check if the SHAP values add up
@@ -405,21 +448,21 @@ def test_model_inspection_classifier_interaction(
warnings.filterwarnings("ignore", message="You are accessing a training score")
model_inspector = LearnerInspector(
- pipeline=iris_classifier_binary,
+ model=iris_classifier_binary,
explainer_factory=TreeExplainerFactory(
feature_perturbation="tree_path_dependent", uses_background_dataset=True
),
n_jobs=n_jobs,
- ).fit(sample=iris_sample_binary)
+ ).fit(iris_sample_binary)
model_inspector_no_interaction = LearnerInspector(
- pipeline=iris_classifier_binary,
+ model=iris_classifier_binary,
shap_interaction=False,
explainer_factory=TreeExplainerFactory(
feature_perturbation="tree_path_dependent", uses_background_dataset=True
),
n_jobs=n_jobs,
- ).fit(sample=iris_sample_binary)
+ ).fit(iris_sample_binary)
# calculate shap interaction values
shap_interaction_values: pd.DataFrame = model_inspector.shap_interaction_values()
@@ -684,12 +727,12 @@ def test_model_inspection_classifier_interaction_dual_target(
with pytest.raises(
ValueError,
match=(
- f"only single-output classifiers .* are supported.*"
+ f"only single-target classifiers .* are supported.*"
f"{iris_target_name}.*{iris_target_name}2"
),
):
- LearnerInspector(pipeline=iris_classifier_dual_target, n_jobs=n_jobs).fit(
- sample=iris_sample_binary_dual_target
+ LearnerInspector(model=iris_classifier_dual_target, n_jobs=n_jobs).fit(
+ iris_sample_binary_dual_target
)
@@ -701,7 +744,7 @@ def test_shap_plot_data(
) -> None:
shap_plot_data = iris_inspector_multi_class.shap_plot_data()
# noinspection SpellCheckingInspection
- assert tuple(iris_inspector_multi_class.output_names_) == (
+ assert tuple(iris_inspector_multi_class.output_names) == (
"setosa",
"versicolor",
"virginica",
@@ -723,6 +766,52 @@ def test_shap_plot_data(
)
+def test_function_inspector(n_jobs: int) -> None:
+ # define a function to inspect, taking a data frame with feature columns
+ # x1 ... x5 as input and calculating
+ # sin(2 pi x1) * sin(2 pi (x2 + x3) / 2) + x4 + x5
+
+ pi2 = 2 * np.pi
+
+ def model_function(x: pd.DataFrame) -> pd.Series:
+ return np.sin(pi2 * x.x1) * np.sin(pi2 * (x.x2 + x.x3) / 2.0) + x.x4 + x.x5
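+ # note: x1 interacts multiplicatively with (x2 + x3), so x1 should show
+ # synergy with x2 and x3, while x4 and x5 contribute purely additively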
+
+ # create a background dataset with 1000 random observations
+ observations = pd.DataFrame(
+ np.random.random(size=(1000, 4)), columns=["x1", "x2", "x4", "x5"]
+ )
+ # column x3 is the same as x2
+ observations["x3"] = observations["x2"]
+ # add a column with the target values
+ observations["y"] = model_function(observations)
+ # create a sample from the background dataset
+ background = Sample(
+ observations=observations,
+ target_name="y",
+ )
+
+ # create a function inspector
+ inspector = FunctionInspector(
+ model=model_function,
+ feature_names=background.feature_names,
+ explainer_factory=ExactExplainerFactory(),
+ n_jobs=n_jobs,
+ )
+
+ # fit the inspector
+ inspector.fit(background)
+
+ # print the redundancy and synergy linkage using dendrogram drawers
+ print()
+ DendrogramDrawer(style="text").draw(
+ data=cast(LinkageTree, inspector.feature_redundancy_linkage()),
+ title="FunctionInspector (Redundancy)",
+ )
+ DendrogramDrawer(style="text").draw(
+ data=cast(LinkageTree, inspector.feature_synergy_linkage()),
+ title="FunctionInspector (Synergy)",
+ )
+
+
#
# Utility functions
#
diff --git a/test/test/facet/test_shap_decomposition.py b/test/test/facet/test_shap_decomposition.py
index 3fd3210d..e2866504 100644
--- a/test/test/facet/test_shap_decomposition.py
+++ b/test/test/facet/test_shap_decomposition.py
@@ -10,15 +10,33 @@
from sklearndf.pipeline import RegressorPipelineDF
from sklearndf.regression.extra import LGBMRegressorDF
-from facet.inspection import LearnerInspector
+from facet.data import Sample
+from facet.inspection import LearnerInspector, TreeExplainerFactory
log = logging.getLogger(__name__)
def test_feature_affinity_matrices(
- preprocessed_feature_names: Set[str],
- regressor_inspector: LearnerInspector[RegressorPipelineDF[LGBMRegressorDF]],
+ best_lgbm_model: RegressorPipelineDF[LGBMRegressorDF],
+ sample: Sample,
+ n_jobs: int,
) -> None:
+ regressor_inspector: LearnerInspector[
+ RegressorPipelineDF[LGBMRegressorDF]
+ ] = LearnerInspector(
+ model=best_lgbm_model,
+ explainer_factory=TreeExplainerFactory(
+ feature_perturbation="tree_path_dependent", uses_background_dataset=True
+ ),
+ n_jobs=n_jobs,
+ ).fit(
+ sample
+ )
+
+ preprocessed_feature_names: Set[str] = set(
+ best_lgbm_model.final_estimator.feature_names_in_
+ )
+
# feature affinity matrices (feature dependencies)
# check that dimensions of pairwise feature matrices are equal to # of features,
# and value ranges: