API: adopt revised design of pytools visualisations (#312)
* API: remove custom blank line at end of simulation output

* API: return LearnerInspector matrix outputs as Matrix instances

* API: move class LinkageTree to module pytools.data

* API: annotate affinity matrices with weights and axis labels

* API: feature/dependent feature --> primary feature/associated feature

* VIZ: set color of axis labels to foreground color

* API: order feature linkage leaves for minimal neighbour distance

* API: rename Matrix.data to .values, and .weight_label to .value_label

* API: set diagonals of affinity matrices to np.nan

* API: return Matrix instances from feature_interaction_matrix()

* REFACTOR: rename __feature_matrix_to_df to __arrays_to_matrix

* REFACTOR: import scipy modules instead of individual functions

* TEST: update unit tests for new Matrix class

* TEST: support NAN values in print_expected_matrix() helper function

* DOC: update release notes
j-ittner authored Oct 19, 2021
1 parent e4a5b44 commit d700cfd
Showing 8 changed files with 337 additions and 238 deletions.
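
For orientation, here is a minimal usage sketch of the revised API described above. This is not part of the commit: it assumes a LearnerInspector fitted elsewhere, the feature_…_linkage naming from the release notes, and unchanged pytools drawer call signatures.

    # hypothetical usage sketch -- not part of this commit
    from pytools.data import LinkageTree, Matrix  # LinkageTree now lives in pytools.data
    from pytools.viz.dendrogram import DendrogramDrawer
    from pytools.viz.matrix import MatrixDrawer

    inspector = ...  # assumed: a fitted facet.inspection.LearnerInspector

    redundancy: Matrix = inspector.feature_redundancy_matrix()  # now a Matrix, not a DataFrame
    MatrixDrawer().draw(redundancy, title="feature redundancy")

    linkage: LinkageTree = inspector.feature_redundancy_linkage()
    DendrogramDrawer().draw(linkage, title="feature redundancy linkage")

    # leaf order now matches the matrix row/column order; the FACET 1.x ordering
    # can be restored via sort_by_weight (assumed to return a re-sorted tree)
    DendrogramDrawer().draw(linkage.sort_by_weight(), title="FACET 1.x ordering")
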
27 changes: 24 additions & 3 deletions RELEASE_NOTES.rst
@@ -1,6 +1,26 @@
Release Notes
=============

FACET 2.0
---------

2.0.0
~~~~~

- API: return :class:`.LearnerInspector` matrix outputs as :class:`.Matrix` instances
- API: diagonals of feature synergy, redundancy, and association matrices are now
``nan`` instead of 1.0
- API: the leaf order of :class:`.LinkageTree` objects generated by
``feature_…_linkage`` methods of :class:`.LearnerInspector` is now the same as the
row and column order of :class:`.Matrix` objects returned by the corresponding
``feature_…_matrix`` methods of :class:`.LearnerInspector`, minimizing the distance
between adjacent leaves.
The old sorting behaviour of FACET 1.x can be restored using method
:meth:`.LinkageTree.sort_by_weight`
- VIZ: minor tweaks to simulation plots and reports generated by
:class:`.SimulationDrawer`


FACET 1.2
---------

@@ -22,7 +42,7 @@ the baseline of a simulation.
~~~~~

- BUILD: added support for *sklearndf* 1.2 and *scikit-learn* 0.24
- API: new optional parameter `subsample` in method
- API: new optional parameter ``subsample`` in method
:meth:`.BaseUnivariateSimulator.simulate_feature` can be used to specify a subsample
to be used in the simulation (but simulating using a crossfit based on the full
sample)
@@ -45,7 +65,7 @@ by the :class:`.LearnerInspector`.
~~~~~

- API: SHAP interaction vectors can (in part) also be influenced by redundancy among
features. This can inflate quantificatios of synergy, especially in cases where two
features. This can inflate quantifications of synergy, especially in cases where two
variables are highly redundant. FACET now corrects interaction vectors for redundancy
prior to calculating synergy. Technically we ensure that each interaction vector is
orthogonal w.r.t the main effect vectors of both associated features.
@@ -66,7 +86,8 @@ FACET 1.0
1.0.3
~~~~~

- FIX: restrict package requirements to *gamma-pytools* 1.0.* and *sklearndf* 1.0.x, since FACET 1.0 is not compatible with *gamma-pytools* 1.1.*
- FIX: restrict package requirements to *gamma-pytools* 1.0.* and *sklearndf* 1.0.x,
since FACET 1.0 is not compatible with *gamma-pytools* 1.1.*

1.0.2
~~~~~
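
A small follow-on sketch of the renamed Matrix accessors noted in the release notes above (single-output learner assumed, so a single Matrix is returned; inspector as in the sketch further up):

    import numpy as np

    synergy = inspector.feature_synergy_matrix()   # a pytools Matrix instance

    values = synergy.values                        # renamed from Matrix.data
    label = synergy.value_label                    # renamed from Matrix.weight_label
    assert np.all(np.isnan(np.diagonal(values)))   # diagonals are now nan rather than 1.0
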
12 changes: 6 additions & 6 deletions sphinx/source/tutorial/Classification_with_Facet.ipynb
@@ -203,7 +203,7 @@
"metadata": {},
"outputs": [],
"source": [
"from pytools.viz.dendrogram import DendrogramDrawer, LinkageTree\n",
"from pytools.viz.dendrogram import DendrogramDrawer\n",
"from pytools.viz.matrix import MatrixDrawer"
]
},
@@ -3152,10 +3152,10 @@
"evalue": "name 'TableOne' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-38-2d93f376f83b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 18\u001b[0m ]\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m mytable = TableOne(\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0mprediab_eda\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprediab_eda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pre_diab\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'TableOne' is not defined"
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
"\u001B[0;32m<ipython-input-38-2d93f376f83b>\u001B[0m in \u001B[0;36m<module>\u001B[0;34m\u001B[0m\n\u001B[1;32m 18\u001B[0m ]\n\u001B[1;32m 19\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m---> 20\u001B[0;31m mytable = TableOne(\n\u001B[0m\u001B[1;32m 21\u001B[0m \u001B[0mprediab_eda\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 22\u001B[0m \u001B[0mcolumns\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mprediab_eda\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcolumns\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mdrop\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m\"Pre_diab\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mto_list\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n",
"\u001B[0;31mNameError\u001B[0m: name 'TableOne' is not defined"
]
}
],
@@ -3294,4 +3294,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
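
The notebook cell above drops LinkageTree from the pytools.viz.dendrogram import; with the class relocated, the equivalent notebook imports would presumably become:

    from pytools.data import LinkageTree                # moved here from pytools.viz.dendrogram
    from pytools.viz.dendrogram import DendrogramDrawer
    from pytools.viz.matrix import MatrixDrawer
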
152 changes: 113 additions & 39 deletions src/facet/inspection/_inspection.py
@@ -7,13 +7,13 @@

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import leaves_list, linkage, optimal_leaf_ordering
from scipy.spatial.distance import squareform
from scipy.cluster import hierarchy
from scipy.spatial import distance

from pytools.api import AllTracker, inheritdoc
from pytools.data import LinkageTree, Matrix
from pytools.fit import FittableMixin
from pytools.parallelization import ParallelizableMixin
from pytools.viz.dendrogram import LinkageTree
from sklearndf import ClassifierDF, LearnerDF, RegressorDF
from sklearndf.pipeline import LearnerPipelineDF

@@ -491,7 +491,7 @@ def feature_synergy_matrix(
symmetrical: bool = False,
aggregation: Optional[str] = AGG_MEAN,
clustered: bool = True,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
) -> Union[Matrix, List[Matrix]]:
"""
Calculate the feature synergy matrix.
@@ -540,6 +540,7 @@ def feature_synergy_matrix(
affinity_symmetrical=explainer.synergy(
symmetrical=True, absolute=False, std=False
),
affinity_metric="synergy",
clustered=clustered,
)

@@ -550,7 +551,7 @@ def feature_redundancy_matrix(
symmetrical: bool = False,
aggregation: Optional[str] = AGG_MEAN,
clustered: bool = True,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
) -> Union[Matrix, List[Matrix]]:
"""
Calculate the feature redundancy matrix.
@@ -599,6 +600,7 @@ def feature_redundancy_matrix(
affinity_symmetrical=explainer.redundancy(
symmetrical=True, absolute=False, std=False
),
affinity_metric="redundancy",
clustered=clustered,
)

@@ -609,7 +611,7 @@ def feature_association_matrix(
symmetrical: bool = False,
aggregation: Optional[str] = AGG_MEAN,
clustered: bool = True,
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
) -> Union[Matrix, List[Matrix]]:
"""
Calculate the feature association matrix.
@@ -662,6 +664,7 @@ def feature_association_matrix(
affinity_symmetrical=global_explainer.association(
symmetrical=True, absolute=False, std=False
),
affinity_metric="association",
clustered=clustered,
)

@@ -725,7 +728,7 @@ def feature_association_linkage(self) -> Union[LinkageTree, List[LinkageTree]]:
)
)

def feature_interaction_matrix(self) -> Union[pd.DataFrame, List[pd.DataFrame]]:
def feature_interaction_matrix(self) -> Union[Matrix, List[Matrix]]:
"""
Calculate relative shap interaction values for all feature pairings.
@@ -825,7 +828,7 @@ def feature_interaction_matrix(self) -> Union[pd.DataFrame, List[pd.DataFrame]]:
)[np.newaxis, :, :]

# create a data frame from the feature matrix
return self.__feature_matrix_to_df(interaction_matrix)
return self.__arrays_to_matrix(
interaction_matrix, value_label="relative shap interaction"
)

def shap_plot_data(self) -> ShapPlotData:
"""
@@ -882,9 +887,9 @@ def shap_plot_data(self) -> ShapPlotData:
sample=sample,
)

def __feature_matrix_to_df(
self, matrix: np.ndarray
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
def __arrays_to_matrix(
self, matrix: np.ndarray, value_label: str
) -> Union[Matrix, List[Matrix]]:
# transform a matrix of shape (n_outputs, n_features, n_features)
# to a data frame

@@ -897,27 +902,38 @@ def __feature_matrix_to_df(

# convert array to data frame(s) with features as row and column indices
if len(matrix) == 1:
return pd.DataFrame(
data=matrix[0], index=feature_index, columns=feature_index
return self.__array_to_matrix(
matrix[0],
feature_importance=self.feature_importance(),
value_label=value_label,
)
else:
return [
pd.DataFrame(data=m, index=feature_index, columns=feature_index)
for m in matrix
self.__array_to_matrix(
m,
feature_importance=feature_importance,
value_label=f"{value_label} ({output_name})",
)
for m, (_, feature_importance), output_name in zip(
matrix, self.feature_importance().items(), self.output_names_
)
]

@staticmethod
def __feature_affinity_matrix(
self,
affinity_matrices: List[pd.DataFrame],
affinity_symmetrical: np.ndarray,
affinity_metric: str,
clustered: bool,
):
) -> Matrix:
if clustered:
affinity_matrices = LearnerInspector.__sort_affinity_matrices(
affinity_matrices = self.__sort_affinity_matrices(
affinity_matrices=affinity_matrices,
symmetrical_affinity_matrices=affinity_symmetrical,
)
return LearnerInspector.__isolate_single_frame(affinity_matrices)
return self.__isolate_single_frame(
affinity_matrices, affinity_metric=affinity_metric
)

@staticmethod
def __sort_affinity_matrices(
@@ -928,22 +944,14 @@ def __sort_affinity_matrices(
fn_linkage = LearnerInspector.__linkage_matrix_from_affinity_matrix_for_output

return [
affinity_matrix.iloc[feature_order, feature_order]
(lambda feature_order: affinity_matrix.iloc[feature_order, feature_order])(
feature_order=hierarchy.leaves_list(
Z=fn_linkage(feature_affinity_matrix=symmetrical_affinity_matrix)
)
)
for affinity_matrix, symmetrical_affinity_matrix in zip(
affinity_matrices, symmetrical_affinity_matrices
)
for feature_order in (
leaves_list(
Z=optimal_leaf_ordering(
Z=fn_linkage(
feature_affinity_matrix=symmetrical_affinity_matrix
),
y=symmetrical_affinity_matrix,
)
)
# reverse the index list so larger values tend to end up on top
[::-1],
)
]

@staticmethod
@@ -989,7 +997,8 @@ def __linkages_from_affinity_matrices(

return [
self.__linkage_tree_from_affinity_matrix_for_output(
feature_affinity_for_output, feature_importance_for_output
feature_affinity_for_output,
feature_importance_for_output,
)
for feature_affinity_for_output, (
_,
@@ -1037,10 +1046,20 @@ def __linkage_matrix_from_affinity_matrix_for_output(
# (1 = closest, 0 = most distant)

# compress the distance matrix (required by SciPy)
compressed_distance_vector = squareform(1 - abs(feature_affinity_matrix))
distance_matrix = 1.0 - abs(feature_affinity_matrix)
np.fill_diagonal(distance_matrix, 0.0)
compressed_distance_matrix: np.ndarray = distance.squareform(distance_matrix)

# calculate the linkage matrix
return linkage(y=compressed_distance_vector, method="single")
leaf_ordering: np.ndarray = hierarchy.optimal_leaf_ordering(
Z=hierarchy.linkage(y=compressed_distance_matrix, method="single"),
y=compressed_distance_matrix,
)

# reverse the leaf ordering, so that larger values tend to end up on top
leaf_ordering[:, [1, 0]] = leaf_ordering[:, [0, 1]]

return leaf_ordering

def _ensure_shap_interaction(self) -> None:
if not self._shap_interaction:
@@ -1050,14 +1069,69 @@ def _ensure_shap_interaction(self) -> None:
"enable calculations involving SHAP interaction values."
)

@staticmethod
def __isolate_single_frame(
self,
frames: List[pd.DataFrame],
) -> Union[pd.DataFrame, List[pd.DataFrame]]:
affinity_metric: str,
) -> Union[Matrix, List[Matrix]]:
feature_importance = self.feature_importance()

if len(frames) == 1:
return frames[0]
assert isinstance(feature_importance, pd.Series)
return self.__frame_to_matrix(
frames[0],
affinity_metric=affinity_metric,
feature_importance=feature_importance,
)
else:
return frames
return [
self.__frame_to_matrix(
frame,
affinity_metric=affinity_metric,
feature_importance=frame_importance,
feature_importance_category=str(frame_name),
)
for frame, (frame_name, frame_importance) in zip(
frames, feature_importance.items()
)
]

@staticmethod
def __array_to_matrix(
a: np.ndarray,
*,
feature_importance: pd.Series,
value_label: str,
) -> Matrix:
return Matrix(
a,
names=(feature_importance.index, feature_importance.index),
weights=(feature_importance, feature_importance),
value_label=value_label,
name_labels=("feature", "feature"),
)

@staticmethod
def __frame_to_matrix(
frame: pd.DataFrame,
*,
affinity_metric: str,
feature_importance: pd.Series,
feature_importance_category: Optional[str] = None,
) -> Matrix:
return Matrix.from_frame(
frame,
weights=(
feature_importance.reindex(frame.index),
feature_importance.reindex(frame.columns),
),
value_label=(
f"{affinity_metric} ({feature_importance_category})"
if feature_importance_category
else affinity_metric
),
name_labels=("primary feature", "associated feature"),
)

@staticmethod
def __validate_aggregation_method(aggregation: str) -> None:
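
To make the new leaf-ordering step easier to follow, here is a standalone sketch of the same SciPy calls used in __linkage_matrix_from_affinity_matrix_for_output above, applied to a toy affinity matrix (illustrative values only; the column reversal applied to the linkage matrix in the new code is omitted):

    import numpy as np
    from scipy.cluster import hierarchy
    from scipy.spatial import distance

    # toy symmetrical affinity matrix; self-affinity is undefined (nan)
    affinity = np.array(
        [
            [np.nan, 0.8, 0.1],
            [0.8, np.nan, 0.3],
            [0.1, 0.3, np.nan],
        ]
    )

    dist = 1.0 - np.abs(affinity)    # affinity 1.0 -> distance 0.0
    np.fill_diagonal(dist, 0.0)      # squareform requires a zero diagonal
    condensed = distance.squareform(dist)

    # single-linkage clustering, then reorder leaves to minimise neighbour distance
    Z = hierarchy.optimal_leaf_ordering(
        Z=hierarchy.linkage(y=condensed, method="single"),
        y=condensed,
    )
    print(hierarchy.leaves_list(Z))  # row/column order used for the clustered matrices
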
4 changes: 2 additions & 2 deletions src/facet/inspection/_shap_global_explanation.py
@@ -119,8 +119,8 @@ def from_relative_affinity(
where=affinity_abs_sym_ij_2x > 0.0,
)

# re-set the diagonal to 1.0 in case of rounding errors
fill_diagonal(affinity_rel_sym_ij, 1.0)
# affinity of a feature with itself is undefined
fill_diagonal(affinity_rel_sym_ij, np.nan)

# return the AffinityMatrices object
return AffinityMatrix(
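
One practical consequence of the nan diagonal set above, sketched with illustrative values: reductions over an affinity matrix should now be nan-aware, because a plain maximum over a row no longer returns the former 1.0 diagonal but propagates nan instead.

    import numpy as np

    affinity = np.array(
        [
            [np.nan, 0.8, 0.1],
            [0.8, np.nan, 0.3],
            [0.1, 0.3, np.nan],
        ]
    )

    # np.max would return nan for every row; nan-aware reductions pick out
    # each feature's strongest pairing instead
    print(np.nanmax(affinity, axis=1))     # [0.8 0.8 0.3]
    print(np.nanargmax(affinity, axis=1))  # index of the strongest partner per feature
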