Update readme #7

Merged: 4 commits, Nov 7, 2022

.flake8: 2 changes (1 addition & 1 deletion)
@@ -1,5 +1,5 @@
[flake8]
max-line-length = 120
max-line-length = 100
select = ANN,B,B9,BLK,C,D,DAR,E,F,S,W
ignore = E203,E501,W503
exclude =

README.md: 32 changes (22 additions & 10 deletions)
@@ -27,7 +27,10 @@
</p>


This repo contains the implementation to compute feature importance of correlated features.
This release looks at the role of joint feature importance for explainability in instances where
features may be highly correlated when providing an output. Specifically, the method operates by
regrouping the correlated features and then looking at the group-level impact of imputation. Doing
so allows us to consider the impact of a joint permutation of the correlated features.
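
The sketch below is added for illustration (it is not part of the package README or this diff). It assumes the function is importable as `groufi.group_permutation.group_permutation_importance` and that the call matches the docstring later in this PR; the data and estimator are invented:

```python
# Minimal sketch: group-level permutation importance on a dataset with two correlated columns.
import numpy as np
from sklearn.linear_model import Ridge

from groufi.group_permutation import group_permutation_importance

rng = np.random.RandomState(42)
base = rng.normal(size=(200, 1))
# Columns 0 and 1 are nearly identical, so they should end up in the same group.
features = np.hstack([base, base + 0.05 * rng.normal(size=(200, 1)), rng.normal(size=(200, 3))])
targets = features[:, 0] + 0.1 * rng.normal(size=200)

estimator = Ridge().fit(features, targets)
importances = group_permutation_importance(
    estimator=estimator,
    features=features,
    feature_names=[f"feat{i}" for i in range(5)],
    threshold=0.75,   # correlation above which two features are grouped together
    n_iter=20,
    random_state=42,
)
print(importances)  # DataFrame indexed by feature name, with 'Feature Importance' and 'group'
```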

## Examples

@@ -48,9 +51,12 @@ pip install groufi
### Installing from source

To install `groufi` from source, you can follow the steps below. First, you will need to
install [`poetry`](https://python-poetry.org/docs/master/). `poetry` is used to manage and install the dependencies.
If `poetry` is already installed on your machine, you can skip this step. There are several ways to install `poetry` so
you can use the one that you prefer. You can check the `poetry` installation by running the following command:
install [`poetry`](https://python-poetry.org/docs/master/). `poetry` is used to manage and install
the dependencies.
If `poetry` is already installed on your machine, you can skip this step. There are several ways to
install `poetry` so
you can use the one that you prefer. You can check the `poetry` installation by running the
following command:

```shell
poetry --version
```

@@ -62,29 +68,34 @@ Then, you can clone the git repository:

```shell
git clone git@github.com:BorealisAI/group-feature-importance.git
```

Then, it is recommended to create a Python 3.8+ virtual environment. This step is optional so you can skip it. To create
Then, it is recommended to create a Python 3.8+ virtual environment. This step is optional so you
can skip it. To create
a virtual environment, you can use the following command:

```shell
make conda
```

It automatically creates a conda virtual environment. When the virtual environment is created, you can activate it with
It automatically creates a conda virtual environment. When the virtual environment is created, you
can activate it with
the following command:

```shell
conda activate groufi
```

This example uses `conda` to create a virtual environment, but you can use other tools or configurations. Then, you
This example uses `conda` to create a virtual environment, but you can use other tools or
configurations. Then, you
should install the required package to use `groufi` with the following command:

```shell
make install
```

This command will install all the required packages. You can also use this command to update the required packages. This
command will check if there is a more recent package available and will install it. Finally, you can test the
This command will install all the required packages. You can also use this command to update the
required packages. This
command will check if there is a more recent package available and will install it. Finally, you can
test the
installation with the following command:

```shell
make test
```

@@ -93,5 +104,6 @@

## License

This repository is released under the Attribution-NonCommercial-ShareAlike 4.0 International license as found in
This repository is released under the Attribution-NonCommercial-ShareAlike 4.0 International license
as found in
the [LICENSE](LICENSE) file.

pyproject.toml: 6 changes (3 additions & 3 deletions)
@@ -1,6 +1,6 @@
[tool.poetry]
name = "groufi"
version = "0.0.2"
version = "0.0.3"
description = "A small library to compute group feature importance"
readme = "README.md"
authors = ["Borealis AI"]
@@ -68,12 +68,12 @@ addopts = "--color yes --durations 10 -rf"
# Configuration of the short test summary info https://docs.pytest.org/en/stable/usage.html#detailed-summary-report

[tool.black]
line-length = 120
line-length = 100
target-version = ["py38", "py39"]
include = '\.pyi?$'

[tool.pylint.FORMAT]
max-line-length = 120
max-line-length = 100

[tool.isort]
profile = "black"

src/groufi/group_permutation.py: 114 changes (65 additions & 49 deletions)
@@ -34,23 +34,24 @@ def group_permutation_importance(

Args:
estimator: Specifies a sklearn estimator with a ``predict`` method.
features (``numpy.ndarray`` of shape ``(num_examples, feature_size)``): Specifies the matrix of features.
features (``numpy.ndarray`` of shape ``(num_examples, feature_size)``): Specifies the
matrix of features.
feature_names: Specifies the names of each feature to make result interpretation easier.
threshold (float, optional): Specifies the threshold used to create the groups.
Two features are considered correlated if the correlation value is higher
than the threshold. Default: ``0.75``
n_iter (int, optional): Specifies the number of iterations of the basic algorithm.
Each iteration starting from a different random seed to create the feature permutation.
Default: ``20``
random_state (``None``, int or ``RandomState``, optional): see ``sklearn.utils.check_random_state``
documentation. Default: ``None``
Each iteration starting from a different random seed to create the feature
permutation. Default: ``20``
random_state (``None``, int or ``RandomState``, optional): see
``sklearn.utils.check_random_state`` documentation. Default: ``None``

Returns:
``pandas.DataFrame``: A DataFrame with the feature importance of each feature.
The index of the DataFrame is the feature names sorted by decreasing order of feature importance value.
The DataFrame has two columns: ``'Feature Importance'`` and ``'group'``.
``'Feature Importance'`` contains the estimated feature importance whereas ``'group'``
contains the correlated features to the current feature.
The index of the DataFrame is the feature names sorted by decreasing order of feature
importance value. The DataFrame has two columns: ``'Feature Importance'`` and
``'group'``. ``'Feature Importance'`` contains the estimated feature importance
whereas ``'group'`` contains the correlated features to the current feature.

Example:

@@ -93,7 +94,9 @@ def score_func(features: np.ndarray, y_true: np.ndarray):
).sort_values(by=["Feature Importance"], ascending=False)
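
A short sketch (not from the repository) of how the DataFrame described in the Returns block above might be consumed; the estimator, data, and feature names are invented, and the keyword names follow the docstring:

```python
# Sketch: reading the result DataFrame. The index is sorted by decreasing importance,
# so the first row is the most important feature together with its correlated group.
import numpy as np
from sklearn.linear_model import LinearRegression

from groufi.group_permutation import group_permutation_importance

X = np.random.RandomState(0).normal(size=(100, 3))
y = X[:, 0] - X[:, 2]
model = LinearRegression().fit(X, y)

result = group_permutation_importance(
    estimator=model, features=X, feature_names=["f0", "f1", "f2"]
)
top = result.index[0]
print(top, result.loc[top, "Feature Importance"], result.loc[top, "group"])
```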


def create_correlated_groups(correlation: np.ndarray, threshold: float = 0.75) -> Tuple[Tuple[int, ...], ...]:
def create_correlated_groups(
correlation: np.ndarray, threshold: float = 0.75
) -> Tuple[Tuple[int, ...], ...]:
r"""Creates the groups of correlated features.

Note: NaN is interpreted as no correlation between the two variables.
@@ -105,12 +108,12 @@ def create_correlated_groups(correlation: np.ndarray, threshold: float = 0.75) -
than the threshold. Default: ``0.75``

Returns:
tuple: The group of correlated features for each feature. It is represented as a tuple of tuples.
The outer tuple indicates each feature, and the inner tuples indicates the correlated groups.
If the output is the variable ``output``, ``output[i]`` indicates the group of features
correlated to the ``i``-th feature.``output[i][j]`` is the ``j``-th correlated feature
to the ``i``-th feature. Note that the current feature is always included in the group
of correlated features.
tuple: The group of correlated features for each feature. It is represented as a tuple
of tuples. The outer tuple indicates each feature, and the inner tuples indicates
the correlated groups. If the output is the variable ``output``, ``output[i]``
indicates the group of features correlated to the ``i``-th feature.``output[i][j]``
is the ``j``-th correlated feature to the ``i``-th feature. Note that the current
feature is always included in the group of correlated features.

Raises:
ValueError if ``correlation`` is not a squared matrix.
@@ -125,9 +128,14 @@ def create_correlated_groups(correlation: np.ndarray, threshold: float = 0.75) -
((0, 2), (1,), (0, 2))
"""
if correlation.ndim != 2:
raise ValueError(f"`correlation` has to be 2 dimensional array (received: {correlation.ndim})")
raise ValueError(
f"`correlation` has to be 2 dimensional array (received: {correlation.ndim})"
)
if correlation.shape[0] != correlation.shape[1]:
raise ValueError(f"Incorrect shape. `correlation` has to be a squared matrix (received: {correlation.shape})")
raise ValueError(
"Incorrect shape. `correlation` has to be a squared matrix "
f"(received: {correlation.shape})"
)
indices = []
for i in range(correlation.shape[0]):
indices.append(tuple(np.flatnonzero(correlation[i] >= threshold).tolist()))
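
To make the grouping rule concrete, a small sketch (not part of the diff) using the same matrix as the docstring example; obtaining the matrix via `np.corrcoef` is only one common option:

```python
# Sketch: group each feature with the features whose correlation meets the threshold.
import numpy as np

from groufi.group_permutation import create_correlated_groups

corr = np.array([[1.0, 0.3, 0.8],
                 [0.1, 1.0, 0.1],
                 [0.8, 0.3, 1.0]])
print(create_correlated_groups(corr, threshold=0.75))  # ((0, 2), (1,), (0, 2))

# With real data the matrix would typically come from the feature matrix itself,
# e.g. np.corrcoef(features, rowvar=False); per the docstring, NaN counts as no correlation.
```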
@@ -138,8 +146,8 @@ def show_correlated_groups(groups: Sequence[Sequence[int]], names: Sequence[str]
r"""Shows the correlated groups.

Args:
groups: Specifies the groups of correlated features. See the output of ``create_correlated_groups``
for more information about the structure.
groups: Specifies the groups of correlated features. See the output of
``create_correlated_groups`` for more information about the structure.
names: Specifies the names of each feature.

Raises:
@@ -160,7 +168,9 @@ def show_correlated_groups(groups: Sequence[Sequence[int]], names: Sequence[str]
(02) feat3
"""
if len(groups) != len(names):
raise ValueError(f"`groups` ({len(groups)}) and `names` ({len(names)}) should have the same length")
raise ValueError(
f"`groups` ({len(groups)}) and `names` ({len(names)}) should have the same length"
)
for i, group in enumerate(groups):
corr_names = "\n".join([f"\t({j:02d}) {names[j]}" for j in group])
logger.debug(f"Group ({i:02d}) {names[i]}:\n{corr_names}")
Expand All @@ -178,12 +188,12 @@ def iter_shuffled(
if you want to use multiple of them at the same time, make copies.

Args:
features (``numpy.ndarray`` of shape ``(num_examples, feature_size)``): Specifies the matrix of features
to shuffle.
groups: Specifies the groups of correlated features. See the output of ``create_correlated_groups``
for more information about the structure.
random_state (``None``, int or ``RandomState``, optional): see ``sklearn.utils.check_random_state``
documentation. Default: ``None``
features (``numpy.ndarray`` of shape ``(num_examples, feature_size)``): Specifies the
matrix of features to shuffle.
groups: Specifies the groups of correlated features. See the output of
``create_correlated_groups`` for more information about the structure.
random_state (``None``, int or ``RandomState``, optional): see
``sklearn.utils.check_random_state`` documentation. Default: ``None``

Returns:
Each item in the iterable is a ``numpy.ndarray`` of shape ``(num_examples, feature_size)``.
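
A sketch of the copy pattern hinted at in the warning above (it mirrors the call used in the unit tests further down this diff):

```python
# Sketch: iter_shuffled reuses the yielded array internally, so copy each item
# if more than one shuffled matrix is needed at the same time.
import numpy as np

from groufi.group_permutation import iter_shuffled

features = np.arange(15).reshape((3, 5))
groups = ((0, 2), (1,))  # columns 0 and 2 are permuted jointly, then column 1 alone
arrays = [arr.copy() for arr in iter_shuffled(features, groups, random_state=0)]
assert all(arr.shape == (3, 5) for arr in arrays)
```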
@@ -211,22 +221,27 @@ def compute_scores_shuffled(
r"""Computes the scores associated where the features are shuffled by group.

Args:
score_func (callable): Specifies the callable used to compute the score of the predictions done
with shuffled features.
features (``numpy.array`` of shape ``(num_examples, feature_size)``): Specifies the matrix of features.
targets (``numpy.array`` of shape ``(num_examples, prediction_size)``): Specifies the matrix of
targets.
groups: Specifies the groups of correlated features. See the output of ``create_correlated_groups``
for more information about the structure.
random_state (``None``, int or ``RandomState``, optional): see ``sklearn.utils.check_random_state``
documentation. Default: ``None``
score_func (callable): Specifies the callable used to compute the score of the
predictions done with shuffled features.
features (``numpy.array`` of shape ``(num_examples, feature_size)``): Specifies the
matrix of features.
targets (``numpy.array`` of shape ``(num_examples, prediction_size)``): Specifies the
matrix of targets.
groups: Specifies the groups of correlated features. See the output of
``create_correlated_groups`` for more information about the structure.
random_state (``None``, int or ``RandomState``, optional): see
``sklearn.utils.check_random_state`` documentation. Default: ``None``

Returns:
``numpy.array`` of shape ``(feature_size,)``: The score associated to each feature when its
associated group is shuffled.
``numpy.array`` of shape ``(feature_size,)``: The score associated to each feature
when its associated group is shuffled.
"""
iter_shuffled_features = iter_shuffled(features=features, groups=groups, random_state=random_state)
return np.array([score_func(shuffled_features, targets) for shuffled_features in iter_shuffled_features])
iter_shuffled_features = iter_shuffled(
features=features, groups=groups, random_state=random_state
)
return np.array(
[score_func(shuffled_features, targets) for shuffled_features in iter_shuffled_features]
)
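
A sketch (not from the repository) of how a scoring callable plugs into `compute_scores_shuffled`; the negative-MSE score function and data are invented, and the keyword names follow the docstring:

```python
# Sketch: one score per feature, computed after shuffling that feature's correlated group.
import numpy as np
from sklearn.linear_model import LinearRegression

from groufi.group_permutation import compute_scores_shuffled, create_correlated_groups

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
y = X[:, 0] + X[:, 2]
model = LinearRegression().fit(X, y)

def score_func(features: np.ndarray, targets: np.ndarray) -> float:
    # Higher is better: negative mean squared error of the fitted model.
    return -np.mean((model.predict(features) - targets) ** 2)

groups = create_correlated_groups(np.corrcoef(X, rowvar=False))
scores = compute_scores_shuffled(
    score_func=score_func, features=X, targets=y, groups=groups, random_state=0
)
print(scores.shape)  # (feature_size,), as described in the Returns section
```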


def get_score_importances(
@@ -240,17 +255,18 @@
"""Computes the score importance.

Args:
score_func (callable): Specifies the callable used to compute the score of the predictions done
with shuffled features.
features (``numpy.array`` of shape ``(num_examples, feature_size)``): Specifies the matrix of features.
targets (``numpy.array`` of shape ``(num_examples, prediction_size)``): Specifies the matrix of
targets.
groups: Specifies the groups of correlated features. See the output of ``create_correlated_groups``
for more information about the structure.
score_func (callable): Specifies the callable used to compute the score of the
predictions done with shuffled features.
features (``numpy.array`` of shape ``(num_examples, feature_size)``): Specifies the
matrix of features.
targets (``numpy.array`` of shape ``(num_examples, prediction_size)``): Specifies the
matrix of targets.
groups: Specifies the groups of correlated features. See the output of
``create_correlated_groups`` for more information about the structure.
n_iter (int, optional): Specifies the number of iterations of the basic algorithm.
Each iteration starting from a different random seed. Default: ``20``
random_state (``None``, int or ``RandomState``, optional): see ``sklearn.utils.check_random_state``
documentation. Default: ``None``
random_state (``None``, int or ``RandomState``, optional): see
``sklearn.utils.check_random_state`` documentation. Default: ``None``

Returns:
``(base_score, score_decreases)`` tuple with the base score and score decreases
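
And a sketch for `get_score_importances` itself. The exact layout of `score_decreases` is collapsed in this diff, so the averaging step at the end is an assumption; the data and score function are invented:

```python
# Sketch: base score plus per-iteration score decreases. The np.mean aggregation
# assumes score_decreases holds one array of per-feature score drops per iteration.
import numpy as np
from sklearn.linear_model import LinearRegression

from groufi.group_permutation import create_correlated_groups, get_score_importances

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
y = X[:, 0] + X[:, 2]
model = LinearRegression().fit(X, y)

def score_func(features: np.ndarray, targets: np.ndarray) -> float:
    return -np.mean((model.predict(features) - targets) ** 2)

groups = create_correlated_groups(np.corrcoef(X, rowvar=False))
base_score, score_decreases = get_score_importances(
    score_func=score_func, features=X, targets=y, groups=groups, n_iter=20, random_state=0
)
mean_decrease = np.mean(score_decreases, axis=0)  # assumed: average drop per feature
```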

tests/test_group_permutation.py: 20 changes (15 additions & 5 deletions)
@@ -116,7 +116,9 @@ def test_group_permutation_importance_different_random_states(estimator: BaseEst


def test_create_correlated_groups():
assert create_correlated_groups(correlation=np.array([[1, 0.3, 0.8], [0.1, 1, 0.1], [0.8, 0.3, 1]])) == (
assert create_correlated_groups(
correlation=np.array([[1, 0.3, 0.8], [0.1, 1, 0.1], [0.8, 0.3, 1]])
) == (
(0, 2),
(1,),
(0, 2),
@@ -128,7 +130,9 @@ def test_create_correlated_groups_independent():


def test_create_correlated_groups_with_nan():
assert create_correlated_groups(correlation=np.array([[1, 0.3, np.nan], [0.1, 1, 0.1], [0.8, 0.3, np.nan]])) == (
assert create_correlated_groups(
correlation=np.array([[1, 0.3, np.nan], [0.1, 1, 0.1], [0.8, 0.3, np.nan]])
) == (
(0,),
(1,),
(0,),
@@ -152,7 +156,9 @@ def test_create_correlated_groups_incorrect_shape():

def test_show_correlated_groups(caplog):
with caplog.at_level(logging.DEBUG):
show_correlated_groups(((0, 2), (1,), (0, 2), tuple()), ["feature1", "feature2", "feature3", "feature4"])
show_correlated_groups(
((0, 2), (1,), (0, 2), tuple()), ["feature1", "feature2", "feature3", "feature4"]
)
assert len(caplog.messages) == 4


@@ -167,7 +173,9 @@ def test_show_correlated_groups_incorrect_length():


def test_iter_shuffled():
arrays = [array.copy() for array in iter_shuffled(np.arange(15).reshape((3, 5)), ((0, 2), (1,)))]
arrays = [
array.copy() for array in iter_shuffled(np.arange(15).reshape((3, 5)), ((0, 2), (1,)))
]
assert all(array.shape == (3, 5) for array in arrays)


@@ -177,7 +185,9 @@ def shuffle_mock(x: np.array):

rng_mock = Mock(shuffle=shuffle_mock)
with patch("groufi.group_permutation.check_random_state", Mock(return_value=rng_mock)):
arrays = [array.copy() for array in iter_shuffled(np.arange(15).reshape((3, 5)), ((0, 2), (1,)))]
arrays = [
array.copy() for array in iter_shuffled(np.arange(15).reshape((3, 5)), ((0, 2), (1,)))
]
assert np.array_equal(
arrays[0],
np.array(