Add some docs explaining how to "analyze FS algorithm stability" (#83)

* [WIP] Analyzing algorithm stability recipe * Finish stability example * Reformat & imports * Add note for relative vs absolute paths * Algorithm stability recipe 📈 First version * Finish algorithm stability example 🙌🏻
dunnkers · Oct 4, 2022 · 4c93031 · 4c93031
1 parent 54c7232
commit 4c93031
Show file tree

Hide file tree

Showing 20 changed files with 1,981 additions and 13 deletions.
diff --git a/examples/algorithm-stability-yaml/analyze-results.ipynb b/examples/algorithm-stability-yaml/analyze-results.ipynb
diff --git a/examples/algorithm-stability-yaml/benchmark.py b/examples/algorithm-stability-yaml/benchmark.py
@@ -0,0 +1,111 @@
+from typing import Dict, Optional, Union
+
+import hydra
+import numpy as np
+import pandas as pd
+from skrebate import ReliefF
+
+from fseval.config import PipelineConfig
+from fseval.main import run_pipeline
+from fseval.types import AbstractEstimator, AbstractMetric, Callback
+
+"""
+The checkInputType and getStability functions come from the following paper:
+
+[1] On the Stability of Feature Selection. Sarah Nogueira, Konstantinos Sechidis, Gavin Brown. 
+    Journal of Machine Learning Research (JMLR). 2017.
+You can find a full demo using this package at:
+http://htmlpreview.github.io/?https://github.com/nogueirs/JMLR2017/blob/master/python/stabilityDemo.html
+NB: This package requires the installation of the packages: numpy, scipy and math
+"""
+
+
+def checkInputType(Z):
+    """Validate the matrix Z and hand it back as a numpy.ndarray.
+
+    A list is converted with `np.asarray`; a numpy.ndarray is returned as-is.
+
+    OUTPUT: The input Z as a 2-dimensional numpy.ndarray
+    RAISES: ValueError if Z is neither a list nor a numpy.ndarray, or if it
+            does not have exactly 2 dimensions.
+    """
+    # Anything that is not a list or a numpy array is rejected up front.
+    if not isinstance(Z, (list, np.ndarray)):
+        raise ValueError("The input matrix Z should be of type list or numpy.ndarray")
+    # Lists are converted; arrays pass through untouched (no copy).
+    matrix = np.asarray(Z) if isinstance(Z, list) else Z
+    # A matrix has exactly 2 dimensions.
+    if matrix.ndim == 2:
+        return matrix
+    raise ValueError("The input matrix Z should be of dimension 2")
+
+
+def getStability(Z):
+    """
+    Let us assume we have M>1 feature sets and d>0 features in total.
+    This function computes the stability estimate as given in Definition 4 in  [1].
+
+    INPUT: A BINARY matrix Z (given as a list or as a numpy.ndarray of size M*d).
+           Each row of the binary matrix represents a feature set, where a 1 at the f^th position
+           means the f^th feature has been selected and a 0 means it has not been selected.
+
+    OUTPUT: The stability of the feature selection procedure. `nan` is returned when the
+            estimate is undefined because its denominator is zero, i.e. when no feature
+            or every feature is selected in all feature sets.
+
+    RAISES: ValueError if Z is not a 2-dimensional list / numpy.ndarray, or if it contains
+            fewer than 2 feature sets.
+    """
+    Z = checkInputType(Z)
+    M, d = Z.shape
+    # The correction factor M / (M - 1) is undefined for a single feature set; fail
+    # with an explicit message instead of a bare ZeroDivisionError.
+    if M < 2:
+        raise ValueError("The input matrix Z should contain at least 2 feature sets (rows)")
+    # Selection frequency of every feature over the M feature sets.
+    hatPF = np.mean(Z, axis=0)
+    # Average number of selected features per feature set.
+    kbar = np.sum(hatPF)
+    denom = (kbar / d) * (1 - kbar / d)
+    # kbar == 0 or kbar == d: the estimate is 0/0. Return nan explicitly rather than
+    # relying on NumPy's invalid-division RuntimeWarning to produce it.
+    if denom == 0:
+        return np.nan
+    return 1 - (M / (M - 1)) * np.mean(np.multiply(hatPF, 1 - hatPF)) / denom
+
+
+class StabilityNogueira(AbstractMetric):
+    """Metric that collects feature-support vectors and scores their stability.
+
+    `score_ranking` stores the ranker's `feature_support_` on every call;
+    `score_bootstrap` turns the stored vectors into a matrix and passes it to
+    `getStability`.
+    NOTE(review): presumably fseval calls `score_ranking` once per fitted ranking and
+    `score_bootstrap` once all bootstraps are done - confirm against `AbstractMetric`.
+    """
+
+    def score_bootstrap(
+        self,
+        ranker: AbstractEstimator,
+        validator: AbstractEstimator,
+        callbacks: Callback,
+        scores: Dict,
+        **kwargs,
+    ) -> Dict:
+        """Compute the stability of the collected supports and publish it.
+
+        The result is sent to `callbacks.on_table` as a one-row table named
+        "stability" and also stored under `scores["stability"]`.
+        Requires `score_ranking` to have run before, since that is where
+        `self.support_matrix` gets created.
+
+        Returns the same `scores` dict that was passed in.
+        """
+        # one row per stored support vector, cast to integers (0 / 1 for boolean supports)
+        selection_matrix = np.array(self.support_matrix).astype(int)
+        stability = getStability(selection_matrix)
+
+        # publish as a table through the callbacks
+        callbacks.on_table(pd.DataFrame([{"stability": stability}]), "stability")
+
+        # and expose in the scores dict
+        scores["stability"] = stability
+        return scores
+
+    def score_ranking(
+        self,
+        scores: Union[Dict, pd.DataFrame],
+        ranker: AbstractEstimator,
+        bootstrap_state: int,
+        callbacks: Callback,
+        feature_importances: Optional[np.ndarray] = None,
+    ):
+        """Remember the ranker's `feature_support_` for the later stability computation.
+
+        Returns nothing; only `self.support_matrix` is updated.
+        """
+        # lazily create the list on first use, then keep appending to it
+        self.support_matrix = getattr(self, "support_matrix", [])
+        self.support_matrix.append(ranker.feature_support_)
+
+
+class ReliefF_FeatureSelection(ReliefF):
+    """ReliefF variant that also exposes its selected feature subset as a boolean `support_` mask."""
+
+    def fit(self, X, y):
+        """Fit ReliefF on (X, y) and derive the `support_` vector.
+
+        `support_` is a boolean vector with one entry per column of X; the entries
+        indexed by the first `n_features_to_select` items of `top_features_` are True.
+
+        Returns `self`, following the scikit-learn estimator convention, so that
+        calls can be chained.
+        """
+        super().fit(X, y)
+
+        # extract feature subset from ReliefF
+        feature_subset = self.top_features_[: self.n_features_to_select]
+
+        # set `support_` vector
+        _, p = np.shape(X)
+        self.support_ = np.zeros(p, dtype=bool)
+        self.support_[feature_subset] = True
+
+        # scikit-learn estimators return themselves from `fit`
+        return self
+
+
+@hydra.main(config_path="conf", config_name="my_config")
+def main(cfg: PipelineConfig) -> None:
+    """Hydra entry point: pass the `my_config` configuration (config path `conf`) to `run_pipeline`."""
+    run_pipeline(cfg)
+
+
+# Only start the benchmark when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/examples/algorithm-stability-yaml/conf/dataset/synclf_hard.yaml b/examples/algorithm-stability-yaml/conf/dataset/synclf_hard.yaml
@@ -0,0 +1,18 @@
+name: Synclf hard
+task: classification
+domain: synthetic
+group: Synclf
+adapter:
+  _target_: sklearn.datasets.make_classification
+  class_sep: 0.8
+  n_classes: 3
+  n_clusters_per_class: 3
+  n_features: 50
+  n_informative: 4
+  n_redundant: 0
+  n_repeated: 0
+  n_samples: 1000
+  random_state: 0
+  shuffle: false
+feature_importances:
+  X[:, 0:4]: 1.0
diff --git a/examples/algorithm-stability-yaml/conf/metrics/stability_nogueira.yaml b/examples/algorithm-stability-yaml/conf/metrics/stability_nogueira.yaml
@@ -0,0 +1,3 @@
+# @package metrics
+ranking_scores:
+  _target_: benchmark.StabilityNogueira
diff --git a/examples/algorithm-stability-yaml/conf/my_config.yaml b/examples/algorithm-stability-yaml/conf/my_config.yaml
@@ -0,0 +1,11 @@
+defaults:
+  - base_pipeline_config
+  - _self_
+  - override dataset: synclf_hard
+  - override validator: knn
+  - override /callbacks:
+      - to_sql
+  - override /metrics:
+      - stability_nogueira
+
+n_bootstraps: 10
diff --git a/examples/algorithm-stability-yaml/conf/ranker/boruta.yaml b/examples/algorithm-stability-yaml/conf/ranker/boruta.yaml
@@ -0,0 +1,11 @@
+name: Boruta
+estimator:
+  _target_: boruta.boruta_py.BorutaPy
+  estimator:
+    _target_: sklearn.ensemble.RandomForestClassifier
+  n_estimators: auto
+_estimator_type: classifier
+multioutput: false
+estimates_feature_importances: false
+estimates_feature_support: true
+estimates_feature_ranking: true
diff --git a/examples/algorithm-stability-yaml/conf/ranker/relieff.yaml b/examples/algorithm-stability-yaml/conf/ranker/relieff.yaml
@@ -0,0 +1,7 @@
+name: ReliefF
+estimator:
+  _target_: benchmark.ReliefF_FeatureSelection
+  n_features_to_select: 10 # select best 10 features in feature subset.
+_estimator_type: classifier
+estimates_feature_importances: true
+estimates_feature_support: true
diff --git a/examples/algorithm-stability-yaml/conf/validator/knn.yaml b/examples/algorithm-stability-yaml/conf/validator/knn.yaml
@@ -0,0 +1,6 @@
+name: k-NN
+estimator:
+  _target_: sklearn.neighbors.KNeighborsClassifier
+_estimator_type: classifier
+multioutput: false
+estimates_target: true
diff --git a/fseval/callbacks/to_sql.py b/fseval/callbacks/to_sql.py
@@ -3,11 +3,11 @@
 from typing import Dict
 
 import pandas as pd
-from fseval.config.callbacks.to_sql import ToSQLCallback
-from fseval.types import TerminalColor
 from omegaconf import MISSING, DictConfig
 from sqlalchemy import create_engine
-from sqlalchemy.pool import NullPool
+
+from fseval.config.callbacks.to_sql import ToSQLCallback
+from fseval.types import TerminalColor
 
 from ._base_export_callback import BaseExportCallback
 

diff --git a/fseval/pipelines/_experiment.py b/fseval/pipelines/_experiment.py
@@ -7,10 +7,10 @@
 
 import numpy as np
 import pandas as pd
+from humanfriendly import format_timespan
+
 from fseval.pipeline.estimator import Estimator
 from fseval.types import AbstractEstimator, Callback, TerminalColor
-from humanfriendly import format_timespan
-from sqlalchemy.engine import Engine
 
 
 @dataclass

diff --git a/fseval/pipelines/rank_and_validate/_support_validator.py b/fseval/pipelines/rank_and_validate/_support_validator.py
@@ -63,10 +63,11 @@ def score(self, X, y, **kwargs) -> Union[Dict, pd.DataFrame, np.generic, None]:
         scores = pd.DataFrame([scores_dict])
 
         # add custom metrics
+        X_, y_ = self._prepare_data(X, y)
+
         for metric_name, metric_class in self.metrics.items():
-            X, y = self._prepare_data(X, y)
             scores_metric = metric_class.score_support(  # type: ignore
-                scores, self.validator, X, y, self.callbacks
+                scores, self.validator, X_, y_, self.callbacks
             )  # type: ignore
 
             if scores_metric is not None:

diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
@@ -2,9 +2,10 @@
 import tempfile
 
 import pytest
+from hydra.conf import ConfigStore
+
 from fseval.config import EstimatorConfig, PipelineConfig
 from fseval.main import run_pipeline
-from fseval.types import IncompatibilityError
 from fseval.utils.hydra_utils import get_config
 from hydra.conf import ConfigStore
 from hydra.errors import InstantiationException

diff --git a/website/docs/_recipes/algorithm-stability.md b/website/docs/_recipes/algorithm-stability.md
diff --git a/website/docs/_recipes/running-on-aws.md b/website/docs/_recipes/running-on-aws.md
diff --git a/website/docs/_recipes/running-on-slurm.md b/website/docs/_recipes/running-on-slurm.md
diff --git a/website/docs/quick-start.mdx b/website/docs/quick-start.mdx
@@ -144,6 +144,14 @@ We can now decide how to export the results. We can upload our results to a live
 sql_con=sqlite:////Users/dunnkers/Downloads/results.sqlite # any well-defined database URL
 ```
 
+:::note Relative vs absolute paths
+
+If you define a _relative_ database URL, like `sql_con=sqlite:///./results.sqlite`, the results will be saved right where Hydra stores its individual run files. In other words, multiple `.sqlite` files are stored in the `./multirun` subfolders.
+
+To prevent this and store all results in a single `.sqlite` file, use an **absolute** path, as shown above. Preferably, though, use a properly running database - see the recipes for more instructions on this.
+
+:::
+
 We are now ready to run an experiment. In a terminal, `cd` into the unzipped example directory and run the following:
 ```shell
 python benchmark.py --multirun ranker='glob(*)' +callbacks.to_sql.url=$sql_con

diff --git a/website/docs/_recipes/_category_.json → website/docs/recipes/_category_.json b/website/docs/_recipes/_category_.json → website/docs/recipes/_category_.json