Add some docs explaining how to "analyze FS algorithm stability" (#83)
* [WIP] Analyzing algorithm stability recipe

* Finish stability example

* Reformat & imports

* Add note for relative vs absolute paths

* Algorithm stability recipe 📈 First version

* Finish algorithm stability example 🙌🏻
dunnkers authored Oct 4, 2022
1 parent 54c7232 commit 4c93031
Showing 20 changed files with 1,981 additions and 13 deletions.
1,476 changes: 1,476 additions & 0 deletions examples/algorithm-stability-yaml/analyze-results.ipynb

Large diffs are not rendered by default.

111 changes: 111 additions & 0 deletions examples/algorithm-stability-yaml/benchmark.py
@@ -0,0 +1,111 @@
from typing import Dict, Optional, Union

import hydra
import numpy as np
import pandas as pd
from skrebate import ReliefF

from fseval.config import PipelineConfig
from fseval.main import run_pipeline
from fseval.types import AbstractEstimator, AbstractMetric, Callback

"""
The checkInputType and getStability functions come from the following paper:
[1] On the Stability of Feature Selection. Sarah Nogueira, Konstantinos Sechidis, Gavin Brown.
Journal of Machine Learning Reasearch (JMLR). 2017.
You can find a full demo using this package at:
http://htmlpreview.github.io/?https://github.com/nogueirs/JMLR2017/blob/master/python/stabilityDemo.html
NB: This package requires the installation of the packages: numpy, scipy and math
"""


def checkInputType(Z):
    """This function checks that Z is of the right type and dimension.
    It raises an exception if not.
    OUTPUT: The input Z as a numpy.ndarray
    """
    ### We check that Z is a list or a numpy.array
    if isinstance(Z, list):
        Z = np.asarray(Z)
    elif not isinstance(Z, np.ndarray):
        raise ValueError("The input matrix Z should be of type list or numpy.ndarray")
    ### We check if Z is a matrix (2 dimensions)
    if Z.ndim != 2:
        raise ValueError("The input matrix Z should be of dimension 2")
    return Z


def getStability(Z):
    """
    Let us assume we have M>1 feature sets and d>0 features in total.
    This function computes the stability estimate as given in Definition 4 in [1].
    INPUT: A BINARY matrix Z (given as a list or as a numpy.ndarray of size M*d).
    Each row of the binary matrix represents a feature set, where a 1 at the f^th position
    means the f^th feature has been selected and a 0 means it has not been selected.
    OUTPUT: The stability of the feature selection procedure
    """
    Z = checkInputType(Z)
    M, d = Z.shape
    hatPF = np.mean(Z, axis=0)
    kbar = np.sum(hatPF)
    denom = (kbar / d) * (1 - kbar / d)
    return 1 - (M / (M - 1)) * np.mean(np.multiply(hatPF, 1 - hatPF)) / denom


class StabilityNogueira(AbstractMetric):
    def score_bootstrap(
        self,
        ranker: AbstractEstimator,
        validator: AbstractEstimator,
        callbacks: Callback,
        scores: Dict,
        **kwargs,
    ) -> Dict:
        # compute stability and send to table
        Z = np.array(self.support_matrix)
        Z = Z.astype(int)
        stability = getStability(Z)
        stability_df = pd.DataFrame([{"stability": stability}])
        callbacks.on_table(stability_df, "stability")

        # set in scores dict
        scores["stability"] = stability

        return scores

    def score_ranking(
        self,
        scores: Union[Dict, pd.DataFrame],
        ranker: AbstractEstimator,
        bootstrap_state: int,
        callbacks: Callback,
        feature_importances: Optional[np.ndarray] = None,
    ):
        # collect this bootstrap's feature support vector; the stability over all
        # bootstraps is computed later, in `score_bootstrap`.
        support_matrix = getattr(self, "support_matrix", [])
        self.support_matrix = support_matrix
        self.support_matrix.append(ranker.feature_support_)


class ReliefF_FeatureSelection(ReliefF):
    def fit(self, X, y):
        super(ReliefF_FeatureSelection, self).fit(X, y)

        # extract feature subset from ReliefF
        feature_subset = self.top_features_[: self.n_features_to_select]

        # set `support_` vector
        _, p = np.shape(X)
        self.support_ = np.zeros(p, dtype=bool)
        self.support_[feature_subset] = True


@hydra.main(config_path="conf", config_name="my_config")
def main(cfg: PipelineConfig) -> None:
    run_pipeline(cfg)


if __name__ == "__main__":
    main()
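
To get a feel for what `getStability` measures, here is a minimal usage sketch. It assumes the example's dependencies are installed and that it is run from the example directory so that `benchmark.py` is importable; the toy matrices are purely illustrative.

```python
import numpy as np

from benchmark import getStability

# Three "bootstraps" over five features; each row is one selected feature subset.
identical = np.array([[1, 1, 0, 0, 0]] * 3)  # the same subset is chosen every time
varying = np.array(
    [
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 0, 1, 0, 0],  # the subset drifts in the last run
    ]
)

print(getStability(identical))  # 1.0: perfectly stable selection
print(getStability(varying))    # ~0.44: noticeably less stable
```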
18 changes: 18 additions & 0 deletions examples/algorithm-stability-yaml/conf/dataset/synclf_hard.yaml
@@ -0,0 +1,18 @@
name: Synclf hard
task: classification
domain: synthetic
group: Synclf
adapter:
  _target_: sklearn.datasets.make_classification
  class_sep: 0.8
  n_classes: 3
  n_clusters_per_class: 3
  n_features: 50
  n_informative: 4
  n_redundant: 0
  n_repeated: 0
  n_samples: 1000
  random_state: 0
  shuffle: false
feature_importances:
  X[:, 0:4]: 1.0
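
For reference, the `adapter` section above resolves to a plain `make_classification` call. A small sketch of the equivalent call and the resulting dataset shape (the note about `feature_importances` is an interpretation of the config, not code from the example):

```python
from sklearn.datasets import make_classification

# The same call the `adapter` section resolves to.
X, y = make_classification(
    class_sep=0.8,
    n_classes=3,
    n_clusters_per_class=3,
    n_features=50,
    n_informative=4,
    n_redundant=0,
    n_repeated=0,
    n_samples=1000,
    random_state=0,
    shuffle=False,
)

print(X.shape)  # (1000, 50)
# With shuffle=False the informative features are the first columns, which is
# why `feature_importances` marks X[:, 0:4] as the relevant ones.
```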
3 changes: 3 additions & 0 deletions examples/algorithm-stability-yaml/conf/metrics/stability_nogueira.yaml
@@ -0,0 +1,3 @@
# @package metrics
ranking_scores:
  _target_: benchmark.StabilityNogueira
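
The `# @package metrics` directive places this config under the `metrics` key, and Hydra materializes the `_target_` entry with `hydra.utils.instantiate`. Roughly, the config above amounts to the following (a simplified sketch, assuming `benchmark.py` is importable; fseval performs this instantiation internally):

```python
from hydra.utils import instantiate

# Roughly what happens when fseval materializes this metric config.
metric = instantiate({"_target_": "benchmark.StabilityNogueira"})
print(type(metric).__name__)  # StabilityNogueira
```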
11 changes: 11 additions & 0 deletions examples/algorithm-stability-yaml/conf/my_config.yaml
@@ -0,0 +1,11 @@
defaults:
  - base_pipeline_config
  - _self_
  - override dataset: synclf_hard
  - override validator: knn
  - override /callbacks:
      - to_sql
  - override /metrics:
      - stability_nogueira

n_bootstraps: 10
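
Putting the pieces together: `n_bootstraps: 10` makes the pipeline refit the ranker on ten resamples of the dataset and feed the collected support vectors into the stability metric. The following is a rough, hand-rolled approximation of that loop (the resampling details are an assumption, and caching, callbacks, and validation are deliberately left out):

```python
import numpy as np
from sklearn.datasets import make_classification

from benchmark import ReliefF_FeatureSelection, getStability

# Generate the same kind of dataset as `synclf_hard`.
X, y = make_classification(
    n_samples=1000, n_features=50, n_informative=4, n_redundant=0, n_repeated=0,
    n_classes=3, n_clusters_per_class=3, class_sep=0.8, shuffle=False, random_state=0,
)

# Resample, refit the ranker, and collect every run's support vector.
rng = np.random.default_rng(0)
support_matrix = []
for _ in range(10):
    idx = rng.choice(len(X), size=len(X), replace=True)  # bootstrap resample
    ranker = ReliefF_FeatureSelection(n_features_to_select=10)
    ranker.fit(X[idx], y[idx])
    support_matrix.append(ranker.support_)

# One stability score over the 10 x 50 binary selection matrix.
print(getStability(np.array(support_matrix, dtype=int)))
```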
11 changes: 11 additions & 0 deletions examples/algorithm-stability-yaml/conf/ranker/boruta.yaml
@@ -0,0 +1,11 @@
name: Boruta
estimator:
  _target_: boruta.boruta_py.BorutaPy
  estimator:
    _target_: sklearn.ensemble.RandomForestClassifier
  n_estimators: auto
_estimator_type: classifier
multioutput: false
estimates_feature_importances: false
estimates_feature_support: true
estimates_feature_ranking: true
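
The nested `estimator` blocks above translate to a Boruta selector wrapping a random forest. A sketch of the equivalent construction (not the pipeline's own instantiation code):

```python
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# What the nested config boils down to: Boruta wrapping a random forest.
ranker = BorutaPy(RandomForestClassifier(), n_estimators="auto")

# After `ranker.fit(X, y)`, Boruta exposes `support_` (the selected subset) and
# `ranking_`, matching the `estimates_feature_support` / `estimates_feature_ranking`
# flags above.
```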
7 changes: 7 additions & 0 deletions examples/algorithm-stability-yaml/conf/ranker/relieff.yaml
@@ -0,0 +1,7 @@
name: ReliefF
estimator:
  _target_: benchmark.ReliefF_FeatureSelection
  n_features_to_select: 10 # select best 10 features in feature subset.
_estimator_type: classifier
estimates_feature_importances: true
estimates_feature_support: true
6 changes: 6 additions & 0 deletions examples/algorithm-stability-yaml/conf/validator/knn.yaml
@@ -0,0 +1,6 @@
name: k-NN
estimator:
  _target_: sklearn.neighbors.KNeighborsClassifier
_estimator_type: classifier
multioutput: false
estimates_target: true
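
Conceptually, the validator measures how well a candidate feature subset predicts the target. A simplified, self-contained sketch of that idea (the dataset call and the chosen subset are illustrative assumptions; the real pipeline handles subset masking and validation itself):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Score a hypothetical selected feature subset with the k-NN validator.
X, y = make_classification(n_samples=1000, n_features=50, n_informative=4,
                           n_redundant=0, shuffle=False, random_state=0)
subset = [0, 1, 2, 3]  # a hypothetical selected feature subset
print(cross_val_score(KNeighborsClassifier(), X[:, subset], y).mean())
```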
6 changes: 3 additions & 3 deletions fseval/callbacks/to_sql.py
@@ -3,11 +3,11 @@
from typing import Dict

import pandas as pd
from fseval.config.callbacks.to_sql import ToSQLCallback
from fseval.types import TerminalColor
from omegaconf import MISSING, DictConfig
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool

from fseval.config.callbacks.to_sql import ToSQLCallback
from fseval.types import TerminalColor

from ._base_export_callback import BaseExportCallback

4 changes: 2 additions & 2 deletions fseval/pipelines/_experiment.py
@@ -7,10 +7,10 @@

import numpy as np
import pandas as pd
from humanfriendly import format_timespan

from fseval.pipeline.estimator import Estimator
from fseval.types import AbstractEstimator, Callback, TerminalColor
from humanfriendly import format_timespan
from sqlalchemy.engine import Engine


@dataclass
5 changes: 3 additions & 2 deletions fseval/pipelines/rank_and_validate/_support_validator.py
@@ -63,10 +63,11 @@ def score(self, X, y, **kwargs) -> Union[Dict, pd.DataFrame, np.generic, None]:
scores = pd.DataFrame([scores_dict])

# add custom metrics
X_, y_ = self._prepare_data(X, y)

for metric_name, metric_class in self.metrics.items():
X, y = self._prepare_data(X, y)
scores_metric = metric_class.score_support( # type: ignore
scores, self.validator, X, y, self.callbacks
scores, self.validator, X_, y_, self.callbacks
) # type: ignore

if scores_metric is not None:
3 changes: 2 additions & 1 deletion tests/integration/test_main.py
@@ -2,9 +2,10 @@
import tempfile

import pytest
from hydra.conf import ConfigStore

from fseval.config import EstimatorConfig, PipelineConfig
from fseval.main import run_pipeline
from fseval.types import IncompatibilityError
from fseval.utils.hydra_utils import get_config
from hydra.conf import ConfigStore
from hydra.errors import InstantiationException
1 change: 0 additions & 1 deletion website/docs/_recipes/algorithm-stability.md

This file was deleted.

3 changes: 0 additions & 3 deletions website/docs/_recipes/running-on-aws.md

This file was deleted.

1 change: 0 additions & 1 deletion website/docs/_recipes/running-on-slurm.md

This file was deleted.

8 changes: 8 additions & 0 deletions website/docs/quick-start.mdx
@@ -144,6 +144,14 @@ We can now decide how to export the results. We can upload our results to a live
sql_con=sqlite:////Users/dunnkers/Downloads/results.sqlite # any well-defined database URL
```

:::note Relative vs absolute paths

If you define a _relative_ database URL, like `sql_con=sqlite:///./results.sqlite`, the results are saved wherever Hydra stores the files of each individual run. In other words, multiple `.sqlite` files end up scattered across the `./multirun` subfolders.

To prevent this and store all results in a single `.sqlite` file, use an **absolute** path, like above. Preferably, though, use a proper database server; see the recipes for more instructions.

:::

We are now ready to run an experiment. In a terminal, `cd` into the unzipped example directory and run the following:
```shell
python benchmark.py --multirun ranker='glob(*)' +callbacks.to_sql.url=$sql_con
```
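
Once a run has finished, the exported tables can be read back into pandas for analysis. A minimal sketch, assuming the `stability` table name written by `callbacks.on_table` in `benchmark.py` and a database path matching your `sql_con` value:

```python
import pandas as pd

# Read the exported stability scores back from the SQLite database.
con = "sqlite:////Users/dunnkers/Downloads/results.sqlite"  # your `sql_con` value
stability = pd.read_sql_table("stability", con)

print(stability["stability"].describe())  # one stability score per ranker run
```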
File renamed without changes.
