-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add some docs explaining how to "analyze FS algorithm stability" (#83)
* [WIP] Analyzing algorithm stability recipe * Finish stability example * Reformat & imports * Add note for relative vs absolute paths * Algorithm stability recipe 📈 First version * Finish algorithm stability example 🙌🏻
- Loading branch information
Showing
20 changed files
with
1,981 additions
and
13 deletions.
There are no files selected for viewing
1,476 changes: 1,476 additions & 0 deletions
1,476
examples/algorithm-stability-yaml/analyze-results.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
from typing import Dict, Optional, Union | ||
|
||
import hydra | ||
import numpy as np | ||
import pandas as pd | ||
from skrebate import ReliefF | ||
|
||
from fseval.config import PipelineConfig | ||
from fseval.main import run_pipeline | ||
from fseval.types import AbstractEstimator, AbstractMetric, Callback | ||
|
||
""" | ||
The checkInputType and getStability functions come from the following paper: | ||
[1] On the Stability of Feature Selection. Sarah Nogueira, Konstantinos Sechidis, Gavin Brown. | ||
Journal of Machine Learning Research (JMLR). 2017. | ||
You can find a full demo using this package at: | ||
http://htmlpreview.github.io/?https://github.com/nogueirs/JMLR2017/blob/master/python/stabilityDemo.html | ||
NB: This package requires the installation of the packages: numpy, scipy and math | ||
""" | ||
|
||
|
||
def checkInputType(Z):
    """Validate a feature-selection matrix and return it as a numpy.ndarray.

    INPUT: Z, given either as a list (converted with ``np.asarray``) or as a
           numpy.ndarray (returned as-is).
    OUTPUT: The input Z as a numpy.ndarray.
    RAISES: ValueError if Z is neither a list nor a numpy.ndarray, or if the
            resulting array does not have exactly 2 dimensions.
    """
    # Reject anything that is not one of the two supported container types.
    if not isinstance(Z, (list, np.ndarray)):
        raise ValueError("The input matrix Z should be of type list or numpy.ndarray")

    # Only lists need converting; arrays are passed through untouched.
    matrix = np.asarray(Z) if isinstance(Z, list) else Z

    # A matrix must have exactly two axes.
    if matrix.ndim != 2:
        raise ValueError("The input matrix Z should be of dimension 2")
    return matrix
|
||
|
||
def getStability(Z):
    """
    Let us assume we have M>1 feature sets and d>0 features in total.
    This function computes the stability estimate as given in Definition 4 in [1].

    INPUT: A BINARY matrix Z (given as a list or as a numpy.ndarray of size M*d).
           Each row of the binary matrix represents a feature set, where a 1 at the f^th position
           means the f^th feature has been selected and a 0 means it has not been selected.
    OUTPUT: The stability of the feature selection procedure. NaN is returned when the
            estimate is undefined, i.e. when there are no features (d == 0) or when the
            average selected fraction is exactly 0 or 1 (every feature set is empty, or
            every feature set contains all features).
    RAISES: ValueError if Z has the wrong type or dimension, or if it holds fewer than
            2 feature sets (the M / (M - 1) correction needs M > 1).
    """
    Z = checkInputType(Z)
    M, d = Z.shape

    # The unbiased-variance correction M / (M - 1) is undefined for a single
    # feature set; fail with a clear message instead of a ZeroDivisionError.
    if M < 2:
        raise ValueError(
            "The input matrix Z should contain at least 2 feature sets (rows)"
        )

    # Without any features there is nothing to estimate.
    if d == 0:
        return float("nan")

    hatPF = np.mean(Z, axis=0)  # per-feature selection frequency
    kbar = np.sum(hatPF)  # average number of selected features
    denom = (kbar / d) * (1 - kbar / d)

    # denom is zero when nothing or everything is selected in every feature
    # set; return NaN explicitly rather than through a 0/0 RuntimeWarning.
    if denom == 0:
        return float("nan")

    return 1 - (M / (M - 1)) * np.mean(np.multiply(hatPF, 1 - hatPF)) / denom
|
||
|
||
class StabilityNogueira(AbstractMetric):
    """Metric reporting the stability of the feature subsets chosen by a ranker.

    ``score_ranking`` collects one feature-support vector per call, and
    ``score_bootstrap`` turns everything collected so far into a single
    stability estimate via ``getStability``.
    """

    def score_bootstrap(
        self,
        ranker: AbstractEstimator,
        validator: AbstractEstimator,
        callbacks: Callback,
        scores: Dict,
        **kwargs,
    ) -> Dict:
        """Compute the stability over the collected support vectors.

        The estimate is sent to ``callbacks.on_table`` as a one-row table named
        ``"stability"`` and is also stored under ``scores["stability"]``.

        Returns:
            The same ``scores`` dict that was passed in, with the new entry.
        """
        # Stack the collected support vectors into an integer matrix.
        selection_matrix = np.array(self.support_matrix).astype(int)
        stability = getStability(selection_matrix)

        # Publish the result as a table through the callbacks.
        callbacks.on_table(pd.DataFrame([{"stability": stability}]), "stability")

        # Expose the result in the scores dict as well.
        scores["stability"] = stability
        return scores

    def score_ranking(
        self,
        scores: Union[Dict, pd.DataFrame],
        ranker: AbstractEstimator,
        bootstrap_state: int,
        callbacks: Callback,
        feature_importances: Optional[np.ndarray] = None,
    ):
        """Record the feature-support vector of the given fitted ranker.

        The vector is appended to ``self.support_matrix``, which is created on
        first use. Nothing is returned.
        NOTE(review): presumably invoked once per bootstrap - confirm against
        the pipeline that calls this metric.
        """
        # Lazily create the accumulator the first time this is called.
        self.support_matrix = getattr(self, "support_matrix", [])
        self.support_matrix.append(ranker.feature_support_)
|
||
|
||
class ReliefF_FeatureSelection(ReliefF):
    """ReliefF variant that also exposes its selection as a boolean mask.

    After fitting, ``support_`` is a boolean vector with one entry per column
    of ``X``; the entries for the first ``n_features_to_select`` indices of
    ``top_features_`` are ``True``.
    """

    def fit(self, X, y):
        """Fit ReliefF and derive the ``support_`` mask from its top features.

        Args:
            X: 2-dimensional training data; its second dimension gives the
                number of features.
            y: Training targets, forwarded unchanged to ``ReliefF.fit``.

        Returns:
            ``self``, following the scikit-learn estimator convention so that
            calls can be chained (e.g. ``est.fit(X, y).support_``).
        """
        super().fit(X, y)

        # extract feature subset from ReliefF
        feature_subset = self.top_features_[: self.n_features_to_select]

        # set `support_` vector
        _, p = np.shape(X)
        self.support_ = np.zeros(p, dtype=bool)
        self.support_[feature_subset] = True

        return self
|
||
|
||
@hydra.main(config_path="conf", config_name="my_config")
def main(cfg: PipelineConfig) -> None:
    """Script entry point: run the fseval pipeline for the given config.

    The ``hydra.main`` decorator is configured with ``config_path="conf"`` and
    ``config_name="my_config"``; the resulting ``cfg`` is handed to
    ``run_pipeline`` unchanged.
    """
    run_pipeline(cfg)


# Only start the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()
18 changes: 18 additions & 0 deletions
18
examples/algorithm-stability-yaml/conf/dataset/synclf_hard.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
name: Synclf hard | ||
task: classification | ||
domain: synthetic | ||
group: Synclf | ||
adapter: | ||
_target_: sklearn.datasets.make_classification | ||
class_sep: 0.8 | ||
n_classes: 3 | ||
n_clusters_per_class: 3 | ||
n_features: 50 | ||
n_informative: 4 | ||
n_redundant: 0 | ||
n_repeated: 0 | ||
n_samples: 1000 | ||
random_state: 0 | ||
shuffle: false | ||
feature_importances: | ||
X[:, 0:4]: 1.0 |
3 changes: 3 additions & 0 deletions
3
examples/algorithm-stability-yaml/conf/metrics/stability_nogueira.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# @package metrics | ||
ranking_scores: | ||
_target_: benchmark.StabilityNogueira |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
defaults: | ||
- base_pipeline_config | ||
- _self_ | ||
- override dataset: synclf_hard | ||
- override validator: knn | ||
- override /callbacks: | ||
- to_sql | ||
- override /metrics: | ||
- stability_nogueira | ||
|
||
n_bootstraps: 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
name: Boruta | ||
estimator: | ||
_target_: boruta.boruta_py.BorutaPy | ||
estimator: | ||
_target_: sklearn.ensemble.RandomForestClassifier | ||
n_estimators: auto | ||
_estimator_type: classifier | ||
multioutput: false | ||
estimates_feature_importances: false | ||
estimates_feature_support: true | ||
estimates_feature_ranking: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
name: ReliefF | ||
estimator: | ||
_target_: benchmark.ReliefF_FeatureSelection | ||
n_features_to_select: 10 # select best 10 features in feature subset. | ||
_estimator_type: classifier | ||
estimates_feature_importances: true | ||
estimates_feature_support: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
name: k-NN | ||
estimator: | ||
_target_: sklearn.neighbors.KNeighborsClassifier | ||
_estimator_type: classifier | ||
multioutput: false | ||
estimates_target: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Oops, something went wrong.