-
Notifications
You must be signed in to change notification settings - Fork 73
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: Implemented performance speedup for binary ReliefF + bug fixes
- Loading branch information
1 parent
1679885
commit e6a7ec6
Showing
6 changed files
with
622 additions
and
166 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar | ||
from sklearn.pipeline import make_pipeline | ||
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.model_selection import cross_val_score | ||
import pandas as pd | ||
import numpy as np | ||
import warnings | ||
|
||
import timeit | ||
|
||
# Silence all warnings so they do not clutter the benchmark output.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG before any sub-sampling is done below.
np.random.seed(3249083)

# Binary endpoint, discrete features. Used at full size (sub-sampling is
# intentionally left disabled for this dataset).
genetic_data = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
    sep='\t',
    compression='gzip',
)
# genetic_data = genetic_data.sample(frac=0.25)

# The remaining datasets name their target column 'Class'; it is renamed to
# 'class' and a random 25% of the rows is kept.
# NOTE: the order of the .sample() calls is kept as-is because each call
# draws from the RNG seeded above.
genetic_data_cont_endpoint = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_continuous_endpoint_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    sep='\t',
    compression='gzip',
)
genetic_data_cont_endpoint = genetic_data_cont_endpoint.rename(
    columns={'Class': 'class'}).sample(frac=0.25)

genetic_data_mixed_attributes = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_mixed_attribute_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    sep='\t',
    compression='gzip',
)
genetic_data_mixed_attributes = genetic_data_mixed_attributes.rename(
    columns={'Class': 'class'}).sample(frac=0.25)

genetic_data_missing_values = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_missing_values_0.1_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    sep='\t',
    compression='gzip',
)
genetic_data_missing_values = genetic_data_missing_values.rename(
    columns={'Class': 'class'}).sample(frac=0.25)

# Plain (uncompressed) tab-separated text file.
genetic_data_multiclass = pd.read_csv('data/3Class_Datasets_Loc_2_01.txt', sep='\t')
genetic_data_multiclass = genetic_data_multiclass.rename(
    columns={'Class': 'class'}).sample(frac=0.25)
|
||
|
||
# For every dataset: the feature matrix (all columns except 'class'), the
# label vector (the 'class' column) and the list of feature column names.

features = genetic_data.drop(columns='class').values
labels = genetic_data['class'].values
headers = genetic_data.columns.drop('class').tolist()

features_cont_endpoint = genetic_data_cont_endpoint.drop(columns='class').values
labels_cont_endpoint = genetic_data_cont_endpoint['class'].values
headers_cont_endpoint = genetic_data_cont_endpoint.columns.drop('class').tolist()

features_mixed_attributes = genetic_data_mixed_attributes.drop(columns='class').values
labels_mixed_attributes = genetic_data_mixed_attributes['class'].values
headers_mixed_attributes = genetic_data_mixed_attributes.columns.drop('class').tolist()

features_missing_values = genetic_data_missing_values.drop(columns='class').values
labels_missing_values = genetic_data_missing_values['class'].values
headers_missing_values = genetic_data_missing_values.columns.drop('class').tolist()

features_multiclass = genetic_data_multiclass.drop(columns='class').values
labels_multiclass = genetic_data_multiclass['class'].values
headers_multiclass = genetic_data_multiclass.columns.drop('class').tolist()
|
||
|
||
|
||
# Basic Parallelization Tests and Core binary data and discrete feature data testing (Focus on ReliefF only for efficiency)------------------------------------------------------------ | ||
def test_relieff():
    """Benchmark: fit ReliefF (10 neighbors) on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(49082)

    ReliefF(n_neighbors=10, n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_relieff_parallel():
    """Benchmark: fit ReliefF (10 neighbors, n_jobs=-1) on the binary-endpoint, discrete-feature data."""
    # Note: if the rebate algorithm is parallelized, cross-validation scoring
    # cannot be parallelized at the same time.
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(49082)

    ReliefF(n_jobs=-1, n_neighbors=10, n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_relieffpercent():
    """Benchmark: fit ReliefF with a fractional n_neighbors (0.1) on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(49082)

    ReliefF(n_neighbors=0.1, n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_surf():
    """Benchmark: fit SURF on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(240932)

    SURF(n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_surf_parallel():
    """Benchmark: fit SURF (n_jobs=-1) on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(240932)

    SURF(n_jobs=-1, n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_surfstar():
    """Benchmark: fit SURF* on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(9238745)

    SURFstar(n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_surfstar_parallel():
    """Benchmark: fit SURF* (n_jobs=-1) on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(9238745)

    SURFstar(n_jobs=-1, n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_multisurfstar():
    """Benchmark: fit MultiSURF* on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(320931)

    MultiSURFstar(n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_multisurfstar_parallel():
    """Benchmark: fit MultiSURF* (n_jobs=-1) on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(320931)

    MultiSURFstar(n_jobs=-1, n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_multisurf():
    """Benchmark: fit MultiSURF on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(320931)

    MultiSURF(n_features_to_select=2).fit(features, labels)
|
||
|
||
def test_multisurf_parallel():
    """Benchmark: fit MultiSURF (n_jobs=-1) on the binary-endpoint, discrete-feature data."""
    # Reseed on every call so each timed repetition starts from the same RNG state.
    np.random.seed(320931)

    MultiSURF(n_jobs=-1, n_features_to_select=2).fit(features, labels)
|
||
|
||
# Benchmark cases to time. Cases are switched on and off by (un)commenting
# them; only the ReliefF variants are currently enabled.
test_cases = [
    test_relieff,
    test_relieff_parallel,
    test_relieffpercent,
    # test_surf,
    # test_surf_parallel,
    # test_surfstar,
    # test_surfstar_parallel,
    # test_multisurfstar,
    # test_multisurfstar_parallel,
    # test_multisurf,
    # test_multisurf_parallel
]

if __name__ == '__main__':
    # Collect one plain dict per case and build the DataFrame once at the end.
    # DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
    # and appending row by row copied the whole frame on every iteration.
    timing_rows = []

    for test_case in test_cases:
        # Five independent runs, each calling the case exactly once.
        timing = timeit.repeat(test_case, number=1, repeat=5)
        # ignore the first run to avoid the high initial overhead of compiling
        # numba functions with small datasets
        timing = timing[1:]

        mean_time = np.mean(timing)
        std_time = np.std(timing)
        print(test_case.__name__, mean_time, std_time)
        timing_rows.append(
            {'test_case': test_case.__name__, 'mean': mean_time, 'std': std_time})

    # Passing `columns` explicitly keeps the expected header even when no
    # test case is enabled.
    timing_df = pd.DataFrame(timing_rows, columns=['test_case', 'mean', 'std'])

    print(timing_df)

    timing_df.to_csv('timing_benchmarks.csv')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Profile the benchmark script with cProfile and save the raw stats to
# perf_data.pstats. The call graph is rendered to perfgraph.png only if
# profiling succeeded, so a stale or missing stats file is never turned into
# a misleading image. Requires gprof2dot and Graphviz's `dot` on PATH.
python -m cProfile -o perf_data.pstats performance_tests.py \
  && gprof2dot -f pstats perf_data.pstats | dot -Tpng -o perfgraph.png
Oops, something went wrong.