ENH: Implemented performance speedup for binary ReliefF + bug fixes
CaptainKanuk committed Dec 16, 2021
1 parent 1679885 commit e6a7ec6
Showing 6 changed files with 622 additions and 166 deletions.
179 changes: 179 additions & 0 deletions performance_tests.py
@@ -0,0 +1,179 @@
from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import warnings

import timeit

warnings.filterwarnings('ignore')

np.random.seed(3249083)

genetic_data = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz', sep='\t', compression='gzip')
# genetic_data = genetic_data.sample(frac=0.25)

genetic_data_cont_endpoint = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_continuous_endpoint_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz', sep='\t', compression='gzip')
genetic_data_cont_endpoint.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_cont_endpoint = genetic_data_cont_endpoint.sample(frac=0.25)

genetic_data_mixed_attributes = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_mixed_attribute_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz', sep='\t', compression='gzip')
genetic_data_mixed_attributes.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_mixed_attributes = genetic_data_mixed_attributes.sample(frac=0.25)

genetic_data_missing_values = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_missing_values_0.1_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz', sep='\t', compression='gzip')
genetic_data_missing_values.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_missing_values = genetic_data_missing_values.sample(frac=0.25)

genetic_data_multiclass = pd.read_csv('data/3Class_Datasets_Loc_2_01.txt', sep='\t')
genetic_data_multiclass.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_multiclass = genetic_data_multiclass.sample(frac=0.25)


features, labels = genetic_data.drop('class', axis=1).values, genetic_data['class'].values
headers = list(genetic_data.drop("class", axis=1))

features_cont_endpoint, labels_cont_endpoint = genetic_data_cont_endpoint.drop(
'class', axis=1).values, genetic_data_cont_endpoint['class'].values
headers_cont_endpoint = list(genetic_data_cont_endpoint.drop("class", axis=1))

features_mixed_attributes, labels_mixed_attributes = genetic_data_mixed_attributes.drop(
'class', axis=1).values, genetic_data_mixed_attributes['class'].values
headers_mixed_attributes = list(genetic_data_mixed_attributes.drop("class", axis=1))

features_missing_values, labels_missing_values = genetic_data_missing_values.drop(
'class', axis=1).values, genetic_data_missing_values['class'].values
headers_missing_values = list(genetic_data_missing_values.drop("class", axis=1))

features_multiclass, labels_multiclass = genetic_data_multiclass.drop(
'class', axis=1).values, genetic_data_multiclass['class'].values
headers_multiclass = list(genetic_data_multiclass.drop("class", axis=1))



# Basic parallelization tests and core binary-endpoint, discrete-feature data tests (focus on ReliefF only, for efficiency) ------------------------------------------------------------
def test_relieff():
"""Check: Data (Binary Endpoint, Discrete Features): ReliefF works in a sklearn pipeline"""
np.random.seed(49082)

alg = ReliefF(n_features_to_select=2, n_neighbors=10)
alg.fit(features, labels)


def test_relieff_parallel():
"""Check: Data (Binary Endpoint, Discrete Features): ReliefF works in a sklearn pipeline when ReliefF is parallelized"""
    # Note: a Relief-based algorithm, the random forest, and the cross-validation scoring cannot all be parallelized at once; if the Relief-based algorithm is parallelized, the cross-validation scoring must run serially (see the illustrative example defined below).
np.random.seed(49082)

alg = ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1)
alg.fit(features, labels)
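

# Illustrative sketch, not part of the timed benchmark: the note in
# test_relieff_parallel() means that when ReliefF itself is parallelized
# (n_jobs=-1), the cross-validation scoring is left serial. It reuses the
# imports at the top of this file; the RandomForestClassifier settings and
# cv=3 are arbitrary illustrative choices, not values from this commit.
def example_relieff_pipeline_parallel():
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=1))
    # cross_val_score keeps its default (serial) n_jobs so only ReliefF runs in parallel
    return np.mean(cross_val_score(clf, features, labels, cv=3))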


def test_relieffpercent():
"""Check: Data (Binary Endpoint, Discrete Features): ReliefF with % neighbors works in a sklearn pipeline"""
np.random.seed(49082)

alg = ReliefF(n_features_to_select=2, n_neighbors=0.1)
alg.fit(features, labels)


def test_surf():
"""Check: Data (Binary Endpoint, Discrete Features): SURF works in a sklearn pipeline"""
np.random.seed(240932)

alg = SURF(n_features_to_select=2)
alg.fit(features, labels)


def test_surf_parallel():
"""Check: Data (Binary Endpoint, Discrete Features): SURF works in a sklearn pipeline when SURF is parallelized"""
np.random.seed(240932)

alg = SURF(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


def test_surfstar():
"""Check: Data (Binary Endpoint, Discrete Features): SURF* works in a sklearn pipelined"""
np.random.seed(9238745)

alg = SURFstar(n_features_to_select=2)
alg.fit(features, labels)


def test_surfstar_parallel():
"""Check: Data (Binary Endpoint, Discrete Features): SURF* works in a sklearn pipeline when SURF* is parallelized"""
np.random.seed(9238745)

alg = SURFstar(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


def test_multisurfstar():
"""Check: Data (Binary Endpoint, Discrete Features): MultiSURF* works in a sklearn pipeline"""
np.random.seed(320931)

alg = MultiSURFstar(n_features_to_select=2)
alg.fit(features, labels)


def test_multisurfstar_parallel():
"""Check: Data (Binary Endpoint, Discrete Features): MultiSURF* works in a sklearn pipeline when MultiSURF* is parallelized"""
np.random.seed(320931)

alg = MultiSURFstar(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


def test_multisurf():
"""Check: Data (Binary Endpoint, Discrete Features): MultiSURF works in a sklearn pipeline"""
np.random.seed(320931)

alg = MultiSURF(n_features_to_select=2)
alg.fit(features, labels)


def test_multisurf_parallel():
"""Check: Data (Binary Endpoint, Discrete Features): MultiSURF works in a sklearn pipeline when MultiSURF is parallelized"""
np.random.seed(320931)

alg = MultiSURF(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


test_cases = [
test_relieff,
test_relieff_parallel,
test_relieffpercent,
# test_surf,
# test_surf_parallel,
# test_surfstar,
# test_surfstar_parallel,
# test_multisurfstar,
# test_multisurfstar_parallel,
# test_multisurf,
# test_multisurf_parallel
]

if __name__ == '__main__':
timing_df = pd.DataFrame(columns=['test_case', 'mean', 'std'])

for test_case in test_cases:
timing = timeit.repeat(test_case, number=1, repeat=5)
        # discard the first repetition to exclude the one-time overhead of
        # compiling the numba functions on these small datasets
        timing = timing[1:]
print(test_case.__name__, np.mean(timing), np.std(timing))
        d = {'test_case': test_case.__name__, 'mean': np.mean(timing), 'std': np.std(timing)}
        # DataFrame.append is deprecated in recent pandas; concatenate a one-row frame instead
        timing_df = pd.concat([timing_df, pd.DataFrame([d])], ignore_index=True)

print(timing_df)

timing_df.to_csv('timing_benchmarks.csv')
2 changes: 2 additions & 0 deletions run_performance_benchmark.sh
@@ -0,0 +1,2 @@
# Run the benchmark under cProfile and write the raw stats to perf_data.pstats
python -m cProfile -o perf_data.pstats performance_tests.py
# Render the collected stats as a call-graph image
gprof2dot -f pstats perf_data.pstats | dot -Tpng -o perfgraph.png
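# Assumed prerequisites, not installed by this script: gprof2dot (available from PyPI)
# and Graphviz, which provides the `dot` command used for the rendering step above.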