Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Implemented performance speedup for binary ReliefF + bug fixes #79

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions Visualize_Performance_Benchmarks.ipynb

Large diffs are not rendered by default.

3,201 changes: 3,201 additions & 0 deletions data/test_model_Models.txt_EDM-1_1.txt

Large diffs are not rendered by default.

314 changes: 314 additions & 0 deletions performance_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
import sys
import warnings
import numpy as np
import pandas as pd
import timeit

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar


warnings.filterwarnings('ignore')

np.random.seed(3249083)

genetic_data = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
sep='\t',
compression='gzip'
)
genetic_data = genetic_data.sample(frac=0.25)

# a larger dataset to be used for performance benchmarking sweeps
larger_data = pd.read_csv('data/test_model_Models.txt_EDM-1_1.txt', sep='\t')
larger_data.rename(columns={'Class': 'class'}, inplace=True)

# We're deliberately leaving all these unused datasets in here to make it
# easier to extend testing to more cases in the future.
genetic_data_cont_endpoint = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_continuous_endpoint_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
sep='\t',
compression='gzip'
)
genetic_data_cont_endpoint.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_cont_endpoint = genetic_data_cont_endpoint.sample(frac=0.25)

genetic_data_mixed_attributes = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_mixed_attribute_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
sep='\t',
compression='gzip'
)
genetic_data_mixed_attributes.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_mixed_attributes = genetic_data_mixed_attributes.sample(frac=0.25)

genetic_data_missing_values = pd.read_csv(
'data/GAMETES_Epistasis_2-Way_missing_values_0.1_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
sep='\t',
compression='gzip'
)
genetic_data_missing_values.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_missing_values = genetic_data_missing_values.sample(frac=0.25)

genetic_data_multiclass = pd.read_csv(
'data/3Class_Datasets_Loc_2_01.txt',
sep='\t'
)
genetic_data_multiclass.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_multiclass = genetic_data_multiclass.sample(frac=0.25)


features = genetic_data.drop('class', axis=1).values
labels = genetic_data['class'].values
headers = list(genetic_data.drop("class", axis=1))

larger_features = larger_data.drop('class', axis=1).values
larger_labels = larger_data['class'].values
larger_headers = list(larger_data.drop('class', axis=1))

features_cont_endpoint = genetic_data_cont_endpoint.drop(
'class',
axis=1
).values
labels_cont_endpoint = genetic_data_cont_endpoint['class'].values
headers_cont_endpoint = list(genetic_data_cont_endpoint.drop("class", axis=1))

features_mixed_attributes = genetic_data_mixed_attributes.drop(
'class',
axis=1
).values
labels_mixed_attributes = genetic_data_mixed_attributes['class'].values
headers_mixed_attributes = list(
genetic_data_mixed_attributes.drop("class", axis=1)
)

features_missing_values = genetic_data_missing_values.drop(
'class',
axis=1
).values
labels_missing_values = genetic_data_missing_values['class'].values
headers_missing_values = list(
genetic_data_missing_values.drop("class", axis=1)
)

features_multiclass, labels_multiclass = genetic_data_multiclass.drop(
'class', axis=1).values, genetic_data_multiclass['class'].values
headers_multiclass = list(genetic_data_multiclass.drop("class", axis=1))


# Basic tests for performance of the algorithms in parallel and serial modes
def test_relieff():
"""
Runtime: Data (Binary Endpoint, Discrete Features): ReliefF Serial
"""
np.random.seed(49082)

alg = ReliefF(n_features_to_select=2, n_neighbors=10)
alg.fit(features, labels)


def test_relieff_parallel():
"""
Runtime: Data (Binary Endpoint, Discrete Features): ReliefF parallel
"""
np.random.seed(49082)

alg = ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1)
alg.fit(features, labels)

def test_relieff_parallel_larger_data():
"""
Runtime: Data (Binary Endpoint, Discrete Features):
ReliefF parallel larger data
"""
np.random.seed(49082)

alg = ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1)
alg.fit(larger_features, larger_labels)

def test_relieffpercent():
"""
Runtime: Data (Binary Endpoint, Discrete Features):
ReliefF with % neighbors
"""
np.random.seed(49082)

alg = ReliefF(n_features_to_select=2, n_neighbors=0.1)
alg.fit(features, labels)


def test_surf():
"""
Runtime: Data (Binary Endpoint, Discrete Features): SURF serial
"""
np.random.seed(240932)

alg = SURF(n_features_to_select=2)
alg.fit(features, labels)


def test_surf_parallel():
"""
Runtime: Data (Binary Endpoint, Discrete Features): SURF parallel
"""
np.random.seed(240932)

alg = SURF(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


def test_surfstar():
"""
Runtime: Data (Binary Endpoint, Discrete Features): SURF* serial
"""
np.random.seed(9238745)

alg = SURFstar(n_features_to_select=2)
alg.fit(features, labels)


def test_surfstar_parallel():
"""
Runtime: Data (Binary Endpoint, Discrete Features): SURF* parallel
"""
np.random.seed(9238745)

alg = SURFstar(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


def test_multisurfstar():
"""
Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF* serial
"""
np.random.seed(320931)

alg = MultiSURFstar(n_features_to_select=2)
alg.fit(features, labels)


def test_multisurfstar_parallel():
"""
Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF* parallel
"""
np.random.seed(320931)

alg = MultiSURFstar(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


def test_multisurf():
"""
Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF serial
"""
np.random.seed(320931)

alg = MultiSURF(n_features_to_select=2)
alg.fit(features, labels)


def test_multisurf_parallel():
"""
Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF parallel
"""
np.random.seed(320931)

alg = MultiSURF(n_features_to_select=2, n_jobs=-1)
alg.fit(features, labels)


def run_parameter_sweep_binary_discrete():
"""
Run a parameter sweep across several combinations of different row and col
sizes for our test dataset. Saves a pandas df to disk that we can use
for visualizations to confirm performance characteristics.

To utilize, run `python performance_tests.py sweep`
"""
row_sizes = [200, 400, 800, 1600, 3200]
attr_sizes = [25, 50, 100, 200, 400]

param_sweep_data = pd.DataFrame(columns=attr_sizes, index=row_sizes)

for row_size in row_sizes:
for attr_size in attr_sizes:
# import pdb; pdb.set_trace()
# sample down to the desired number of rows
frac_to_sample = row_size / larger_data.shape[0]
data_sample = larger_data.sample(frac=frac_to_sample)

# sample down to the desired number of attr_size
# len(cols) - attr_size - 1, the 1 is for the class column
data_sample = data_sample.iloc[
:,
(len(data_sample.columns)-attr_size-1):
]

features, labels = (
data_sample.drop('class', axis=1).values,
data_sample['class'].values
)
headers = list(data_sample.drop("class", axis=1))

np.random.seed(49082)
def run_algorithm():
alg = ReliefF(
n_features_to_select=2,
n_neighbors=10,
n_jobs=-1
)
alg.fit(features, labels)

timing = timeit.repeat(run_algorithm, number=1, repeat=5)

param_sweep_data.loc[row_size,attr_size] = np.mean(timing[1:])

print('%s rows, %s attributes: %s seconds' % (
row_size,
attr_size,
np.mean(timing[1:]))
)

param_sweep_data.to_csv('param_sweep_data.csv')

test_cases = [
test_relieff,
test_relieff_parallel,
test_relieff_parallel_larger_data,
test_relieffpercent,
# test_surf,
# test_surf_parallel,
# test_surfstar,
# test_surfstar_parallel,
# test_multisurfstar,
# test_multisurfstar_parallel,
# test_multisurf,
# test_multisurf_parallel
]

if __name__ == '__main__':

if len(sys.argv) > 1 and sys.argv[1] == 'sweep':
run_parameter_sweep_binary_discrete()
else:
timing_df = pd.DataFrame(columns=['test_case', 'mean', 'std'])

for test_case in test_cases:
timing = timeit.repeat(test_case, number=1, repeat=5)
# ignore the first test to avoid high initial overhead
# to compile numba functions with small datasets
timing = timing[1:]
print(test_case.__name__, np.mean(timing), np.std(timing))
d = {
'test_case' : test_case.__name__,
'mean' : np.mean(timing),
'std' : np.std(timing)
}
timing_df = timing_df.append(d, ignore_index = True)

print(timing_df)

timing_df.to_csv('timing_benchmarks.csv')
31 changes: 31 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
astroid==2.9.0
flake8==4.0.1
gprof2dot==2021.2.21
isort==5.10.1
joblib==1.1.0
lazy-object-proxy==1.7.1
llvmlite==0.37.0
mccabe==0.6.1
mypy==0.920
mypy-extensions==0.4.3
nose==1.3.7
numba==0.54.1
numpy==1.20.3
pandas==1.3.4
platformdirs==2.4.0
pycodestyle==2.8.0
pydocstyle==6.1.1
pyflakes==2.4.0
pylint==2.12.2
python-dateutil==2.8.2
pytz==2021.3
scikit-learn==1.0.1
scipy==1.7.3
six==1.16.0
sklearn==0.0
snowballstemmer==2.2.0
threadpoolctl==3.0.0
toml==0.10.2
tomli==2.0.0
typing_extensions==4.0.1
wrapt==1.13.3
2 changes: 2 additions & 0 deletions run_performance_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python -m cProfile -o perf_data.pstats performance_tests.py
gprof2dot -f pstats perf_data.pstats | dot -Tpng -o perfgraph.png
Loading