ENH: Implemented performance speedup for binary ReliefF + bug fixes
CaptainKanuk committed Dec 17, 2021
1 parent 1679885 commit 05fb82e
Showing 9 changed files with 4,173 additions and 167 deletions.
134 changes: 134 additions & 0 deletions Visualize_Performance_Benchmarks.ipynb

Large diffs are not rendered by default.

3,201 changes: 3,201 additions & 0 deletions data/test_model_Models.txt_EDM-1_1.txt

Large diffs are not rendered by default.

314 changes: 314 additions & 0 deletions performance_tests.py
@@ -0,0 +1,314 @@
import sys
import warnings
import numpy as np
import pandas as pd
import timeit

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar


warnings.filterwarnings('ignore')

np.random.seed(3249083)

genetic_data = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1.tsv.gz',
    sep='\t',
    compression='gzip'
)
genetic_data = genetic_data.sample(frac=0.25)

# a larger dataset to be used for performance benchmarking sweeps
larger_data = pd.read_csv('data/test_model_Models.txt_EDM-1_1.txt', sep='\t')
larger_data.rename(columns={'Class': 'class'}, inplace=True)

# We're deliberately leaving all these unused datasets in here to make it
# easier to extend testing to more cases in the future.
genetic_data_cont_endpoint = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_continuous_endpoint_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    sep='\t',
    compression='gzip'
)
genetic_data_cont_endpoint.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_cont_endpoint = genetic_data_cont_endpoint.sample(frac=0.25)

genetic_data_mixed_attributes = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_mixed_attribute_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    sep='\t',
    compression='gzip'
)
genetic_data_mixed_attributes.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_mixed_attributes = genetic_data_mixed_attributes.sample(frac=0.25)

genetic_data_missing_values = pd.read_csv(
    'data/GAMETES_Epistasis_2-Way_missing_values_0.1_a_20s_1600her_0.4__maf_0.2_EDM-2_01.tsv.gz',
    sep='\t',
    compression='gzip'
)
genetic_data_missing_values.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_missing_values = genetic_data_missing_values.sample(frac=0.25)

genetic_data_multiclass = pd.read_csv(
    'data/3Class_Datasets_Loc_2_01.txt',
    sep='\t'
)
genetic_data_multiclass.rename(columns={'Class': 'class'}, inplace=True)
genetic_data_multiclass = genetic_data_multiclass.sample(frac=0.25)


features = genetic_data.drop('class', axis=1).values
labels = genetic_data['class'].values
headers = list(genetic_data.drop("class", axis=1))

larger_features = larger_data.drop('class', axis=1).values
larger_labels = larger_data['class'].values
larger_headers = list(larger_data.drop('class', axis=1))

features_cont_endpoint = genetic_data_cont_endpoint.drop(
    'class',
    axis=1
).values
labels_cont_endpoint = genetic_data_cont_endpoint['class'].values
headers_cont_endpoint = list(genetic_data_cont_endpoint.drop("class", axis=1))

features_mixed_attributes = genetic_data_mixed_attributes.drop(
    'class',
    axis=1
).values
labels_mixed_attributes = genetic_data_mixed_attributes['class'].values
headers_mixed_attributes = list(
    genetic_data_mixed_attributes.drop("class", axis=1)
)

features_missing_values = genetic_data_missing_values.drop(
    'class',
    axis=1
).values
labels_missing_values = genetic_data_missing_values['class'].values
headers_missing_values = list(
    genetic_data_missing_values.drop("class", axis=1)
)

features_multiclass = genetic_data_multiclass.drop('class', axis=1).values
labels_multiclass = genetic_data_multiclass['class'].values
headers_multiclass = list(genetic_data_multiclass.drop("class", axis=1))


# Basic tests for performance of the algorithms in parallel and serial modes
def test_relieff():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): ReliefF serial
    """
    np.random.seed(49082)

    alg = ReliefF(n_features_to_select=2, n_neighbors=10)
    alg.fit(features, labels)


def test_relieff_parallel():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): ReliefF parallel
    """
    np.random.seed(49082)

    alg = ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1)
    alg.fit(features, labels)


def test_relieff_parallel_larger_data():
    """
    Runtime: Data (Binary Endpoint, Discrete Features):
    ReliefF parallel, larger data
    """
    np.random.seed(49082)

    alg = ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1)
    alg.fit(larger_features, larger_labels)


def test_relieffpercent():
    """
    Runtime: Data (Binary Endpoint, Discrete Features):
    ReliefF with % neighbors
    """
    np.random.seed(49082)

    alg = ReliefF(n_features_to_select=2, n_neighbors=0.1)
    alg.fit(features, labels)


def test_surf():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): SURF serial
    """
    np.random.seed(240932)

    alg = SURF(n_features_to_select=2)
    alg.fit(features, labels)


def test_surf_parallel():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): SURF parallel
    """
    np.random.seed(240932)

    alg = SURF(n_features_to_select=2, n_jobs=-1)
    alg.fit(features, labels)


def test_surfstar():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): SURF* serial
    """
    np.random.seed(9238745)

    alg = SURFstar(n_features_to_select=2)
    alg.fit(features, labels)


def test_surfstar_parallel():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): SURF* parallel
    """
    np.random.seed(9238745)

    alg = SURFstar(n_features_to_select=2, n_jobs=-1)
    alg.fit(features, labels)


def test_multisurfstar():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF* serial
    """
    np.random.seed(320931)

    alg = MultiSURFstar(n_features_to_select=2)
    alg.fit(features, labels)


def test_multisurfstar_parallel():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF* parallel
    """
    np.random.seed(320931)

    alg = MultiSURFstar(n_features_to_select=2, n_jobs=-1)
    alg.fit(features, labels)


def test_multisurf():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF serial
    """
    np.random.seed(320931)

    alg = MultiSURF(n_features_to_select=2)
    alg.fit(features, labels)


def test_multisurf_parallel():
    """
    Runtime: Data (Binary Endpoint, Discrete Features): MultiSURF parallel
    """
    np.random.seed(320931)

    alg = MultiSURF(n_features_to_select=2, n_jobs=-1)
    alg.fit(features, labels)


def run_parameter_sweep_binary_discrete():
    """
    Run a parameter sweep across several combinations of row and column
    sizes for our test dataset. Saves a pandas DataFrame to disk that we
    can use for visualizations to confirm performance characteristics.
    To run it: `python performance_tests.py sweep`
    """
    row_sizes = [200, 400, 800, 1600, 3200]
    attr_sizes = [25, 50, 100, 200, 400]

    param_sweep_data = pd.DataFrame(columns=attr_sizes, index=row_sizes)

    for row_size in row_sizes:
        for attr_size in attr_sizes:
            # sample down to the desired number of rows
            frac_to_sample = row_size / larger_data.shape[0]
            data_sample = larger_data.sample(frac=frac_to_sample)

            # keep the last attr_size attribute columns; the extra -1 also
            # keeps the trailing 'class' column
            data_sample = data_sample.iloc[
                :,
                (len(data_sample.columns) - attr_size - 1):
            ]

            features, labels = (
                data_sample.drop('class', axis=1).values,
                data_sample['class'].values
            )
            headers = list(data_sample.drop("class", axis=1))

            np.random.seed(49082)

            def run_algorithm():
                alg = ReliefF(
                    n_features_to_select=2,
                    n_neighbors=10,
                    n_jobs=-1
                )
                alg.fit(features, labels)

            timing = timeit.repeat(run_algorithm, number=1, repeat=5)

            # drop the first run, which includes numba compilation overhead
            param_sweep_data.loc[row_size, attr_size] = np.mean(timing[1:])

            print('%s rows, %s attributes: %s seconds' % (
                row_size,
                attr_size,
                np.mean(timing[1:])
            ))

    param_sweep_data.to_csv('param_sweep_data.csv')

test_cases = [
    test_relieff,
    test_relieff_parallel,
    test_relieff_parallel_larger_data,
    test_relieffpercent,
    # test_surf,
    # test_surf_parallel,
    # test_surfstar,
    # test_surfstar_parallel,
    # test_multisurfstar,
    # test_multisurfstar_parallel,
    # test_multisurf,
    # test_multisurf_parallel
]

if __name__ == '__main__':

    if len(sys.argv) > 1 and sys.argv[1] == 'sweep':
        run_parameter_sweep_binary_discrete()
    else:
        timing_rows = []

        for test_case in test_cases:
            timing = timeit.repeat(test_case, number=1, repeat=5)
            # ignore the first run to avoid the high one-time overhead of
            # compiling the numba functions with small datasets
            timing = timing[1:]
            print(test_case.__name__, np.mean(timing), np.std(timing))
            timing_rows.append({
                'test_case': test_case.__name__,
                'mean': np.mean(timing),
                'std': np.std(timing)
            })

        # build the frame in one pass; DataFrame.append is deprecated in
        # pandas releases newer than the one pinned in requirements.txt
        timing_df = pd.DataFrame(timing_rows, columns=['test_case', 'mean', 'std'])
        print(timing_df)

        timing_df.to_csv('timing_benchmarks.csv')
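
For reference, here is a minimal sketch of how the sweep output could be visualized as a heatmap, in the spirit of the Visualize_Performance_Benchmarks.ipynb notebook added in this commit. It assumes matplotlib is installed (it is not pinned in requirements.txt), and the output filename is illustrative:

# Sketch: load the sweep output written by run_parameter_sweep_binary_discrete
# and render the mean runtimes as a heatmap. Assumes matplotlib is available.
import pandas as pd
import matplotlib.pyplot as plt

sweep = pd.read_csv('param_sweep_data.csv', index_col=0)

fig, ax = plt.subplots()
im = ax.imshow(sweep.values.astype(float), cmap='viridis')
ax.set_xticks(range(len(sweep.columns)))
ax.set_xticklabels(sweep.columns)
ax.set_yticks(range(len(sweep.index)))
ax.set_yticklabels(sweep.index)
ax.set_xlabel('number of attributes')
ax.set_ylabel('number of rows')
ax.set_title('ReliefF mean runtime (seconds)')
fig.colorbar(im, ax=ax, label='seconds')
fig.savefig('param_sweep_heatmap.png')  # hypothetical output name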
31 changes: 31 additions & 0 deletions requirements.txt
@@ -0,0 +1,31 @@
astroid==2.9.0
flake8==4.0.1
gprof2dot==2021.2.21
isort==5.10.1
joblib==1.1.0
lazy-object-proxy==1.7.1
llvmlite==0.37.0
mccabe==0.6.1
mypy==0.920
mypy-extensions==0.4.3
nose==1.3.7
numba==0.54.1
numpy==1.20.3
pandas==1.3.4
platformdirs==2.4.0
pycodestyle==2.8.0
pydocstyle==6.1.1
pyflakes==2.4.0
pylint==2.12.2
python-dateutil==2.8.2
pytz==2021.3
scikit-learn==1.0.1
scipy==1.7.3
six==1.16.0
sklearn==0.0
snowballstemmer==2.2.0
threadpoolctl==3.0.0
toml==0.10.2
tomli==2.0.0
typing_extensions==4.0.1
wrapt==1.13.3
2 changes: 2 additions & 0 deletions run_performance_benchmark.sh
@@ -0,0 +1,2 @@
python -m cProfile -o perf_data.pstats performance_tests.py
gprof2dot -f pstats perf_data.pstats | dot -Tpng -o perfgraph.png
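
The script profiles the full benchmark suite with cProfile and renders the resulting call graph with gprof2dot (pinned in requirements.txt) and Graphviz dot. The same pipeline can be pointed at the parameter sweep instead, since cProfile forwards arguments to the script; a sketch, with hypothetical output filenames:

python -m cProfile -o sweep_perf.pstats performance_tests.py sweep
gprof2dot -f pstats sweep_perf.pstats | dot -Tpng -o sweep_perfgraph.png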