Skip to content

Commit

Permalink
Add support for pandas DataFrames - EpistasisLab#53
Browse files Browse the repository at this point in the history
  • Loading branch information
aadu committed Aug 7, 2020
1 parent 7795389 commit 312566a
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 25 deletions.
7 changes: 4 additions & 3 deletions skrebate/relieff.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import warnings
import sys
from sklearn.base import BaseEstimator
from sklearn.utils import check_array, column_or_1d
from joblib import Parallel, delayed
from .scoring_utils import get_row_missing, ReliefF_compute_scores

Expand Down Expand Up @@ -91,8 +92,8 @@ def fit(self, X, y):
-------
Copy of the ReliefF instance
"""
self._X = X # matrix of predictive variables ('independent variables')
self._y = y # vector of values for outcome variable ('dependent variable')
self._X = check_array(X, force_all_finite=False) # matrix of predictive variables ('independent variables')
self._y = column_or_1d(y) # vector of values for outcome variable ('dependent variable')

# Set up the properties for ReliefF -------------------------------------------------------------------------------------
self._datalen = len(self._X) # Number of training instances ('n')
Expand Down Expand Up @@ -220,7 +221,7 @@ def transform(self, X):
if self._num_attributes < self.n_features_to_select:
raise ValueError('Number of features to select is larger than the number of features in the dataset.')

return X[:, self.top_features_[:self.n_features_to_select]]
return check_array(X, force_all_finite=False)[:, self.top_features_[:self.n_features_to_select]]

#=========================================================================#
def fit_transform(self, X, y):
Expand Down
10 changes: 5 additions & 5 deletions skrebate/turf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import time
import warnings
import sys
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, column_or_1d
# from sklearn.feature_selection.base import SelectorMixin
from joblib import Parallel, delayed
# from .scoring_utils import get_row_missing, ReliefF_compute_scores
Expand Down Expand Up @@ -72,8 +72,8 @@ def fit(self, X, y, headers):
Copy of the TuRF instance
"""

self.X_mat = X
self._y = y
self.X_mat = check_array(X, force_all_finite=False)
self._y = column_or_1d(y)
self.headers = headers
self._num_attributes = len(self.X_mat[0])
self._lost = {}
Expand Down Expand Up @@ -184,7 +184,7 @@ def transform(self, X):
if self._num_attributes < self.n_features_to_select:
raise ValueError('Number of features to select is larger than the number of features in the dataset.')

return X[:, self.top_features_[:self.n_features_to_select]]
return check_array(X, force_all_finite=False)[:, self.top_features_[:self.n_features_to_select]]
#return X[:, self.top_features_]

#=========================================================================#
Expand Down
10 changes: 5 additions & 5 deletions skrebate/vlsrelief.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import time
import warnings
import sys
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array, column_or_1d
# from sklearn.feature_selection.base import SelectorMixin
from joblib import Parallel, delayed
# from .scoring_utils import get_row_missing, ReliefF_compute_scores
Expand Down Expand Up @@ -77,8 +77,8 @@ def fit(self, X, y, headers):
Copy of the VLSRelief instance
"""

self.X_mat = X
self._y = y
self.X_mat = check_array(X, force_all_finite=False)
self._y = column_or_1d(y)
self.headers = headers

if self.core_algorithm.lower() == "multisurf":
Expand Down Expand Up @@ -152,7 +152,7 @@ def transform(self, X):
Reduced feature matrix
"""

return X[:, self.top_features_[:self.n_features_to_select]]
return check_array(X, force_all_finite=False)[:, self.top_features_[:self.n_features_to_select]]

# return X[:, self.top_features_]

Expand Down
87 changes: 75 additions & 12 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
Expand Down Expand Up @@ -60,7 +60,8 @@
genetic_data_multiclass = genetic_data_multiclass.sample(frac=0.25)


features, labels = genetic_data.drop('class', axis=1).values, genetic_data['class'].values
features_df, labels_s = genetic_data.drop('class', axis=1), genetic_data['class']
features, labels = features_df.values, labels_s.values
headers = list(genetic_data.drop("class", axis=1))

features_cont_endpoint, labels_cont_endpoint = genetic_data_cont_endpoint.drop(
Expand Down Expand Up @@ -290,7 +291,7 @@ def test_relieff_pipeline_multiclass():
np.random.seed(49082)

clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -302,7 +303,7 @@ def test_surf_pipeline_multiclass():
np.random.seed(240932)

clf = make_pipeline(SURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -314,7 +315,7 @@ def test_surfstar_pipeline_multiclass():
np.random.seed(9238745)

clf = make_pipeline(SURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -326,7 +327,7 @@ def test_multisurfstar_pipeline_multiclass():
np.random.seed(320931)

clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -338,7 +339,7 @@ def test_multisurf_pipeline_multiclass():
np.random.seed(320931)

clf = make_pipeline(MultiSURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand Down Expand Up @@ -466,7 +467,7 @@ def test_relieff_pipeline_missing_values():
np.random.seed(49082)

clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -478,7 +479,7 @@ def test_surf_pipeline_missing_values():
np.random.seed(240932)

clf = make_pipeline(SURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -490,7 +491,7 @@ def test_surfstar_pipeline_missing_values():
np.random.seed(9238745)

clf = make_pipeline(SURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -502,7 +503,7 @@ def test_multisurfstar_pipeline_missing_values():
np.random.seed(320931)

clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -514,8 +515,70 @@ def test_multisurf_pipeline_missing_values():
np.random.seed(320931)

clf = make_pipeline(MultiSURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
labels_missing_values, cv=3, n_jobs=-1)) > 0.7

# Test Dataframe handling:


def test_relieff_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): ReliefF accepts a DataFrame of features and a Series of labels"""
    np.random.seed(49082)

    # Feature selection feeds straight into the classifier; no imputer is
    # placed in between.
    selector = ReliefF(n_features_to_select=2, n_neighbors=10)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    # The pandas objects are handed over as-is so the estimator's own input
    # conversion is what gets exercised.
    fold_scores = cross_val_score(pipeline, features_df, labels_s,
                                  cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7


def test_surf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF accepts a DataFrame of features and a Series of labels"""
    np.random.seed(240932)

    selector = SURF(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    # Pass the DataFrame/Series directly (not .values) through 3-fold CV.
    fold_scores = cross_val_score(pipeline, features_df, labels_s,
                                  cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7


def test_surfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF* accepts a DataFrame of features and a Series of labels"""
    np.random.seed(9238745)

    selector = SURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    # Pass the DataFrame/Series directly (not .values) through 3-fold CV.
    fold_scores = cross_val_score(pipeline, features_df, labels_s,
                                  cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7


def test_multisurfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): MultiSURF* accepts a DataFrame of features and a Series of labels"""
    np.random.seed(320931)

    selector = MultiSURFstar(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    # Pass the DataFrame/Series directly (not .values) through 3-fold CV.
    fold_scores = cross_val_score(pipeline, features_df, labels_s,
                                  cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7


def test_multisurf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): MultiSURF accepts a DataFrame of features and a Series of labels"""
    np.random.seed(320931)

    selector = MultiSURF(n_features_to_select=2)
    forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    pipeline = make_pipeline(selector, forest)

    # Pass the DataFrame/Series directly (not .values) through 3-fold CV.
    fold_scores = cross_val_score(pipeline, features_df, labels_s,
                                  cv=3, n_jobs=-1)
    assert np.mean(fold_scores) > 0.7


# def test_turf_pandas_inputs():
# """Check: Data (pandas DataFrame/Series): TuRF works with pandas DataFrame and Series inputs"""
# np.random.seed(320931)
# clf = make_pipeline(TuRF(core_algorithm="ReliefF", n_features_to_select=2, pct=0.5, n_neighbors=100),
# RandomForestClassifier(n_estimators=100, n_jobs=-1))
# assert np.mean(cross_val_score(clf, features_df, labels_s, fit_params={'turf__headers': headers}, cv=3, n_jobs=-1)) > 0.7


# def test_vlsrelief_pandas_inputs():
# """Check: Data (pandas DataFrame/Series): VLSRelief works with pandas DataFrame and Series inputs"""
# np.random.seed(49082)
# clf = make_pipeline(VLSRelief(core_algorithm="ReliefF", n_features_to_select=2, n_neighbors=100),
# RandomForestClassifier(n_estimators=100, n_jobs=-1))
# assert np.mean(cross_val_score(clf, features_df, labels_s, fit_params={'vlsrelief__headers': headers}, cv=3, n_jobs=-1)) > 0.7

0 comments on commit 312566a

Please sign in to comment.