Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for pandas DataFrames #73

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion skrebate/relieff.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import warnings
import sys
from sklearn.base import BaseEstimator
from sklearn.utils import check_array, column_or_1d
from joblib import Parallel, delayed
from .scoring_utils import get_row_missing, ReliefF_compute_scores, get_row_missing_iter

Expand Down Expand Up @@ -99,8 +100,9 @@ def fit(self, X, y, weights=None):
-------
Copy of the ReliefF instance
"""
X = check_array(X, force_all_finite=False)
self._X = X # matrix of predictive variables ('independent variables')
self._y = y # vector of values for outcome variable ('dependent variable')
self._y = column_or_1d(y) # vector of values for outcome variable ('dependent variable')
if isinstance(weights, np.ndarray):
if isinstance(weights, np.ndarray):
if len(weights) != len(X[0]):
Expand Down Expand Up @@ -246,6 +248,7 @@ def transform(self, X):
X_reduced: array-like {n_samples, n_features_to_select}
Reduced feature matrix
"""
X = check_array(X, force_all_finite=False)
if self._num_attributes < self.n_features_to_select:
raise ValueError('Number of features to select is larger than the number of features in the dataset.')

Expand Down
6 changes: 5 additions & 1 deletion skrebate/turf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from sklearn.base import BaseEstimator
from sklearn.utils import check_array, column_or_1d
import copy
import numpy as np


class TURF(BaseEstimator):

def __init__(self,relief_object,pct=0.5,num_scores_to_return=100):
Expand Down Expand Up @@ -35,6 +37,8 @@ def fit(self, X, y):
-------
self
"""
X = check_array(X, force_all_finite=False)
y = column_or_1d(y)
#Adjust num_scores_to_return
num_features = X.shape[1]
self.num_scores_to_return = min(self.num_scores_to_return,num_features)
Expand Down Expand Up @@ -141,9 +145,9 @@ def check_is_float(self, num):
return False

    def transform(self, X):
        """Reduce X to the features selected by the wrapped Relief estimator.

        Parameters
        ----------
        X : array-like {n_samples, n_features}
            Feature matrix; coerced to an ndarray (NaNs allowed, so data with
            missing values passes through to the imputation step downstream).

        Returns
        -------
        X_reduced : array-like {n_samples, n_features_to_select}
            Columns of X corresponding to the top-ranked features.

        Raises
        ------
        ValueError
            If fewer features are present than ``n_features_to_select``.
        """
        # Accept pandas DataFrames / lists by converting to ndarray;
        # force_all_finite=False keeps NaN-marked missing values intact.
        X = check_array(X, force_all_finite=False)
        if X.shape[1] < self.relief_object.n_features_to_select:
            raise ValueError('Number of features to select is larger than the number of features in the dataset.')

        # top_features_ is the fit-time ranking; take the best-scoring columns.
        return X[:, self.top_features_[:self.relief_object.n_features_to_select]]

def fit_transform(self, X, y):
Expand Down
4 changes: 4 additions & 0 deletions skrebate/vls.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from sklearn.base import BaseEstimator
from sklearn.utils import check_array, column_or_1d
import copy
import random
import numpy as np
Expand Down Expand Up @@ -40,6 +41,8 @@ def fit(self, X, y,weights=None):
-------
self
"""
X = check_array(X, force_all_finite=False)
y = column_or_1d(y)
#random_state
if self.random_state != None:
np.random.seed(self.random_state)
Expand Down Expand Up @@ -155,6 +158,7 @@ def check_is_float(self, num):
return False

def transform(self, X):
X = check_array(X, force_all_finite=False)
if X.shape[1] < self.relief_object.n_features_to_select:
raise ValueError('Number of features to select is larger than the number of features in the dataset.')

Expand Down
87 changes: 75 additions & 12 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
Expand Down Expand Up @@ -60,7 +60,8 @@
genetic_data_multiclass = genetic_data_multiclass.sample(frac=0.25)


features, labels = genetic_data.drop('class', axis=1).values, genetic_data['class'].values
features_df, labels_s = genetic_data.drop('class', axis=1), genetic_data['class']
features, labels = features_df.values, labels_s.values
headers = list(genetic_data.drop("class", axis=1))

features_cont_endpoint, labels_cont_endpoint = genetic_data_cont_endpoint.drop(
Expand Down Expand Up @@ -290,7 +291,7 @@ def test_relieff_pipeline_multiclass():
np.random.seed(49082)

clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -302,7 +303,7 @@ def test_surf_pipeline_multiclass():
np.random.seed(240932)

clf = make_pipeline(SURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -314,7 +315,7 @@ def test_surfstar_pipeline_multiclass():
np.random.seed(9238745)

clf = make_pipeline(SURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -326,7 +327,7 @@ def test_multisurfstar_pipeline_multiclass():
np.random.seed(320931)

clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand All @@ -338,7 +339,7 @@ def test_multisurf_pipeline_multiclass():
np.random.seed(320931)

clf = make_pipeline(MultiSURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_multiclass,
Expand Down Expand Up @@ -466,7 +467,7 @@ def test_relieff_pipeline_missing_values():
np.random.seed(49082)

clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -478,7 +479,7 @@ def test_surf_pipeline_missing_values():
np.random.seed(240932)

clf = make_pipeline(SURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -490,7 +491,7 @@ def test_surfstar_pipeline_missing_values():
np.random.seed(9238745)

clf = make_pipeline(SURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -502,7 +503,7 @@ def test_multisurfstar_pipeline_missing_values():
np.random.seed(320931)

clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
Expand All @@ -514,8 +515,70 @@ def test_multisurf_pipeline_missing_values():
np.random.seed(320931)

clf = make_pipeline(MultiSURF(n_features_to_select=2),
Imputer(),
SimpleImputer(),
RandomForestClassifier(n_estimators=100, n_jobs=-1))

assert np.mean(cross_val_score(clf, features_missing_values,
labels_missing_values, cv=3, n_jobs=-1)) > 0.7

# Test Dataframe handling:


def test_relieff_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): ReliefF works with pandas DataFrame and Series inputs"""
    np.random.seed(49082)
    # Build the pipeline from named stages rather than inline expressions.
    selector = ReliefF(n_features_to_select=2, n_neighbors=10)
    estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, estimator)
    # Feed the DataFrame/Series fixtures directly; accuracy must beat 0.7.
    scores = cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7


def test_surf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF works with pandas DataFrame and Series inputs"""
    np.random.seed(240932)
    # Named stages instead of nested constructor calls.
    selector = SURF(n_features_to_select=2)
    estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, estimator)
    # DataFrame/Series inputs go straight into cross-validation.
    scores = cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7


def test_surfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF* works with pandas DataFrame and Series inputs"""
    np.random.seed(9238745)
    # Assemble the feature-selection + classification pipeline stepwise.
    selector = SURFstar(n_features_to_select=2)
    estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, estimator)
    scores = cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7


def test_multisurfstar_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): MultiSURF* works with pandas DataFrame and Series inputs"""
    np.random.seed(320931)
    # Compose the pipeline from explicitly named components.
    selector = MultiSURFstar(n_features_to_select=2)
    estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, estimator)
    scores = cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7


def test_multisurf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): MultiSURF works with pandas DataFrame and Series inputs"""
    np.random.seed(320931)
    # Separate construction of selector and classifier before piping them.
    selector = MultiSURF(n_features_to_select=2)
    estimator = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    clf = make_pipeline(selector, estimator)
    scores = cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)
    assert np.mean(scores) > 0.7


# def test_turf_pandas_inputs():
# """Check: Data (pandas DataFrame/Series): TuRF works with pandas DataFrame and Series inputs"""
# np.random.seed(320931)
# clf = make_pipeline(TuRF(core_algorithm="ReliefF", n_features_to_select=2, pct=0.5, n_neighbors=100),
# RandomForestClassifier(n_estimators=100, n_jobs=-1))
# assert np.mean(cross_val_score(clf, features, labels, fit_params={'turf__headers': headers}, cv=3, n_jobs=-1)) > 0.7


# def test_vlsrelief_pandas_inputs():
# """Check: Data (pandas DataFrame/Series): VLSRelief works with pandas DataFrame and Series inputs"""
# np.random.seed(49082)
# clf = make_pipeline(VLSRelief(core_algorithm="ReliefF", n_features_to_select=2, n_neighbors=100),
# RandomForestClassifier(n_estimators=100, n_jobs=-1))
# assert np.mean(cross_val_score(clf, features, labels, fit_params={'vlsrelief__headers': headers}, cv=3, n_jobs=-1)) > 0.7