diff --git a/skrebate/relieff.py b/skrebate/relieff.py index b190056..5c64b99 100644 --- a/skrebate/relieff.py +++ b/skrebate/relieff.py @@ -27,6 +27,7 @@ import warnings import sys from sklearn.base import BaseEstimator +from sklearn.utils import check_array, column_or_1d from joblib import Parallel, delayed from .scoring_utils import get_row_missing, ReliefF_compute_scores, get_row_missing_iter @@ -99,8 +100,9 @@ def fit(self, X, y, weights=None): ------- Copy of the ReliefF instance """ + X = check_array(X, force_all_finite=False) self._X = X # matrix of predictive variables ('independent variables') - self._y = y # vector of values for outcome variable ('dependent variable') + self._y = column_or_1d(y) # vector of values for outcome variable ('dependent variable') if isinstance(weights, np.ndarray): if isinstance(weights, np.ndarray): if len(weights) != len(X[0]): @@ -246,6 +248,7 @@ def transform(self, X): X_reduced: array-like {n_samples, n_features_to_select} Reduced feature matrix """ + X = check_array(X, force_all_finite=False) if self._num_attributes < self.n_features_to_select: raise ValueError('Number of features to select is larger than the number of features in the dataset.') diff --git a/skrebate/turf.py b/skrebate/turf.py index 5201663..289a69a 100644 --- a/skrebate/turf.py +++ b/skrebate/turf.py @@ -1,7 +1,9 @@ from sklearn.base import BaseEstimator +from sklearn.utils import check_array, column_or_1d import copy import numpy as np + class TURF(BaseEstimator): def __init__(self,relief_object,pct=0.5,num_scores_to_return=100): @@ -35,6 +37,8 @@ def fit(self, X, y): ------- self """ + X = check_array(X, force_all_finite=False) + y = column_or_1d(y) #Adjust num_scores_to_return num_features = X.shape[1] self.num_scores_to_return = min(self.num_scores_to_return,num_features) @@ -141,9 +145,9 @@ def check_is_float(self, num): return False def transform(self, X): + X = check_array(X, force_all_finite=False) if X.shape[1] < self.relief_object.n_features_to_select: raise ValueError('Number of features to select is larger than the number of features in the dataset.') - return X[:, self.top_features_[:self.relief_object.n_features_to_select]] def fit_transform(self, X, y): diff --git a/skrebate/vls.py b/skrebate/vls.py index d3e351f..d882ff9 100644 --- a/skrebate/vls.py +++ b/skrebate/vls.py @@ -1,4 +1,5 @@ from sklearn.base import BaseEstimator +from sklearn.utils import check_array, column_or_1d import copy import random import numpy as np @@ -40,6 +41,8 @@ def fit(self, X, y,weights=None): ------- self """ + X = check_array(X, force_all_finite=False) + y = column_or_1d(y) #random_state if self.random_state != None: np.random.seed(self.random_state) @@ -155,6 +158,7 @@ def check_is_float(self, num): return False def transform(self, X): + X = check_array(X, force_all_finite=False) if X.shape[1] < self.relief_object.n_features_to_select: raise ValueError('Number of features to select is larger than the number of features in the dataset.') diff --git a/tests.py b/tests.py index 3ba82a4..c0a05b6 100644 --- a/tests.py +++ b/tests.py @@ -26,7 +26,7 @@ from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar from sklearn.pipeline import make_pipeline from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.preprocessing import Imputer +from sklearn.impute import SimpleImputer from sklearn.model_selection import cross_val_score import pandas as pd import numpy as np @@ -60,7 +60,8 @@ genetic_data_multiclass = genetic_data_multiclass.sample(frac=0.25) -features, labels = genetic_data.drop('class', axis=1).values, genetic_data['class'].values +features_df, labels_s = genetic_data.drop('class', axis=1), genetic_data['class'] +features, labels = features_df.values, labels_s.values headers = list(genetic_data.drop("class", axis=1)) features_cont_endpoint, labels_cont_endpoint = genetic_data_cont_endpoint.drop( @@ -290,7 +291,7 @@ def test_relieff_pipeline_multiclass(): np.random.seed(49082) clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_multiclass, @@ -302,7 +303,7 @@ def test_surf_pipeline_multiclass(): np.random.seed(240932) clf = make_pipeline(SURF(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_multiclass, @@ -314,7 +315,7 @@ def test_surfstar_pipeline_multiclass(): np.random.seed(9238745) clf = make_pipeline(SURFstar(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_multiclass, @@ -326,7 +327,7 @@ def test_multisurfstar_pipeline_multiclass(): np.random.seed(320931) clf = make_pipeline(MultiSURFstar(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_multiclass, @@ -338,7 +339,7 @@ def test_multisurf_pipeline_multiclass(): np.random.seed(320931) clf = make_pipeline(MultiSURF(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_multiclass, @@ -466,7 +467,7 @@ def test_relieff_pipeline_missing_values(): np.random.seed(49082) clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_missing_values, @@ -478,7 +479,7 @@ def test_surf_pipeline_missing_values(): np.random.seed(240932) clf = make_pipeline(SURF(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_missing_values, @@ -490,7 +491,7 @@ def test_surfstar_pipeline_missing_values(): np.random.seed(9238745) clf = make_pipeline(SURFstar(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_missing_values, @@ -502,7 +503,7 @@ def test_multisurfstar_pipeline_missing_values(): np.random.seed(320931) clf = make_pipeline(MultiSURFstar(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_missing_values, @@ -514,8 +515,70 @@ def test_multisurf_pipeline_missing_values(): np.random.seed(320931) clf = make_pipeline(MultiSURF(n_features_to_select=2), - Imputer(), + SimpleImputer(), RandomForestClassifier(n_estimators=100, n_jobs=-1)) assert np.mean(cross_val_score(clf, features_missing_values, labels_missing_values, cv=3, n_jobs=-1)) > 0.7 + +# Test Dataframe handling: + + +def test_relieff_pandas_inputs(): + """Check: Data (pandas DataFrame/Series): ReliefF works with pandas DataFrame and Series inputs""" + np.random.seed(49082) + clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10), + RandomForestClassifier(n_estimators=100, n_jobs=-1)) + assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)) > 0.7 + + +def test_surf_pandas_inputs(): + """Check: Data (pandas DataFrame/Series): SURF works with pandas DataFrame and Series inputs""" + np.random.seed(240932) + clf = make_pipeline(SURF(n_features_to_select=2), + RandomForestClassifier(n_estimators=100, n_jobs=-1)) + assert np.mean(cross_val_score(clf, features_df, + labels_s, cv=3, n_jobs=-1)) > 0.7 + + +def test_surfstar_pandas_inputs(): + """Check: Data (pandas DataFrame/Series): SURF* works with pandas DataFrame and Series inputs""" + np.random.seed(9238745) + clf = make_pipeline(SURFstar(n_features_to_select=2), + RandomForestClassifier(n_estimators=100, n_jobs=-1)) + assert np.mean(cross_val_score(clf, features_df, + labels_s, cv=3, n_jobs=-1)) > 0.7 + + +def test_multisurfstar_pandas_inputs(): + """Check: Data (pandas DataFrame/Series): MultiSURF* works with pandas DataFrame and Series inputs""" + np.random.seed(320931) + clf = make_pipeline(MultiSURFstar(n_features_to_select=2), + RandomForestClassifier(n_estimators=100, n_jobs=-1)) + assert np.mean(cross_val_score(clf, features_df, + labels_s, cv=3, n_jobs=-1)) > 0.7 + + +def test_multisurf_pandas_inputs(): + """Check: Data (pandas DataFrame/Series): MultiSURF works with pandas DataFrame and Series inputs""" + np.random.seed(320931) + clf = make_pipeline(MultiSURF(n_features_to_select=2), + RandomForestClassifier(n_estimators=100, n_jobs=-1)) + assert np.mean(cross_val_score(clf, features_df, + labels_s, cv=3, n_jobs=-1)) > 0.7 + + +# def test_turf_pandas_inputs(): +# """Check: Data (pandas DataFrame/Series): TuRF works with pandas DataFrame and Series inputs""" +# np.random.seed(320931) +# clf = make_pipeline(TuRF(core_algorithm="ReliefF", n_features_to_select=2, pct=0.5, n_neighbors=100), +# RandomForestClassifier(n_estimators=100, n_jobs=-1)) +# assert np.mean(cross_val_score(clf, features, labels, fit_params={'turf__headers': headers}, cv=3, n_jobs=-1)) > 0.7 + + +# def test_vlsrelief_pandas_inputs(): +# """Check: Data (pandas DataFrame/Series): VLSRelief works with pandas DataFrame and Series inputs""" +# np.random.seed(49082) +# clf = make_pipeline(VLSRelief(core_algorithm="ReliefF", n_features_to_select=2, n_neighbors=100), +# RandomForestClassifier(n_estimators=100, n_jobs=-1)) +# assert np.mean(cross_val_score(clf, features, labels, fit_params={'vlsrelief__headers': headers}, cv=3, n_jobs=-1)) > 0.7