From 312566a4c010dbf00cf93bfff2bc3074255bed9e Mon Sep 17 00:00:00 2001
From: Aaron Duke
Date: Fri, 7 Aug 2020 15:56:16 -0700
Subject: [PATCH 1/2] Add support for pandas DataFrames - #53

---
 skrebate/relieff.py   |  7 ++--
 skrebate/turf.py      | 10 ++---
 skrebate/vlsrelief.py | 10 ++---
 tests.py              | 87 +++++++++++++++++++++++++++++++++++++------
 4 files changed, 89 insertions(+), 25 deletions(-)

diff --git a/skrebate/relieff.py b/skrebate/relieff.py
index f41d3c3..73bbb06 100644
--- a/skrebate/relieff.py
+++ b/skrebate/relieff.py
@@ -27,6 +27,7 @@
 import warnings
 import sys
 from sklearn.base import BaseEstimator
+from sklearn.utils import check_array, column_or_1d
 from joblib import Parallel, delayed
 from .scoring_utils import get_row_missing, ReliefF_compute_scores
@@ -91,8 +92,8 @@ def fit(self, X, y):
         -------
         Copy of the ReliefF instance
         """
-        self._X = X  # matrix of predictive variables ('independent variables')
-        self._y = y  # vector of values for outcome variable ('dependent variable')
+        self._X = check_array(X, force_all_finite=False)  # matrix of predictive variables ('independent variables')
+        self._y = column_or_1d(y)  # vector of values for outcome variable ('dependent variable')
 
         # Set up the properties for ReliefF -------------------------------------------------------------------------------------
         self._datalen = len(self._X)  # Number of training instances ('n')
@@ -220,7 +221,7 @@ def transform(self, X):
         if self._num_attributes < self.n_features_to_select:
             raise ValueError('Number of features to select is larger than the number of features in the dataset.')
 
-        return X[:, self.top_features_[:self.n_features_to_select]]
+        return check_array(X, force_all_finite=False)[:, self.top_features_[:self.n_features_to_select]]
 
     #=========================================================================#
     def fit_transform(self, X, y):
diff --git a/skrebate/turf.py b/skrebate/turf.py
index 43818af..a6b2fe4 100644
--- a/skrebate/turf.py
+++ b/skrebate/turf.py
@@ -2,8 +2,8 @@
 import time
 import warnings
 import sys
-from sklearn.base import BaseEstimator
-from sklearn.base import TransformerMixin
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils import check_array, column_or_1d
 # from sklearn.feature_selection.base import SelectorMixin
 from joblib import Parallel, delayed
 # from .scoring_utils import get_row_missing, ReliefF_compute_scores
@@ -72,8 +72,8 @@ def fit(self, X, y, headers):
         Copy of the TuRF instance
         """
 
-        self.X_mat = X
-        self._y = y
+        self.X_mat = check_array(X, force_all_finite=False)
+        self._y = column_or_1d(y)
         self.headers = headers
         self._num_attributes = len(self.X_mat[0])
         self._lost = {}
@@ -184,7 +184,7 @@ def transform(self, X):
         if self._num_attributes < self.n_features_to_select:
             raise ValueError('Number of features to select is larger than the number of features in the dataset.')
 
-        return X[:, self.top_features_[:self.n_features_to_select]]
+        return check_array(X, force_all_finite=False)[:, self.top_features_[:self.n_features_to_select]]
         #return X[:, self.top_features_]
 
     #=========================================================================#
diff --git a/skrebate/vlsrelief.py b/skrebate/vlsrelief.py
index dcd2870..6e30bf5 100644
--- a/skrebate/vlsrelief.py
+++ b/skrebate/vlsrelief.py
@@ -3,8 +3,8 @@
 import time
 import warnings
 import sys
-from sklearn.base import BaseEstimator
-from sklearn.base import TransformerMixin
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils import check_array, column_or_1d
 # from sklearn.feature_selection.base import SelectorMixin
 from joblib import Parallel, delayed
 # from .scoring_utils import get_row_missing, ReliefF_compute_scores
@@ -77,8 +77,8 @@ def fit(self, X, y, headers):
         Copy of the VLSRelief instance
         """
 
-        self.X_mat = X
-        self._y = y
+        self.X_mat = check_array(X, force_all_finite=False)
+        self._y = column_or_1d(y)
         self.headers = headers
 
         if self.core_algorithm.lower() == "multisurf":
@@ -152,7 +152,7 @@ def transform(self, X):
             Reduced feature matrix
         """
 
-        return X[:, self.top_features_[:self.n_features_to_select]]
+        return check_array(X, force_all_finite=False)[:, self.top_features_[:self.n_features_to_select]]
         # return X[:, self.top_features_]
 
diff --git a/tests.py b/tests.py
index 3ba82a4..c0a05b6 100644
--- a/tests.py
+++ b/tests.py
@@ -26,7 +26,7 @@
 from skrebate import ReliefF, SURF, SURFstar, MultiSURF, MultiSURFstar
 from sklearn.pipeline import make_pipeline
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
-from sklearn.preprocessing import Imputer
+from sklearn.impute import SimpleImputer
 from sklearn.model_selection import cross_val_score
 import pandas as pd
 import numpy as np
@@ -60,7 +60,8 @@
 genetic_data_multiclass = genetic_data_multiclass.sample(frac=0.25)
 
-features, labels = genetic_data.drop('class', axis=1).values, genetic_data['class'].values
+features_df, labels_s = genetic_data.drop('class', axis=1), genetic_data['class']
+features, labels = features_df.values, labels_s.values
 headers = list(genetic_data.drop("class", axis=1))
 
 features_cont_endpoint, labels_cont_endpoint = genetic_data_cont_endpoint.drop(
@@ -290,7 +291,7 @@ def test_relieff_pipeline_multiclass():
     np.random.seed(49082)
 
     clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_multiclass,
@@ -302,7 +303,7 @@ def test_surf_pipeline_multiclass():
     np.random.seed(240932)
 
     clf = make_pipeline(SURF(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_multiclass,
@@ -314,7 +315,7 @@ def test_surfstar_pipeline_multiclass():
     np.random.seed(9238745)
 
     clf = make_pipeline(SURFstar(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_multiclass,
@@ -326,7 +327,7 @@ def test_multisurfstar_pipeline_multiclass():
     np.random.seed(320931)
 
     clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_multiclass,
@@ -338,7 +339,7 @@ def test_multisurf_pipeline_multiclass():
     np.random.seed(320931)
 
     clf = make_pipeline(MultiSURF(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_multiclass,
@@ -466,7 +467,7 @@ def test_relieff_pipeline_missing_values():
     np.random.seed(49082)
 
     clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_missing_values,
@@ -478,7 +479,7 @@ def test_surf_pipeline_missing_values():
    np.random.seed(240932)
 
     clf = make_pipeline(SURF(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_missing_values,
@@ -490,7 +491,7 @@ def test_surfstar_pipeline_missing_values():
     np.random.seed(9238745)
 
     clf = make_pipeline(SURFstar(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_missing_values,
@@ -502,7 +503,7 @@ def test_multisurfstar_pipeline_missing_values():
     np.random.seed(320931)
 
     clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_missing_values,
@@ -514,8 +515,70 @@ def test_multisurf_pipeline_missing_values():
     np.random.seed(320931)
 
     clf = make_pipeline(MultiSURF(n_features_to_select=2),
-                        Imputer(),
+                        SimpleImputer(),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
 
     assert np.mean(cross_val_score(clf, features_missing_values,
                                    labels_missing_values, cv=3, n_jobs=-1)) > 0.7
+
+# Test Dataframe handling:
+
+
+def test_relieff_pandas_inputs():
+    """Check: Data (pandas DataFrame/Series): ReliefF works with pandas DataFrame and Series inputs"""
+    np.random.seed(49082)
+    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
+                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
+    assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)) > 0.7
+
+
+def test_surf_pandas_inputs():
+    """Check: Data (pandas DataFrame/Series): SURF works with pandas DataFrame and Series inputs"""
+    np.random.seed(240932)
+    clf = make_pipeline(SURF(n_features_to_select=2),
+                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
+    assert np.mean(cross_val_score(clf, features_df,
+                                   labels_s, cv=3, n_jobs=-1)) > 0.7
+
+
+def test_surfstar_pandas_inputs():
+    """Check: Data (pandas DataFrame/Series): SURF* works with pandas DataFrame and Series inputs"""
+    np.random.seed(9238745)
+    clf = make_pipeline(SURFstar(n_features_to_select=2),
+                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
+    assert np.mean(cross_val_score(clf, features_df,
+                                   labels_s, cv=3, n_jobs=-1)) > 0.7
+
+
+def test_multisurfstar_pandas_inputs():
+    """Check: Data (pandas DataFrame/Series): MultiSURF* works with pandas DataFrame and Series inputs"""
+    np.random.seed(320931)
+    clf = make_pipeline(MultiSURFstar(n_features_to_select=2),
+                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
+    assert np.mean(cross_val_score(clf, features_df,
+                                   labels_s, cv=3, n_jobs=-1)) > 0.7
+
+
+def test_multisurf_pandas_inputs():
+    """Check: Data (pandas DataFrame/Series): MultiSURF works with pandas DataFrame and Series inputs"""
+    np.random.seed(320931)
+    clf = make_pipeline(MultiSURF(n_features_to_select=2),
+                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
+    assert np.mean(cross_val_score(clf, features_df,
+                                   labels_s, cv=3, n_jobs=-1)) > 0.7
+
+
+# def test_turf_pandas_inputs():
+#     """Check: Data (pandas DataFrame/Series): TuRF works with pandas DataFrame and Series inputs"""
+#     np.random.seed(320931)
+#     clf = make_pipeline(TuRF(core_algorithm="ReliefF", n_features_to_select=2, pct=0.5, n_neighbors=100),
+#                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
+#     assert np.mean(cross_val_score(clf, features, labels, fit_params={'turf__headers': headers}, cv=3, n_jobs=-1)) > 0.7
+
+
+# def test_vlsrelief_pandas_inputs():
+#     """Check: Data (pandas DataFrame/Series): VLSRelief works with pandas DataFrame and Series inputs"""
+#     np.random.seed(49082)
+#     clf = make_pipeline(VLSRelief(core_algorithm="ReliefF", n_features_to_select=2, n_neighbors=100),
+#                         RandomForestClassifier(n_estimators=100, n_jobs=-1))
+#     assert np.mean(cross_val_score(clf, features, labels, fit_params={'vlsrelief__headers': headers}, cv=3, n_jobs=-1)) > 0.7

From 5261b1b36c543f5b897a516adc5ff239c1a1d9ce Mon Sep 17 00:00:00 2001
From: Aaron Duke
Date: Tue, 18 May 2021 12:17:38 -0700
Subject: [PATCH 2/2] add pandas support

---
 skrebate/relieff.py | 6 ++++--
 skrebate/vls.py     | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/skrebate/relieff.py b/skrebate/relieff.py
index 0646565..5c64b99 100644
--- a/skrebate/relieff.py
+++ b/skrebate/relieff.py
@@ -100,7 +100,8 @@ def fit(self, X, y, weights=None):
         -------
         Copy of the ReliefF instance
         """
-        self._X = check_array(X, force_all_finite=False)  # matrix of predictive variables ('independent variables')
+        X = check_array(X, force_all_finite=False)
+        self._X = X  # matrix of predictive variables ('independent variables')
         self._y = column_or_1d(y)  # vector of values for outcome variable ('dependent variable')
         if isinstance(weights, np.ndarray):
             if isinstance(weights, np.ndarray):
@@ -247,10 +248,11 @@ def transform(self, X):
         X_reduced: array-like {n_samples, n_features_to_select}
             Reduced feature matrix
         """
+        X = check_array(X, force_all_finite=False)
         if self._num_attributes < self.n_features_to_select:
             raise ValueError('Number of features to select is larger than the number of features in the dataset.')
 
-        return check_array(X, force_all_finite=False)[:, self.top_features_[:self.n_features_to_select]]
+        return X[:, self.top_features_[:self.n_features_to_select]]
 
     #=========================================================================#
     def fit_transform(self, X, y):
diff --git a/skrebate/vls.py b/skrebate/vls.py
index d3e351f..d882ff9 100644
--- a/skrebate/vls.py
+++ b/skrebate/vls.py
@@ -1,4 +1,5 @@
 from sklearn.base import BaseEstimator
+from sklearn.utils import check_array, column_or_1d
 import copy
 import random
 import numpy as np
@@ -40,6 +41,8 @@ def fit(self, X, y,weights=None):
         -------
         self
         """
+        X = check_array(X, force_all_finite=False)
+        y = column_or_1d(y)
         #random_state
         if self.random_state != None:
             np.random.seed(self.random_state)
@@ -155,6 +158,7 @@ def check_is_float(self, num):
             return False
 
     def transform(self, X):
+        X = check_array(X, force_all_finite=False)
         if X.shape[1] < self.relief_object.n_features_to_select:
             raise ValueError('Number of features to select is larger than the number of features in the dataset.')
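
Usage note (editor's addition, not part of either patch): a minimal sketch of what these changes allow, reusing the features_df / labels_s naming from tests.py. It assumes the patched skrebate is installed; the input path 'genetic_data.tsv.gz' is a hypothetical placeholder for any tab-separated dataset with a discrete 'class' column.

    import pandas as pd
    from skrebate import ReliefF

    # Hypothetical dataset path; any tab-separated file with a 'class' column works.
    genetic_data = pd.read_csv('genetic_data.tsv.gz', sep='\t', compression='gzip')
    features_df, labels_s = genetic_data.drop('class', axis=1), genetic_data['class']

    # Because fit()/transform() now pass their inputs through check_array() and
    # column_or_1d(), the DataFrame and Series can be used directly instead of
    # converting to numpy arrays with .values first.
    fs = ReliefF(n_features_to_select=2, n_neighbors=10)
    fs.fit(features_df, labels_s)
    X_top = fs.transform(features_df)  # numpy array holding the two top-ranked features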