diff --git a/skrebate/__init__.py b/skrebate/__init__.py index 42d7f51..8104249 100644 --- a/skrebate/__init__.py +++ b/skrebate/__init__.py @@ -30,4 +30,6 @@ from .surfstar import SURFstar from .multisurf import MultiSURF from .multisurfstar import MultiSURFstar -from .turf import TuRF +from .turf import TURF +from .vls import VLS +from .iter import Iter \ No newline at end of file diff --git a/skrebate/_version.py b/skrebate/_version.py index b3e1b3b..0afd419 100644 --- a/skrebate/_version.py +++ b/skrebate/_version.py @@ -24,4 +24,4 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -__version__ = '0.61' +__version__ = '0.7' diff --git a/skrebate/iter.py b/skrebate/iter.py new file mode 100644 index 0000000..2e6b2e3 --- /dev/null +++ b/skrebate/iter.py @@ -0,0 +1,138 @@ +from sklearn.base import BaseEstimator +import copy +import numpy as np + +class Iter(BaseEstimator): + + def __init__(self,relief_object,max_iter=10,convergence_threshold=0.0001,beta=0.1): + ''' + :param relief_object: Must be an object that implements the standard sklearn fit function, and after fit, has attribute feature_importances_ + that can be accessed. Scores must be a 1D np.ndarray of length # of features. The fit function must also be able to + take in an optional 1D np.ndarray 'weights' parameter of length num_features. + :param max_iter: Maximum number of iterations to run + :param convergence_threshold Difference between iteration feature weights to determine convergence + :param beta Learning Rate for Widrow Hoff Weight Update + ''' + + if not self.check_is_int(max_iter) or max_iter < 0: + raise Exception('max_iter must be a nonnegative integer') + + if not self.check_is_float(convergence_threshold) or convergence_threshold < 0: + raise Exception('convergence_threshold must be a nonnegative float') + + if not self.check_is_float(beta): + raise Exception('beta must be a float') + + self.relief_object = relief_object + self.max_iter = max_iter + self.converage_threshold = convergence_threshold + self.rank_absolute = self.relief_object.rank_absolute + self.beta = beta + + def fit(self, X, y): + """Scikit-learn required: Computes the feature importance scores from the training data. + Parameters + ---------- + X: array-like {n_samples, n_features} Training instances to compute the feature importance scores from + y: array-like {n_samples} Training labels + Returns + ------- + self + """ + + #Iterate, feeding the resulting weights of the first run into the fit of the next run (how are they translated?) 
+ last_iteration_scores = None + last_last_iteration_scores = None + for i in range(self.max_iter): + copy_relief_object = copy.deepcopy(self.relief_object) + if i == 0: + copy_relief_object.fit(X,y) + last_iteration_scores = copy_relief_object.feature_importances_ + elif i == 1: + if self.rank_absolute: + absolute_weights = np.absolute(last_iteration_scores) + transformed_weights = absolute_weights/np.max(absolute_weights) + else: + transformed_weights = self.transform_weights(last_iteration_scores) + copy_relief_object.fit(X, y, weights=transformed_weights) + if self.has_converged(last_iteration_scores,copy_relief_object.feature_importances_): + last_iteration_scores = copy_relief_object.feature_importances_ + break + last_last_iteration_scores = copy.deepcopy(transformed_weights) + last_iteration_scores = copy_relief_object.feature_importances_ + else: + if self.rank_absolute: + absolute_weights = np.absolute(last_iteration_scores) + new_weights = absolute_weights/np.max(absolute_weights) + else: + new_weights = self.transform_weights(last_iteration_scores) + + transformed_weights = self.widrow_hoff(last_last_iteration_scores,new_weights,self.beta) + copy_relief_object.fit(X,y,weights=transformed_weights) + if self.has_converged(last_iteration_scores,copy_relief_object.feature_importances_): + last_iteration_scores = copy_relief_object.feature_importances_ + break + last_last_iteration_scores = copy.deepcopy(transformed_weights) + last_iteration_scores = copy_relief_object.feature_importances_ + + #DEBUGGING + #print(last_iteration_scores) + + #Save final FI as feature_importances_ + self.feature_importances_ = last_iteration_scores + + if self.rank_absolute: + self.top_features_ = np.argsort(np.absolute(self.feature_importances_))[::-1] + else: + self.top_features_ = np.argsort(self.feature_importances_)[::-1] + + return self + + def widrow_hoff(self,originalw, neww,beta): + diff = neww-originalw + return originalw + (beta*diff) + + def has_converged(self,weight1,weight2): + for i in range(len(weight1)): + if abs(weight1[i] - weight2[i]) >= self.converage_threshold: + return False + return True + + def transform_weights(self,weights): + max_val = np.max(weights) + for i in range(len(weights)): + if weights[i] < 0: + weights[i] = 0 + else: + if max_val == 0: + weights[i] = 0 + else: + weights[i] = weights[i]/max_val + return weights + + def check_is_int(self, num): + try: + n = float(num) + if num - int(num) == 0: + return True + else: + return False + except: + return False + + def check_is_float(self, num): + try: + n = float(num) + return True + except: + return False + + def transform(self, X): + if X.shape[1] < self.relief_object.n_features_to_select: + raise ValueError('Number of features to select is larger than the number of features in the dataset.') + + return X[:, self.top_features_[:self.relief_object.n_features_to_select]] + + def fit_transform(self, X, y): + self.fit(X, y) + return self.transform(X) \ No newline at end of file diff --git a/skrebate/multisurf.py b/skrebate/multisurf.py index f497e7a..5b84ad0 100644 --- a/skrebate/multisurf.py +++ b/skrebate/multisurf.py @@ -71,9 +71,15 @@ def _run_algorithm(self): NNlist = [self._find_neighbors(datalen) for datalen in range(self._datalen)] - scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( - MultiSURF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, - NN_near, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) - for instance_num, NN_near in 
zip(range(self._datalen), NNlist)), axis=0) + if isinstance(self._weights, np.ndarray) and self.weight_final_scores: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + MultiSURF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN_near, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type, self._weights) + for instance_num, NN_near in zip(range(self._datalen), NNlist)), axis=0) + else: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + MultiSURF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN_near, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) + for instance_num, NN_near in zip(range(self._datalen), NNlist)), axis=0) return np.array(scores) diff --git a/skrebate/multisurfstar.py b/skrebate/multisurfstar.py index 84597ee..e47d3c6 100644 --- a/skrebate/multisurfstar.py +++ b/skrebate/multisurfstar.py @@ -76,9 +76,19 @@ def _run_algorithm(self): NN_near_list = [i[0] for i in NNlist] NN_far_list = [i[1] for i in NNlist] - scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( - MultiSURFstar_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, - NN_near, NN_far, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) - for instance_num, NN_near, NN_far in zip(range(self._datalen), NN_near_list, NN_far_list)), axis=0) + if isinstance(self._weights, np.ndarray) and self.weight_final_scores: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + MultiSURFstar_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN_near, NN_far, self._headers, self._class_type, self._X, self._y, + self._labels_std, self.data_type, self._weights) + for instance_num, NN_near, NN_far in + zip(range(self._datalen), NN_near_list, NN_far_list)), axis=0) + else: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + MultiSURFstar_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN_near, NN_far, self._headers, self._class_type, self._X, self._y, + self._labels_std, self.data_type) + for instance_num, NN_near, NN_far in + zip(range(self._datalen), NN_near_list, NN_far_list)), axis=0) return np.array(scores) diff --git a/skrebate/relieff.py b/skrebate/relieff.py index 156121b..b190056 100644 --- a/skrebate/relieff.py +++ b/skrebate/relieff.py @@ -28,7 +28,7 @@ import sys from sklearn.base import BaseEstimator from joblib import Parallel, delayed -from .scoring_utils import get_row_missing, ReliefF_compute_scores +from .scoring_utils import get_row_missing, ReliefF_compute_scores, get_row_missing_iter class ReliefF(BaseEstimator): @@ -43,7 +43,7 @@ class ReliefF(BaseEstimator): * For ReliefF, the setting of k is <= to the number of instances that have the least frequent class label (binary and multiclass endpoint data. """ - def __init__(self, n_features_to_select=10, n_neighbors=100, discrete_threshold=10, verbose=False, n_jobs=1): + def __init__(self, n_features_to_select=10, n_neighbors=100, discrete_threshold=10, verbose=False, n_jobs=1,weight_final_scores=False,rank_absolute=False): """Sets up ReliefF to perform feature selection. Note that an approximation of the original 'Relief' algorithm may be run by setting 'n_features_to_select' to 1. Also note that the original Relief parameter 'm' is not included in this software. 
'm' specifies the number of random training instances out of 'n' (total @@ -71,15 +71,21 @@ def __init__(self, n_features_to_select=10, n_neighbors=100, discrete_threshold= The number of cores to dedicate to computing the scores with joblib. Assigning this parameter to -1 will dedicate as many cores as are available on your system. We recommend setting this parameter to -1 to speed up the algorithm as much as possible. + weight_final_scores: bool (default: False) + Whether to multiply given weights (in fit) to final scores. Only applicable if weights are given. + rank_absolute: bool (default: False) + Whether to give top features as by ranking features by absolute value. """ self.n_features_to_select = n_features_to_select self.n_neighbors = n_neighbors self.discrete_threshold = discrete_threshold self.verbose = verbose self.n_jobs = n_jobs + self.weight_final_scores = weight_final_scores + self.rank_absolute = rank_absolute #=========================================================================# - def fit(self, X, y): + def fit(self, X, y, weights=None): """Scikit-learn required: Computes the feature importance scores from the training data. Parameters ---------- @@ -87,12 +93,26 @@ def fit(self, X, y): Training instances to compute the feature importance scores from y: array-like {n_samples} Training labels + weights: array-like {n_features} (default: None) Optional feature weights supplied by the iterative wrappers (Iter, VLS) + Returns ------- Copy of the ReliefF instance """ self._X = X # matrix of predictive variables ('independent variables') self._y = y # vector of values for outcome variable ('dependent variable') + if weights is not None: + if isinstance(weights, np.ndarray): + if len(weights) != len(X[0]): + raise Exception('Dimension of weights param must match number of features') + elif isinstance(weights, list): + if len(weights) != len(X[0]): + raise Exception('Dimension of weights param must match number of features') + weights = np.array(weights) + else: + raise Exception('weights param must be numpy array or list') + + self._weights = weights # Set up the properties for ReliefF ------------------------------------------------------------------------------------- self._datalen = len(self._X) # Number of training instances ('n') @@ -177,9 +197,15 @@ def fit(self, X, y): """ For efficiency, the distance array is computed more efficiently for data with no missing values. This distance array will only be used to identify nearest neighbors.
""" if self._missing_data_count > 0: - self._distance_array = self._distarray_missing(xc, xd, cdiffs) + if not isinstance(self._weights, np.ndarray): + self._distance_array = self._distarray_missing(xc, xd, cdiffs) + else: + self._distance_array = self._distarray_missing_iter(xc, xd, cdiffs, self._weights) else: - self._distance_array = self._distarray_no_missing(xc, xd) + if not isinstance(self._weights, np.ndarray): + self._distance_array = self._distarray_no_missing(xc, xd) + else: + self._distance_array = self._distarray_no_missing_iter(xc, xd, self._weights) if self.verbose: elapsed = time.time() - start @@ -201,7 +227,10 @@ def fit(self, X, y): print('Completed scoring in {} seconds.'.format(elapsed)) # Compute indices of top features - self.top_features_ = np.argsort(self.feature_importances_)[::-1] + if self.rank_absolute: + self.top_features_ = np.argsort(np.absolute(self.feature_importances_))[::-1] + else: + self.top_features_ = np.argsort(self.feature_importances_)[::-1] return self @@ -339,7 +368,6 @@ def _dtype_array(self): return attrdiff, cidx, didx #==================================================================# - def _distarray_missing(self, xc, xd, cdiffs): """Distance array calculation for data with missing values""" cindices = [] @@ -359,6 +387,64 @@ def _distarray_missing(self, xc, xd, cdiffs): return np.array(dist_array) #==================================================================# + # For Iter Relief + def _distarray_no_missing_iter(self, xc, xd, weights): + """Distance array calculation for data with no missing values. The 'pdist() function outputs a condense distance array, and squareform() converts this vector-form + distance vector to a square-form, redundant distance matrix. + *This could be a target for saving memory in the future, by not needing to expand to the redundant square-form matrix. """ + from scipy.spatial.distance import pdist, squareform + + # ------------------------------------------# + def pre_normalize(x): + """Normalizes continuous features so they are in the same range (0 to 1)""" + idx = 0 + # goes through all named features (doesn really need to) this method is only applied to continuous features + for i in sorted(self.attr.keys()): + if self.attr[i][0] == 'discrete': + continue + cmin = self.attr[i][2] + diff = self.attr[i][3] + x[:, idx] -= cmin + x[:, idx] /= diff + idx += 1 + return x + + # ------------------------------------------# + + if self.data_type == 'discrete': # discrete features only + return squareform(pdist(self._X, metric='hamming', w=weights)) + elif self.data_type == 'mixed': # mix of discrete and continuous features + d_dist = squareform(pdist(xd, metric='hamming', w=weights)) + # Cityblock is also known as Manhattan distance + c_dist = squareform(pdist(pre_normalize(xc), metric='cityblock', w=weights)) + return np.add(d_dist, c_dist) / self._num_attributes + + else: # continuous features only + # xc = pre_normalize(xc) + return squareform(pdist(pre_normalize(xc), metric='cityblock', w=weights)) + + # ==================================================================# + + # For Iterrelief - get_row_missing_iter is called + def _distarray_missing_iter(self, xc, xd, cdiffs, weights): + """Distance array calculation for data with missing values""" + cindices = [] + dindices = [] + # Get Boolean mask locating missing values for continuous and discrete features separately. These correspond to xc and xd respectively. 
+ for i in range(self._datalen): + cindices.append(np.where(np.isnan(xc[i]))[0]) + dindices.append(np.where(np.isnan(xd[i]))[0]) + + if self.n_jobs != 1: + dist_array = Parallel(n_jobs=self.n_jobs)(delayed(get_row_missing_iter)( + xc, xd, cdiffs, index, cindices, dindices, weights) for index in range(self._datalen)) + else: + # For each instance calculate distance from all other instances (in non-redundant manner) (i.e. computes triangle, and puts zeros in for rest to form square). + dist_array = [get_row_missing_iter(xc, xd, cdiffs, index, cindices, dindices, weights) + for index in range(self._datalen)] + + return np.array(dist_array) + # ==================================================================# ############################# ReliefF ############################################ @@ -449,9 +535,17 @@ def _run_algorithm(self): nan_entries = np.isnan(self._X) # boolean mask for missing data values # Call the scoring method for the ReliefF algorithm - scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( - ReliefF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, - NN, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) - for instance_num, NN in zip(range(self._datalen), NNlist)), axis=0) + if isinstance(self._weights, np.ndarray) and self.weight_final_scores: + # Call the scoring method for the ReliefF algorithm for IRelief + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + ReliefF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type, self._weights) + for instance_num, NN in zip(range(self._datalen), NNlist)), axis=0) + else: + # Call the scoring method for the ReliefF algorithm + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + ReliefF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) + for instance_num, NN in zip(range(self._datalen), NNlist)), axis=0) return np.array(scores) diff --git a/skrebate/scoring_utils.py b/skrebate/scoring_utils.py index 3204dbf..fd98949 100644 --- a/skrebate/scoring_utils.py +++ b/skrebate/scoring_utils.py @@ -79,6 +79,66 @@ def get_row_missing(xc, xd, cdiffs, index, cindices, dindices): return row +# For iter relief +def get_row_missing_iter(xc, xd, cdiffs, index, cindices, dindices, weights): + """ Calculate distance between index instance and all other instances. """ + row = np.empty(0, dtype=np.double) # initialize empty row + cinst1 = xc[index] # continuous-valued features for index instance + dinst1 = xd[index] # discrete-valued features for index instance + # Boolean mask locating missing values for continuous features for index instance + can = cindices[index] + # Boolean mask locating missing values for discrete features for index instance + dan = dindices[index] + tf = len(cinst1) + len(dinst1) # total number of features. + # Progressively compare current instance to all others. Excludes comparison with self indexed instance. (Building the distance matrix triangle). 
+ for j in range(index): + dist = 0 + dinst2 = xd[j] # discrete-valued features for compared instance + cinst2 = xc[j] # continuous-valued features for compared instance + + # Manage missing values in discrete features + # Boolean mask locating missing values for discrete features for compared instance + dbn = dindices[j] + # indexes where there is at least one missing value in the feature between an instance pair. + idx = np.unique(np.append(dan, dbn)) + # Number of features excluded from distance calculation due to one or two missing values within instance pair. Used to normalize distance values for comparison. + dmc = len(idx) + d1 = np.delete(dinst1, idx) # delete unique missing features from index instance + d2 = np.delete(dinst2, idx) # delete unique missing features from compared instance + + wd = np.delete(weights, idx) # delete weights corresponding to missing discrete features + # Manage missing values in continuous features + # Boolean mask locating missing values for continuous features for compared instance + cbn = cindices[j] + # indexes where there is at least one missing value in the feature between an instance pair. + idx = np.unique(np.append(can, cbn)) + # Number of features excluded from distance calculation due to one or two missing values within instance pair. Used to normalize distance values for comparison. + cmc = len(idx) + c1 = np.delete(cinst1, idx) # delete unique missing features from index instance + c2 = np.delete(cinst2, idx) # delete unique missing features from compared instance + # delete unique missing features from continuous value difference scores + cdf = np.delete(cdiffs, idx) + wc = np.delete(weights, idx) # delete weights corresponding to missing continuous features + + # Add discrete feature distance contributions (missing values excluded) - Hamming distance + if len(d1)!=0: #To ensure there is atleast one discrete variable + hamming_dist = np.not_equal(d1, d2).astype(float) + weight_hamming_dist = np.dot(hamming_dist, wd)/np.sum(wd) + dist += weight_hamming_dist + + # Add continuous feature distance contributions (missing values excluded) - Manhattan distance (Note that 0-1 continuous value normalization is included ~ subtraction of minimums cancel out) + if len(c1)!=0: #To ensure there is atleast one continuous variable + dist += np.dot((np.absolute(np.subtract(c1, c2)) / cdf), wc)/np.sum(wc) + + # Normalize distance calculation based on total number of missing values bypassed in either discrete or continuous features. + tnmc = tf - dmc - cmc # Total number of unique missing counted + # Distance normalized by number of features included in distance sum (this seeks to handle missing values neutrally in distance calculation) + dist = dist/float(tnmc) + + row = np.append(row, dist) + + return row + def ramp_function(data_type, attr, fname, xinstfeature, xNNifeature): """ Our own user simplified variation of the ramp function suggested by Hong 1994, 1997. Hong's method requires the user to specifiy two thresholds @@ -346,64 +406,87 @@ def compute_score(attr, mcmap, NN, feature, inst, nan_entries, headers, class_ty return diff -def ReliefF_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN, headers, class_type, X, y, labels_std, data_type): +def ReliefF_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN, headers, class_type, X, y, labels_std, data_type, weights=None): """ Unique scoring procedure for ReliefF algorithm. Scoring based on k nearest hits and misses of current target instance. 
""" scores = np.zeros(num_attributes) - for feature_num in range(num_attributes): - scores[feature_num] += compute_score(attr, mcmap, NN, feature_num, inst, - nan_entries, headers, class_type, X, y, labels_std, data_type) + if isinstance(weights, np.ndarray): + for feature_num in range(num_attributes): + scores[feature_num] += weights[feature_num] * compute_score(attr, mcmap, NN, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + else: + for feature_num in range(num_attributes): + scores[feature_num] += compute_score(attr, mcmap, NN, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) return scores - -def SURF_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN, headers, class_type, X, y, labels_std, data_type): +def SURF_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN, headers, class_type, X, y, labels_std, data_type, weights=None): """ Unique scoring procedure for SURF algorithm. Scoring based on nearest neighbors within defined radius of current target instance. """ scores = np.zeros(num_attributes) - if len(NN) <= 0: - return scores - for feature_num in range(num_attributes): - scores[feature_num] += compute_score(attr, mcmap, NN, feature_num, inst, - nan_entries, headers, class_type, X, y, labels_std, data_type) + if isinstance(weights, np.ndarray): + if len(NN) <= 0: + return scores + for feature_num in range(num_attributes): + scores[feature_num] += weights[feature_num] * compute_score(attr, mcmap, NN, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + else: + if len(NN) <= 0: + return scores + for feature_num in range(num_attributes): + scores[feature_num] += compute_score(attr, mcmap, NN, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) return scores -def SURFstar_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN_near, NN_far, headers, class_type, X, y, labels_std, data_type): +def SURFstar_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN_near, NN_far, headers, class_type, X, y, labels_std, data_type, weights=None): """ Unique scoring procedure for SURFstar algorithm. Scoring based on nearest neighbors within defined radius, as well as 'anti-scoring' of far instances outside of radius of current target instance""" scores = np.zeros(num_attributes) - for feature_num in range(num_attributes): - if len(NN_near) > 0: - scores[feature_num] += compute_score(attr, mcmap, NN_near, feature_num, inst, - nan_entries, headers, class_type, X, y, labels_std, data_type) - # Note that we are using the near scoring loop in 'compute_score' and then just subtracting it here, in line with original SURF* paper. - if len(NN_far) > 0: - scores[feature_num] -= compute_score(attr, mcmap, NN_far, feature_num, inst, - nan_entries, headers, class_type, X, y, labels_std, data_type) + if isinstance(weights, np.ndarray): + for feature_num in range(num_attributes): + if len(NN_near) > 0: + scores[feature_num] += weights[feature_num] * compute_score(attr, mcmap, NN_near, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + # Note that we are using the near scoring loop in 'compute_score' and then just subtracting it here, in line with original SURF* paper. 
+ if len(NN_far) > 0: + scores[feature_num] -= weights[feature_num] * compute_score(attr, mcmap, NN_far, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + else: + for feature_num in range(num_attributes): + if len(NN_near) > 0: + scores[feature_num] += compute_score(attr, mcmap, NN_near, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + # Note that we are using the near scoring loop in 'compute_score' and then just subtracting it here, in line with original SURF* paper. + if len(NN_far) > 0: + scores[feature_num] -= compute_score(attr, mcmap, NN_far, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) return scores -def MultiSURF_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN_near, headers, class_type, X, y, labels_std, data_type): +def MultiSURF_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN_near, headers, class_type, X, y, labels_std, data_type, weights=None): """ Unique scoring procedure for MultiSURF algorithm. Scoring based on 'extreme' nearest neighbors within defined radius of current target instance. """ scores = np.zeros(num_attributes) - for feature_num in range(num_attributes): - if len(NN_near) > 0: - scores[feature_num] += compute_score(attr, mcmap, NN_near, feature_num, inst, - nan_entries, headers, class_type, X, y, labels_std, data_type) + if isinstance(weights, np.ndarray): + for feature_num in range(num_attributes): + if len(NN_near) > 0: + scores[feature_num] += weights[feature_num] * compute_score(attr, mcmap, NN_near, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + else: + for feature_num in range(num_attributes): + if len(NN_near) > 0: + scores[feature_num] += compute_score(attr, mcmap, NN_near, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) return scores -def MultiSURFstar_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN_near, NN_far, headers, class_type, X, y, labels_std, data_type): +def MultiSURFstar_compute_scores(inst, attr, nan_entries, num_attributes, mcmap, NN_near, NN_far, headers, class_type, X, y, labels_std, data_type, weights=None): """ Unique scoring procedure for MultiSURFstar algorithm. Scoring based on 'extreme' nearest neighbors within defined radius, as well as 'anti-scoring' of extreme far instances defined by outer radius of current target instance. """ scores = np.zeros(num_attributes) - for feature_num in range(num_attributes): - if len(NN_near) > 0: - scores[feature_num] += compute_score(attr, mcmap, NN_near, feature_num, inst, - nan_entries, headers, class_type, X, y, labels_std, data_type) - # Note that we add this term because we used the far scoring above by setting 'near' to False. This is in line with original MultiSURF* paper. - if len(NN_far) > 0: - scores[feature_num] += compute_score(attr, mcmap, NN_far, feature_num, inst, - nan_entries, headers, class_type, X, y, labels_std, data_type, near=False) + if isinstance(weights, np.ndarray): + for feature_num in range(num_attributes): + if len(NN_near) > 0: + scores[feature_num] += weights[feature_num] * compute_score(attr, mcmap, NN_near, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + # Note that we add this term because we used the far scoring above by setting 'near' to False. This is in line with original MultiSURF* paper. 
+ if len(NN_far) > 0: + scores[feature_num] += weights[feature_num] * compute_score(attr, mcmap, NN_far, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type, near=False) + else: + for feature_num in range(num_attributes): + if len(NN_near) > 0: + scores[feature_num] += compute_score(attr, mcmap, NN_near, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type) + # Note that we add this term because we used the far scoring above by setting 'near' to False. This is in line with original MultiSURF* paper. + if len(NN_far) > 0: + scores[feature_num] += compute_score(attr, mcmap, NN_far, feature_num, inst, nan_entries, headers, class_type, X, y, labels_std, data_type, near=False) return scores diff --git a/skrebate/surf.py b/skrebate/surf.py index 2e51358..e40c6e0 100644 --- a/skrebate/surf.py +++ b/skrebate/surf.py @@ -36,7 +36,7 @@ class SURF(ReliefF): for the Genetic Analysis of Complex Human Diseases. """ - def __init__(self, n_features_to_select=10, discrete_threshold=10, verbose=False, n_jobs=1): + def __init__(self, n_features_to_select=10, discrete_threshold=10, verbose=False, n_jobs=1,weight_final_scores=False,rank_absolute=False): """Sets up ReliefF to perform feature selection. Parameters ---------- @@ -53,11 +53,17 @@ def __init__(self, n_features_to_select=10, discrete_threshold=10, verbose=False The number of cores to dedicate to computing the scores with joblib. Assigning this parameter to -1 will dedicate as many cores as are available on your system. We recommend setting this parameter to -1 to speed up the algorithm as much as possible. + weight_final_scores: bool (default: False) + Whether to multiply given weights (in fit) to final scores. Only applicable if weights are given. + rank_absolute: bool (default: False) + Whether to give top features as by ranking features by absolute value. 
""" self.n_features_to_select = n_features_to_select self.discrete_threshold = discrete_threshold self.verbose = verbose self.n_jobs = n_jobs + self.weight_final_scores = weight_final_scores + self.rank_absolute = rank_absolute ############################# SURF ############################################ def _find_neighbors(self, inst, avg_dist): @@ -89,9 +95,15 @@ def _run_algorithm(self): nan_entries = np.isnan(self._X) NNlist = [self._find_neighbors(datalen, avg_dist) for datalen in range(self._datalen)] - scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( - SURF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, - NN, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) - for instance_num, NN in zip(range(self._datalen), NNlist)), axis=0) + if isinstance(self._weights, np.ndarray) and self.weight_final_scores: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + SURF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN, self._headers, self._class_type, self._X, self._y, self._labels_std,self.data_type, self._weights) + for instance_num, NN in zip(range(self._datalen), NNlist)),axis=0) + else: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + SURF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN, self._headers, self._class_type, self._X, self._y, self._labels_std,self.data_type) + for instance_num, NN in zip(range(self._datalen), NNlist)),axis=0) return np.array(scores) diff --git a/skrebate/surfstar.py b/skrebate/surfstar.py index 5740ba4..e44886b 100644 --- a/skrebate/surfstar.py +++ b/skrebate/surfstar.py @@ -77,9 +77,16 @@ def _run_algorithm(self): NN_near_list = [i[0] for i in NNlist] NN_far_list = [i[1] for i in NNlist] - scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( - SURFstar_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, - NN_near, NN_far, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) - for instance_num, NN_near, NN_far in zip(range(self._datalen), NN_near_list, NN_far_list)), axis=0) + if isinstance(self._weights, np.ndarray) and self.weight_final_scores: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + SURFstar_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN_near, NN_far, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type, self._weights) + for instance_num, NN_near, NN_far in zip(range(self._datalen), NN_near_list, NN_far_list)), axis=0) + + else: + scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed( + SURFstar_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap, + NN_near, NN_far, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type) + for instance_num, NN_near, NN_far in zip(range(self._datalen), NN_near_list, NN_far_list)), axis=0) return np.array(scores) diff --git a/skrebate/turf.py b/skrebate/turf.py index 43818af..5201663 100644 --- a/skrebate/turf.py +++ b/skrebate/turf.py @@ -1,207 +1,151 @@ -import numpy as np -import time -import warnings -import sys from sklearn.base import BaseEstimator -from sklearn.base import TransformerMixin -# from sklearn.feature_selection.base import SelectorMixin -from joblib import Parallel, delayed -# from .scoring_utils import get_row_missing, ReliefF_compute_scores -from .multisurf import MultiSURF -from .multisurfstar import 
MultiSURFstar -from .surf import SURF -from .surfstar import SURFstar -from .relieff import ReliefF - - -class TuRF(BaseEstimator, TransformerMixin): - - """Feature selection using data-mined expert knowledge. - Based on the ReliefF algorithm as introduced in: - Kononenko, Igor et al. Overcoming the myopia of inductive learning - algorithms with RELIEFF (1997), Applied Intelligence, 7(1), p39-55 - """ - - def __init__(self, core_algorithm, n_features_to_select=10, n_neighbors=100, pct=0.5, discrete_threshold=10, verbose=False, n_jobs=1): - """Sets up TuRF to perform feature selection. - Parameters - ---------- - core_algorithm: Core Relief Algorithm to perform TuRF iterations on - n_features_to_select: int (default: 10) - the number of top features (according to the relieff score) to - retain after feature selection is applied. - pct: float/int (default: 0.5) - If of type float, describes the fraction of features to be removed in each iteration. - If of type int, describes the number of features to be removed in each iteration. - discrete_threshold: int (default: 10) - Value used to determine if a feature is discrete or continuous. - If the number of unique levels in a feature is > discrete_threshold, then it is - considered continuous, or discrete otherwise. - verbose: bool (default: False) - If True, output timing of distance array and scoring - n_jobs: int (default: 1) - The number of cores to dedicate to computing the scores with joblib. - Assigning this parameter to -1 will dedicate as many cores as are available on your system. - We recommend setting this parameter to -1 to speed up the algorithm as much as possible. - """ - self.core_algorithm = core_algorithm - self.n_features_to_select = n_features_to_select - self.n_neighbors = n_neighbors - self.pct = pct - self.discrete_threshold = discrete_threshold - self.verbose = verbose - self.n_jobs = n_jobs - - #=========================================================================# - # headers = list(genetic_data.drop("class",axis=1)) - def fit(self, X, y, headers): - """ - Uses the input `core_algorithm` to determine feature importance scores at each iteration. - At every iteration, a certain number(determined by input parameter `pct`) of least important - features are removed, until the feature set is reduced down to the top `n_features_to_select` features. 
- Parameters - ---------- - X: array-like {n_samples, n_features} - Training instances to compute the feature importance scores from - y: array-like {n_samples} - Training labels - headers: array-like {n_features} - Feature names - Returns - ------- - Copy of the TuRF instance - """ +import copy +import numpy as np - self.X_mat = X - self._y = y - self.headers = headers - self._num_attributes = len(self.X_mat[0]) - self._lost = {} - - #Combine TuRF with specified 'core' Relief-based algorithm - if self.core_algorithm.lower() == "multisurf": - core = MultiSURF(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) +class TURF(BaseEstimator): - elif self.core_algorithm.lower() == "multisurfstar": - core = MultiSURFstar(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) + def __init__(self,relief_object,pct=0.5,num_scores_to_return=100): + ''' + :param relief_object: Must be an object that implements the standard sklearn fit function, and after fit, has attributes feature_importances_ + and top_features_ that can be accessed. Scores must be a 1D np.ndarray of length # of features. + :param pct: % of features to remove from removing features each iteration (if float). Or # of features to remove each iteration (if int) + :param num_scores_to_return: Number of nonzero scores to return after training. Default = min(num_features, 100) + ''' + if not self.check_is_int(num_scores_to_return) or num_scores_to_return < 0: + raise Exception('num_scores_to_return must be a nonnegative integer') - elif self.core_algorithm.lower() == "surf": - core = SURF(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) + if (not self.check_is_int(pct) and not self.check_is_float(pct)) or pct < 0: + raise Exception('pct must be a nonnegative integer/float') - elif self.core_algorithm.lower() == "surfstar": - core = SURFstar(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) + if (not self.check_is_int(pct) and self.check_is_float(pct)) and (pct < 0 or pct > 1): + raise Exception('if pct is a float, it must be from [0,1]') - elif self.core_algorithm.lower() == "relieff": - core = ReliefF(n_features_to_select=self.n_features_to_select, n_neighbors=self.n_neighbors, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) + self.relief_object = relief_object + self.pct = pct + self.num_scores_to_return = num_scores_to_return + self.rank_absolute = self.relief_object.rank_absolute + def fit(self, X, y): + """Scikit-learn required: Computes the feature importance scores from the training data. + Parameters + ---------- + X: array-like {n_samples, n_features} Training instances to compute the feature importance scores from + y: array-like {n_samples} Training labels + Returns + ------- + self + """ + #Adjust num_scores_to_return num_features = X.shape[1] + self.num_scores_to_return = min(self.num_scores_to_return,num_features) + + if self.num_scores_to_return != num_features and self.pct == 1: + raise Exception('num_scores_to_return != num_features and pct == 1. 
TURF will never reach your intended destination.') + + #Find out out how many features to use in each iteration + features_per_iteration = self.get_features_per_iteration(num_features,self.pct,self.num_scores_to_return) + + #Iterate runs + binary_scores_existence_tracker = np.ones(num_features) #1 means score still left + + copy_relief_object = copy.deepcopy(self.relief_object) + copy_relief_object.fit(X, y) + features_per_iteration.pop(0) + for num_features_to_use_in_iteration in features_per_iteration: + #Find top raw features indices + best_raw_indices = copy_relief_object.top_features_[:num_features_to_use_in_iteration] + + #Map raw features indices to original feature indices array + onesCounter = 0 + copy_tracker = copy.deepcopy(binary_scores_existence_tracker) + for i in range(len(binary_scores_existence_tracker)): + if not (onesCounter in best_raw_indices): + binary_scores_existence_tracker[i] = 0 + if copy_tracker[i] == 1: + onesCounter+=1 + + #Get new X + new_indices = [] + for i in range(len(binary_scores_existence_tracker)): + if binary_scores_existence_tracker[i] == 1: + new_indices.append(i) + + ###DEBUGGING + # print(num_features_to_use_in_iteration) + # print(best_raw_indices) + # print(binary_scores_existence_tracker) + # print(new_indices) + # print() + + new_X = X[:,new_indices] + + #fit + copy_relief_object = copy.deepcopy(self.relief_object) + copy_relief_object.fit(new_X, y) + + #Return remaining scores in their original indices, having zeros for the rest + raw_scores = copy_relief_object.feature_importances_ + counter = 0 + for i in range(len(binary_scores_existence_tracker)): + if binary_scores_existence_tracker[i] == 1: + binary_scores_existence_tracker[i] = raw_scores[counter] + counter += 1 + + # Save FI as feature_importances_ + self.feature_importances_ = binary_scores_existence_tracker + + if self.rank_absolute: + self.top_features_ = np.argsort(np.absolute(self.feature_importances_))[::-1] + else: + self.top_features_ = np.argsort(self.feature_importances_)[::-1] - iter_count = 0 - features_iter = [] - headers_iter = [] - feature_retain_check = 0 - - #Determine maximum number of iterations. - iterMax = int(1/float(self.pct)) - - #Main iterative loop of TuRF - while(iter_count < iterMax): - #Run Core Relief-based algorithm - core_fit = core.fit(self.X_mat, self._y) - features_iter.append(core_fit.feature_importances_) #HISTORY - headers_iter.append(self.headers) #HISTORY - - #Calculate features to keep - perc_retain = 1 - self.pct - feature_retain = int(np.round(num_features*perc_retain)) - - # Edge case (ensures that each iteration, at least one feature is removed) - if feature_retain == feature_retain_check: - feature_retain -= 1 - - num_features = feature_retain - feature_retain_check = feature_retain - #Identify the index location of the top 'num_feature' scoring features (for this particular iteration) - select = np.array(features_iter[iter_count].argsort()[-num_features:]) - #Make index list of features not removed - non_select = np.array(features_iter[iter_count].argsort()[:num_features]) - #Make a dictionary that stores dropped features and the iteration they were dropped. - for i in non_select: - self._lost[self.headers[i]] = iterMax - iter_count #For feature name, store iteration rank it was removed (bigger rank for sooner removal) - - #Drop non-selected features and headers. - self.X_mat = self.X_mat[:, select] #select all instances and only features indexed from select. 
- self.headers = [self.headers[i] for i in select] - - iter_count += 1 - - #Final scoring iteration - core_fit = core.fit(self.X_mat, self._y) - features_iter.append(core_fit.feature_importances_) #HISTORY - headers_iter.append(self.headers) #HISTORY - iter_count += 1 - - self.num_iter = iter_count - self.feature_history = list(zip(headers_iter, features_iter)) #HISTORY - - #Prepare for assigning token scores to features that had been removed in a previous TuRF iteration. These scores are only meaningful in that they give an idea of when these feature(s) were removed. - low_score = min(core_fit.feature_importances_) - reduction = 0.01 * (max(core_fit.feature_importances_) - low_score) - - #For consistency we report feature importances ordered in same way as original dataset. Same is true for headers. - #Step through each feature name - self.feature_importances_= [] - - for i in headers_iter[0]: - #Check lost dictionary - if i in self._lost: - self.feature_importances_.append(low_score - reduction * self._lost[i]) #append discounted score as a marker of when the feature was removed. - else: #Feature made final cut - score_index = self.headers.index(i) - self.feature_importances_.append(core_fit.feature_importances_[score_index]) - - #Turn feature imporance list into array - self.feature_importances_= np.array(self.feature_importances_) - #self.feature_importances_ = core_fit.feature_importances_ - - self.top_features_ = [headers.index(i) for i in self.headers] - self.top_features_ = self.top_features_[::-1] return self - #=========================================================================# + def get_features_per_iteration(self,num_features,pct,num_scores_to_return): + features_per_iteration = [num_features] + features_left = num_features + if num_features != num_scores_to_return: + if self.check_is_int(pct): # Is int + while True: + if features_left - pct > num_scores_to_return: + features_left -= pct + features_per_iteration.append(features_left) + else: + features_per_iteration.append(num_scores_to_return) + break + else: # Is float + while True: + if int(features_left * pct) > num_scores_to_return: + features_left = int(features_left * pct) + features_per_iteration.append(features_left) + else: + features_per_iteration.append(num_scores_to_return) + break + return features_per_iteration + + def check_is_int(self, num): + try: + n = float(num) + if num - int(num) == 0: + return True + else: + return False + except: + return False + + def check_is_float(self, num): + try: + n = float(num) + return True + except: + return False def transform(self, X): - """Reduces the feature set down to the top `n_features_to_select` features. 
- Parameters - ---------- - X: array-like {n_samples, n_features} - Feature matrix to perform feature selection on - Returns - ------- - X_reduced: array-like {n_samples, n_features_to_select} - Reduced feature matrix - """ - if self._num_attributes < self.n_features_to_select: + if X.shape[1] < self.relief_object.n_features_to_select: raise ValueError('Number of features to select is larger than the number of features in the dataset.') - - return X[:, self.top_features_[:self.n_features_to_select]] - #return X[:, self.top_features_] - #=========================================================================# + return X[:, self.top_features_[:self.relief_object.n_features_to_select]] - def fit_transform(self, X, y, headers): - # def fit_transform(self, X, y): - """Computes the feature importance scores from the training data, then reduces the feature set down to the top `n_features_to_select` features. - Parameters - ---------- - X: array-like {n_samples, n_features} - Training instances to compute the feature importance scores from - y: array-like {n_samples} - Training labels - Returns - ------- - X_reduced: array-like {n_samples, n_features_to_select} - Reduced feature matrix - """ - self.fit(X, y, headers) - return self.transform(X) + def fit_transform(self, X, y): + self.fit(X, y) + return self.transform(X) \ No newline at end of file diff --git a/skrebate/vls.py b/skrebate/vls.py new file mode 100644 index 0000000..d3e351f --- /dev/null +++ b/skrebate/vls.py @@ -0,0 +1,165 @@ +from sklearn.base import BaseEstimator +import copy +import random +import numpy as np + +class VLS(BaseEstimator): + + def __init__(self,relief_object,num_feature_subset=40,size_feature_subset=5,random_state = None): + ''' + :param relief_object: Must be an object that implements the standard sklearn fit function, and after fit, has attribute feature_importances_ + that can be accessed. Scores must be a 1D np.ndarray of length # of features. The fit function must also be able to + take in an optional 1D np.ndarray 'weights' parameter of length num_features. + :param num_feature_subset: Number of feature subsets generated at random + :param size_feature_subset: Number of features in each subset. Cannot exceed number of features. + :param random_state: random seed + ''' + + if not self.check_is_int(num_feature_subset) or num_feature_subset <= 0: + raise Exception('num_feature_subset must be a positive integer') + + if not self.check_is_int(size_feature_subset) or size_feature_subset <= 0: + raise Exception('size_feature_subset must be a positive integer') + + if random_state != None and not self.check_is_int(random_state): + raise Exception('random_state must be None or integer') + + self.relief_object = relief_object + self.num_feature_subset = num_feature_subset + self.size_feature_subset = size_feature_subset + self.random_state = random_state + self.rank_absolute = self.relief_object.rank_absolute + + def fit(self, X, y,weights=None): + """Scikit-learn required: Computes the feature importance scores from the training data. 
+ Parameters + ---------- + X: array-like {n_samples, n_features} Training instances to compute the feature importance scores from + y: array-like {n_samples} Training labels + Returns + ------- + self + """ + #random_state + if self.random_state != None: + np.random.seed(self.random_state) + random.seed(self.random_state) + + #Make subsets with all the features + num_features = X.shape[1] + self.size_feature_subset = min(self.size_feature_subset,num_features) + subsets = self.make_subsets(list(range(num_features)),self.num_feature_subset,self.size_feature_subset) + + #Fit each subset + scores = [] + for subset in subsets: + new_X = self.custom_transform(X,subset) + copy_relief_object = copy.deepcopy(self.relief_object) + if not isinstance(weights,np.ndarray): + copy_relief_object.fit(new_X,y) + else: + copy_relief_object.fit(new_X,y,weights=weights[subset]) + raw_score = copy_relief_object.feature_importances_ + score = np.empty(num_features) + if self.rank_absolute: + score.fill(0) + else: + score.fill(np.NINF) + counter = 0 + for index in subset: + score[index] = raw_score[counter] + counter+=1 + scores.append(score) + + #DEBUGGING + #print(score) + + scores = np.array(scores) + + #Merge results by selecting largest found weight for each feature + max_scores = [] + for score in scores.T: + if self.rank_absolute: + max = np.max(np.absolute(score)) + if max in score: + max_scores.append(max) + else: + max_scores.append(-max) + else: + max_scores.append(np.max(score)) + max_scores = np.array(max_scores) + + #Save FI as feature_importances_ + self.feature_importances_ = max_scores + + if self.rank_absolute: + self.top_features_ = np.argsort(np.absolute(self.feature_importances_))[::-1] + else: + self.top_features_ = np.argsort(self.feature_importances_)[::-1] + + return self + + def custom_transform(self,X,indices_to_preserve): + return X[:,indices_to_preserve] + + def make_subsets(self,possible_indices,num_feature_subset,size_feature_subset): + if num_feature_subset * size_feature_subset < len(possible_indices): + raise Exception('num_feature_subset * size_feature_subset must be >= number of total features') + + if size_feature_subset > len(possible_indices): + raise Exception('size_feature_subset cannot be > number of total features') + + random.shuffle(possible_indices) + remaining_indices = copy.deepcopy(possible_indices) + + subsets = [] + while True: + subset = [] + while len(remaining_indices) > 0 and len(subset) < size_feature_subset: + subset.append(remaining_indices.pop(0)) + subsets.append(subset) + if len(remaining_indices) < size_feature_subset: + break + + if len(remaining_indices) != 0: + while len(remaining_indices) < size_feature_subset: + index_bad = True + while index_bad: + potential_index = random.choice(possible_indices) + if not (potential_index in remaining_indices): + remaining_indices.append(potential_index) + break + subsets.append(remaining_indices) + + subsets_left = num_feature_subset - len(subsets) + for i in range(subsets_left): + subsets.append(random.sample(possible_indices,size_feature_subset)) + + return subsets + + def check_is_int(self, num): + try: + n = float(num) + if num - int(num) == 0: + return True + else: + return False + except: + return False + + def check_is_float(self, num): + try: + n = float(num) + return True + except: + return False + + def transform(self, X): + if X.shape[1] < self.relief_object.n_features_to_select: + raise ValueError('Number of features to select is larger than the number of features in the dataset.') + + return X[:, 
self.top_features_[:self.relief_object.n_features_to_select]] + + def fit_transform(self, X, y, weights=None): + self.fit(X, y, weights) + return self.transform(X) \ No newline at end of file diff --git a/skrebate/vlsrelief.py b/skrebate/vlsrelief.py deleted file mode 100644 index dcd2870..0000000 --- a/skrebate/vlsrelief.py +++ /dev/null @@ -1,176 +0,0 @@ -import numpy as np -import pandas as pd -import time -import warnings -import sys -from sklearn.base import BaseEstimator -from sklearn.base import TransformerMixin -# from sklearn.feature_selection.base import SelectorMixin -from joblib import Parallel, delayed -# from .scoring_utils import get_row_missing, ReliefF_compute_scores -from .multisurf import MultiSURF -from .multisurfstar import MultiSURFstar -from .surf import SURF -from .surfstar import SURFstar -from .relieff import ReliefF - - -class VLSRelief(BaseEstimator, TransformerMixin): - - """Feature selection using data-mined expert knowledge. - Based on the ReliefF algorithm as introduced in: - Kononenko, Igor et al. Overcoming the myopia of inductive learning - algorithms with RELIEFF (1997), Applied Intelligence, 7(1), p39-55 - """ - - def __init__(self, core_algorithm, n_features_to_select=2, n_neighbors=100, num_feature_subset=20, size_feature_subset=10, discrete_threshold=10, verbose=False, n_jobs=1): - """Sets up VLSRelief to perform feature selection. - Parameters - ---------- - core_algorithm: Core Relief Algorithm to perform VLSRelief iterations on - n_features_to_select: int (default: 10) - the number of top features (according to the relieff score) to - retain after feature selection is applied. - num_feature_subset: int (default: 40) - Number of subsets generated at random - size_feature_subset: int (default 5) - Number of features in each subset generated - discrete_threshold: int (default: 10) - Value used to determine if a feature is discrete or continuous. - If the number of unique levels in a feature is > discrete_threshold, then it is - considered continuous, or discrete otherwise. - verbose: bool (default: False) - If True, output timing of distance array and scoring - n_jobs: int (default: 1) - The number of cores to dedicate to computing the scores with joblib. - Assigning this parameter to -1 will dedicate as many cores as are available on your system. - We recommend setting this parameter to -1 to speed up the algorithm as much as possible. - """ - self.core_algorithm = core_algorithm - self.n_features_to_select = n_features_to_select - self.n_neighbors = n_neighbors - self.discrete_threshold = discrete_threshold - self.verbose = verbose - self.n_jobs = n_jobs - self.num_feature_subset = num_feature_subset - self.size_feature_subset = size_feature_subset - - #=========================================================================# - # headers = list(genetic_data.drop("class",axis=1)) - def fit(self, X, y, headers): - """ - Generates `num_feature_subset` sets of features each of size `size_feature_subset`. - Thereafter, uses the input `core_algorithm` to determine feature importance scores - for each subset. The global feature score is determined by the max score for that feature - from all its occurences in the subsets generated. Once the final feature scores are obtained, - the top `n_features_to_select` features can be selected. 
- Parameters - ---------- - X: array-like {n_samples, n_features} - Training instances to compute the feature importance scores from - y: array-like {n_samples} - Training labels - headers: array-like {n_features} - Feature names - Returns - ------- - Copy of the VLSRelief instance - """ - - self.X_mat = X - self._y = y - self.headers = headers - - if self.core_algorithm.lower() == "multisurf": - core = MultiSURF(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) - - elif self.core_algorithm.lower() == "multisurfstar": - core = MultiSURFstar(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) - - elif self.core_algorithm.lower() == "surf": - core = SURF(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) - - elif self.core_algorithm.lower() == "surfstar": - core = SURFstar(n_features_to_select=self.n_features_to_select, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) - - elif self.core_algorithm.lower() == "relieff": - core = ReliefF(n_features_to_select=self.n_features_to_select, n_neighbors=self.n_neighbors, discrete_threshold=self.discrete_threshold, verbose=self.verbose, n_jobs=self.n_jobs) - - total_num_features = X.shape[1] - num_features = self.size_feature_subset - features_scores_iter = [] - headers_iter = [] - features_selected = [] - - for iteration in range(self.num_feature_subset): - features_selected_id = np.random.choice( - range(total_num_features), num_features, replace=False) - self.X_train = self.X_mat[:, features_selected_id] - - core_fit = core.fit(self.X_train, self._y) - - features_scores_iter.append(core_fit.feature_importances_) - features_selected.append(features_selected_id) - # headers_iter.append(self.headers[features_selected_id]) - - self.features_scores_iter = features_scores_iter - self.features_selected = features_selected - - zip_feat_score = [list(zip(features_selected[i], features_scores_iter[i])) - for i in range(len(features_selected))] - feat_score = sorted([item for sublist in zip_feat_score for item in sublist]) - feat_score_df = pd.DataFrame(feat_score) - feat_score_df.columns = ['feature', 'score'] - feat_score_df = feat_score_df.groupby('feature').max().reset_index() - - feature_scores = feat_score_df.values - - feature_scores = [[int(i[0]), i[1]] for i in feature_scores] - - self.feat_score = feature_scores - - head_idx = [i[0] for i in feature_scores] - self.headers_model = list(np.array(self.headers)[head_idx]) - - self.feature_importances_ = [i[1] for i in feature_scores] - self.top_features_ = np.argsort(self.feature_importances_)[::-1] - self.header_top_features_ = [self.headers_model[i] for i in self.top_features_] - - return self - - #=========================================================================# - - def transform(self, X): - """Reduces the feature set down to the top `n_features_to_select` features. 
- Parameters - ---------- - X: array-like {n_samples, n_features} - Feature matrix to perform feature selection on - Returns - ------- - X_reduced: array-like {n_samples, n_features_to_select} - Reduced feature matrix - """ - - return X[:, self.top_features_[:self.n_features_to_select]] - - # return X[:, self.top_features_] - - #=========================================================================# - - def fit_transform(self, X, y, headers): - # def fit_transform(self, X, y): - """Computes the feature importance scores from the training data, then reduces the feature set down to the top `n_features_to_select` features. - Parameters - ---------- - X: array-like {n_samples, n_features} - Training instances to compute the feature importance scores from - y: array-like {n_samples} - Training labels - Returns - ------- - X_reduced: array-like {n_samples, n_features_to_select} - Reduced feature matrix - """ - self.fit(X, y, headers) - return self.transform(X)
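Below are illustrative usage sketches for the reworked API introduced in this diff. They are not part of the patch itself; the data, parameter values, and variable names are placeholders. The Iter wrapper repeatedly re-fits the wrapped Relief-based estimator, feeding the previous run's scores back in as feature weights until the scores converge or max_iter is reached. A minimal sketch, assuming a ReliefF core:

```python
import numpy as np
from skrebate import ReliefF, Iter

# Placeholder data: 200 instances, 20 discrete features, binary endpoint.
rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(200, 20)).astype(float)
y = rng.integers(0, 2, size=200)

# The wrapped estimator must accept an optional 'weights' argument in fit()
# and expose feature_importances_ / rank_absolute, as ReliefF now does.
core = ReliefF(n_features_to_select=5, n_neighbors=10, rank_absolute=True)
iter_relief = Iter(core, max_iter=10, convergence_threshold=0.0001, beta=0.1)
iter_relief.fit(X, y)

print(iter_relief.feature_importances_)   # scores from the final iteration
print(iter_relief.top_features_[:5])      # indices ranked by |score| (rank_absolute=True)
X_top = iter_relief.transform(X)          # keeps core.n_features_to_select columns
```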
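From the second iteration on, Iter blends the previous weight vector with the newly transformed scores using a Widrow-Hoff style update, w <- w + beta * (w_new - w), so a small beta damps oscillation between iterations. A tiny numeric illustration with made-up values:

```python
import numpy as np

def widrow_hoff(original_w, new_w, beta):
    # Move the old weights a fraction 'beta' of the way toward the new ones.
    return original_w + beta * (new_w - original_w)

w_prev = np.array([1.0, 0.2, 0.0])
w_cand = np.array([0.6, 0.8, 0.1])
print(widrow_hoff(w_prev, w_cand, beta=0.1))   # -> [0.96 0.26 0.01]
```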
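The core estimators can also be driven directly with an explicit weights vector: the weights reweight the distance array used for neighbor finding, and with weight_final_scores=True they additionally scale each feature's final score. A sketch with hand-picked placeholder weights:

```python
import numpy as np
from skrebate import ReliefF

rng = np.random.default_rng(1)
X = rng.random((80, 6))                 # 80 instances, 6 continuous features (placeholder)
y = rng.integers(0, 2, size=80)

w = np.array([1.0, 0.5, 0.5, 0.1, 1.0, 0.2])   # one weight per feature

r = ReliefF(n_features_to_select=3, n_neighbors=10, weight_final_scores=True)
r.fit(X, y, weights=w)                  # len(weights) must equal X.shape[1]
print(r.feature_importances_)
```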
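When weights are supplied, neighbor distances come from weighted pdist calls (or from get_row_missing_iter when values are missing). A minimal sketch of the continuous-only branch of _distarray_no_missing_iter, assuming a SciPy version whose pdist accepts the w keyword for the cityblock metric:

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Three instances, four continuous features already scaled to [0, 1] (toy values).
xc = np.array([[0.2, 0.9, 0.1, 0.5],
               [0.1, 0.4, 0.9, 0.5],
               [0.8, 0.5, 0.3, 0.0]])
w = np.array([1.0, 0.5, 0.25, 0.0])    # per-feature weights; 0 drops a feature entirely

# Weighted Manhattan (cityblock) distances between all instance pairs.
dist_matrix = squareform(pdist(xc, metric='cityblock', w=w))
print(dist_matrix)
```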
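VLS scores num_feature_subset random feature subsets of size size_feature_subset with the wrapped estimator and keeps, for every feature, the largest score it received in any subset. A sketch assuming MultiSURF picks up the new rank_absolute/weight_final_scores constructor arguments through the SURF base class:

```python
import numpy as np
from skrebate import MultiSURF, VLS

rng = np.random.default_rng(42)
X = rng.random((150, 50))               # 150 instances, 50 continuous features (placeholder)
y = rng.integers(0, 2, size=150)

# num_feature_subset * size_feature_subset must cover all 50 features.
core = MultiSURF(n_features_to_select=10)
vls = VLS(core, num_feature_subset=10, size_feature_subset=5, random_state=42)
vls.fit(X, y)

print(vls.feature_importances_)         # per-feature maximum across subsets
X_top = vls.transform(X)                # keeps core.n_features_to_select columns
```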
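TURF replaces the old TuRF(core_algorithm, ...).fit(X, y, headers) interface: it now wraps an already-constructed Relief-based estimator and uses a plain fit(X, y). Each iteration keeps the top pct fraction of the remaining features (or removes a fixed count when pct is an int) until num_scores_to_return features remain; features dropped along the way keep a score of 0. A minimal sketch:

```python
import numpy as np
from skrebate import ReliefF, TURF

rng = np.random.default_rng(7)
X = rng.random((100, 40))               # placeholder data: 100 instances, 40 features
y = rng.integers(0, 2, size=100)

turf = TURF(ReliefF(n_features_to_select=5, n_neighbors=10),
            pct=0.5, num_scores_to_return=10)
turf.fit(X, y)                          # iterations use 40 -> 20 -> 10 features

print(turf.feature_importances_.shape)  # (40,): removed features keep a score of 0
print(turf.top_features_[:5])
X_top = turf.transform(X)               # keeps the wrapped estimator's n_features_to_select columns
```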