import copy

import numpy as np
from sklearn.base import BaseEstimator


class TURF(BaseEstimator):
    def __init__(self, relief_object, pct=0.5, num_scores_to_return=100):
        '''
        :param relief_object: Must be an object that implements the standard sklearn fit function and, after fit,
                              exposes feature_importances_ and top_features_ attributes. Scores must be a 1D np.ndarray
                              of length # of features.
        :param pct: Fraction of features to retain after each iteration (if float), or number of features to remove
                    each iteration (if int).
        :param num_scores_to_return: Number of nonzero scores to return after training. Default = min(num_features, 100).
        '''
        if not self.check_is_int(num_scores_to_return) or num_scores_to_return < 0:
            raise Exception('num_scores_to_return must be a nonnegative integer')
        if (not self.check_is_int(pct) and not self.check_is_float(pct)) or pct < 0:
            raise Exception('pct must be a nonnegative integer/float')
        if (not self.check_is_int(pct) and self.check_is_float(pct)) and (pct < 0 or pct > 1):
            raise Exception('if pct is a float, it must be in [0, 1]')

        self.relief_object = relief_object
        self.pct = pct
        self.num_scores_to_return = num_scores_to_return
        self.rank_absolute = self.relief_object.rank_absolute

    def fit(self, X, y):
        """Scikit-learn required: Computes the feature importance scores from the training data.

        Parameters
        ----------
        X: array-like {n_samples, n_features}
            Training instances to compute the feature importance scores from
        y: array-like {n_samples}
            Training labels

        Returns
        -------
        self
        """
        # Cap num_scores_to_return at the number of features
        num_features = X.shape[1]
        self.num_scores_to_return = min(self.num_scores_to_return, num_features)

        if self.num_scores_to_return != num_features and self.pct == 1:
            raise Exception('num_scores_to_return != num_features and pct == 1. TURF will never reach your intended destination.')

        # Determine how many features to keep in each iteration
        features_per_iteration = self.get_features_per_iteration(num_features, self.pct, self.num_scores_to_return)
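        # e.g. with 20 features, pct=0.5, and num_scores_to_return=5, the
        # schedule would be [20, 10, 5] (see get_features_per_iteration below)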

        # Iterate runs: fit on all features first, then repeatedly refit on the survivors
        binary_scores_existence_tracker = np.ones(num_features)  # 1 means this original feature is still in play
        copy_relief_object = copy.deepcopy(self.relief_object)
        copy_relief_object.fit(X, y)

        features_per_iteration.pop(0)  # the first entry (all features) was just fit above
        for num_features_to_use_in_iteration in features_per_iteration:
            # Find the top feature indices, relative to the current (reduced) feature matrix
            best_raw_indices = copy_relief_object.top_features_[:num_features_to_use_in_iteration]

            # Map raw feature indices back to original feature indices:
            # onesCounter is the raw index of the current surviving feature, so a
            # survivor is zeroed out when its raw index is not among the best
            # (entries that are already 0 stay 0).
            onesCounter = 0
            copy_tracker = copy.deepcopy(binary_scores_existence_tracker)
            for i in range(len(binary_scores_existence_tracker)):
                if onesCounter not in best_raw_indices:
                    binary_scores_existence_tracker[i] = 0
                if copy_tracker[i] == 1:
                    onesCounter += 1
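            # Worked example (hypothetical values): tracker [1, 0, 1, 1] with
            # best_raw_indices [0, 2] keeps raw features 0 and 2 of the reduced
            # matrix, i.e. original features 0 and 3, giving [1, 0, 0, 1].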

            # Build the reduced feature matrix from the surviving original indices
            new_indices = []
            for i in range(len(binary_scores_existence_tracker)):
                if binary_scores_existence_tracker[i] == 1:
                    new_indices.append(i)
            new_X = X[:, new_indices]

            # Refit a fresh copy of the relief object on the reduced matrix
            copy_relief_object = copy.deepcopy(self.relief_object)
            copy_relief_object.fit(new_X, y)

        # Write the surviving scores back into their original positions, leaving zeros elsewhere
        raw_scores = copy_relief_object.feature_importances_
        counter = 0
        for i in range(len(binary_scores_existence_tracker)):
            if binary_scores_existence_tracker[i] == 1:
                binary_scores_existence_tracker[i] = raw_scores[counter]
                counter += 1
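        # e.g. (hypothetical values) tracker [1, 0, 0, 1] with raw_scores
        # [0.8, 0.2] becomes [0.8, 0.0, 0.0, 0.2]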

        # Save the final scores as feature_importances_ and rank features by score
        self.feature_importances_ = binary_scores_existence_tracker
        if self.rank_absolute:
            self.top_features_ = np.argsort(np.absolute(self.feature_importances_))[::-1]
        else:
            self.top_features_ = np.argsort(self.feature_importances_)[::-1]

        return self

    def get_features_per_iteration(self, num_features, pct, num_scores_to_return):
        # Build the shrinking schedule of feature counts, from all features
        # down to num_scores_to_return
        features_per_iteration = [num_features]
        features_left = num_features
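        # Worked examples (hypothetical values):
        #   get_features_per_iteration(10, 2, 5)   -> [10, 8, 6, 5]
        #   get_features_per_iteration(20, 0.5, 5) -> [20, 10, 5]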
        if num_features != num_scores_to_return:
            if self.check_is_int(pct):  # int pct: remove a fixed number of features per iteration
                while True:
                    if features_left - pct > num_scores_to_return:
                        features_left -= pct
                        features_per_iteration.append(features_left)
                    else:
                        features_per_iteration.append(num_scores_to_return)
                        break
            else:  # float pct: retain a fixed fraction of features per iteration
                while True:
                    if int(features_left * pct) > num_scores_to_return:
                        features_left = int(features_left * pct)
                        features_per_iteration.append(features_left)
                    else:
                        features_per_iteration.append(num_scores_to_return)
                        break
        return features_per_iteration

    def check_is_int(self, num):
        # True for numeric values with no fractional part, e.g. 3 or 3.0 (but not 0.5)
        try:
            float(num)
            return num - int(num) == 0
        except (ValueError, TypeError):
            return False

    def check_is_float(self, num):
        # True for anything float() accepts, so integers pass as well
        try:
            float(num)
            return True
        except (ValueError, TypeError):
            return False

    def transform(self, X):
        if X.shape[1] < self.relief_object.n_features_to_select:
            raise ValueError('Number of features to select is larger than the number of features in the dataset.')
        return X[:, self.top_features_[:self.relief_object.n_features_to_select]]

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)
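

if __name__ == '__main__':
    # Minimal usage sketch. _ToyRelief is a hypothetical stand-in, not part of
    # this module: any real relief_object (e.g. one of scikit-rebate's
    # estimators) must expose fit, feature_importances_, top_features_,
    # n_features_to_select, and rank_absolute, as TURF assumes above.
    class _ToyRelief:
        def __init__(self, n_features_to_select=2):
            self.n_features_to_select = n_features_to_select
            self.rank_absolute = False

        def fit(self, X, y):
            # Hypothetical scorer: absolute Pearson correlation with the target
            self.feature_importances_ = np.array(
                [abs(np.corrcoef(X[:, j], y)[0, 1]) for j in range(X.shape[1])])
            self.top_features_ = np.argsort(self.feature_importances_)[::-1]
            return self

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 20))
    y = X[:, 3] + 0.5 * X[:, 7] + rng.normal(scale=0.1, size=200)

    turf = TURF(_ToyRelief(n_features_to_select=2), pct=0.5, num_scores_to_return=5)
    turf.fit(X, y)
    print(turf.top_features_[:5])   # features 3 and 7 should rank near the top
    print(turf.transform(X).shape)  # (200, 2)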