# -*- coding: utf-8 -*-
"""
scikit-rebate was primarily developed at the University of Pennsylvania by:
- Randal S. Olson (rso@randalolson.com)
- Pete Schmitt (pschmitt@upenn.edu)
- Ryan J. Urbanowicz (ryanurb@upenn.edu)
- Weixuan Fu (weixuanf@upenn.edu)
- and many more generous open source contributors
Permission is hereby granted, free of charge, to any person obtaining a copy of this software
and associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
from __future__ import print_function
import numpy as np
import time
import warnings
import sys
from sklearn.base import BaseEstimator
from joblib import Parallel, delayed
from .scoring_utils import get_row_missing, ReliefF_compute_scores, get_row_missing_iter
class ReliefF(BaseEstimator):
"""Feature selection using data-mined expert knowledge.
Based on the ReliefF algorithm as introduced in:
Igor et al. Overcoming the myopia of inductive learning
algorithms with RELIEFF (1997), Applied Intelligence, 7(1), p39-55"""
"""Note that ReliefF class establishes core functionality that is inherited by all other Relief-based algorithms.
Assumes: * There are no missing values in the label/outcome/dependent variable.
* For ReliefF, the setting of k is <= to the number of instances that have the least frequent class label
(binary and multiclass endpoint data. """
    def __init__(self, n_features_to_select=10, n_neighbors=100, discrete_threshold=10,
                 verbose=False, n_jobs=1, weight_final_scores=False, rank_absolute=False):
"""Sets up ReliefF to perform feature selection. Note that an approximation of the original 'Relief'
algorithm may be run by setting 'n_features_to_select' to 1. Also note that the original Relief parameter 'm'
is not included in this software. 'm' specifies the number of random training instances out of 'n' (total
training instances) used to update feature scores. Since scores are most representative when m=n, all
available training instances are utilized in all Relief-based algorithm score updates here. If the user
wishes to utilize a smaller 'm' in Relief-based scoring, simply pass any of these algorithms a subset of the
original training dataset samples.
Parameters
----------
n_features_to_select: int (default: 10)
the number of top features (according to the relieff score) to
retain after feature selection is applied.
n_neighbors: int or float (default: 100)
The number of neighbors to consider when assigning feature
importance scores. If a float number is provided, that percentage of
training samples is used as the number of neighbors.
More neighbors results in more accurate scores, but takes longer.
discrete_threshold: int (default: 10)
Value used to determine if a feature is discrete or continuous.
If the number of unique levels in a feature is > discrete_threshold, then it is
considered continuous, or discrete otherwise.
verbose: bool (default: False)
If True, output timing of distance array and scoring
n_jobs: int (default: 1)
The number of cores to dedicate to computing the scores with joblib.
Assigning this parameter to -1 will dedicate as many cores as are available on your system.
We recommend setting this parameter to -1 to speed up the algorithm as much as possible.
        weight_final_scores: bool (default: False)
            Whether to multiply the weights given to fit() into the final scores.
            Only applicable if weights are given.
        rank_absolute: bool (default: False)
            Whether to rank the top features by the absolute value of their scores.
"""
self.n_features_to_select = n_features_to_select
self.n_neighbors = n_neighbors
self.discrete_threshold = discrete_threshold
self.verbose = verbose
self.n_jobs = n_jobs
self.weight_final_scores = weight_final_scores
self.rank_absolute = rank_absolute
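
    # A minimal usage sketch (illustrative, not from the original source; assumes
    # `X` is a numpy feature matrix and `y` the corresponding label vector):
    #
    #     fs = ReliefF(n_features_to_select=2, n_neighbors=100, n_jobs=-1)
    #     fs.fit(X, y)
    #     print(fs.feature_importances_)   # one score per feature
    #     X_top = fs.transform(X)          # keeps the 2 top-ranked features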
#=========================================================================#
def fit(self, X, y, weights=None):
"""Scikit-learn required: Computes the feature importance scores from the training data.
Parameters
----------
X: array-like {n_samples, n_features}
Training instances to compute the feature importance scores from
y: array-like {n_samples}
Training labels
        weights: np.ndarray or list, optional (default: None)
            Feature weight vector used by Iterative Relief; its length must equal
            the number of features.
Returns
-------
Copy of the ReliefF instance
"""
self._X = X # matrix of predictive variables ('independent variables')
self._y = y # vector of values for outcome variable ('dependent variable')
        if weights is not None:
            if isinstance(weights, np.ndarray):
                if len(weights) != len(X[0]):
                    raise Exception('Dimension of weights param must match number of features')
            elif isinstance(weights, list):
                if len(weights) != len(X[0]):
                    raise Exception('Dimension of weights param must match number of features')
                # np.array() (not np.ndarray()) constructs an array from the list's values
                weights = np.array(weights)
            else:
                raise Exception('weights param must be numpy array or list')
self._weights = weights
# Set up the properties for ReliefF -------------------------------------------------------------------------------------
self._datalen = len(self._X) # Number of training instances ('n')
""""Below: Handles special case where user requests that a proportion of training instances be neighbors for
ReliefF rather than a specified 'k' number of neighbors. Note that if k is specified, then k 'hits' and k
'misses' will be used to update feature scores. Thus total number of neighbors is 2k. If instead a proportion
is specified (say 0.1 out of 1000 instances) this represents the total number of neighbors (e.g. 100). In this
case, k would be set to 50 (i.e. 50 hits and 50 misses). """
        if isinstance(self.n_neighbors, float):
# Halve the number of neighbors because ReliefF uses n_neighbors matches and n_neighbors misses
self.n_neighbors = int(self.n_neighbors * self._datalen * 0.5)
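            # e.g. n_neighbors=0.1 with 1000 training instances gives
            # int(0.1 * 1000 * 0.5) = 50, i.e. 50 hits and 50 misses per target instance.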
# Number of unique outcome (label) values (used to determine outcome variable type)
self._label_list = list(set(self._y))
# Determine if label is discrete
discrete_label = (len(self._label_list) <= self.discrete_threshold)
# Identify label type (binary, multiclass, or continuous)
if discrete_label:
if len(self._label_list) == 2:
self._class_type = 'binary'
self.mcmap = 0
elif len(self._label_list) > 2:
self._class_type = 'multiclass'
self.mcmap = self._getMultiClassMap()
else:
raise ValueError('All labels are of the same class.')
else:
self._class_type = 'continuous'
self.mcmap = 0
# Training labels standard deviation -- only used if the training labels are continuous
self._labels_std = 0.
        if not discrete_label:
self._labels_std = np.std(self._y, ddof=1)
self._num_attributes = len(self._X[0]) # Number of features in training data
# Number of missing data values in predictor variable matrix.
self._missing_data_count = np.isnan(self._X).sum()
"""Assign internal headers for the features (scikit-learn does not accept external headers from dataset):
The pre_normalize() function relies on the headers being ordered, e.g., X01, X02, etc.
If this is changed, then the sort in the pre_normalize() function needs to be adapted as well. """
xlen = len(self._X[0])
mxlen = len(str(xlen + 1))
self._headers = ['X{}'.format(str(i).zfill(mxlen)) for i in range(1, xlen + 1)]
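        # e.g. a dataset with 12 features yields internal headers 'X01', 'X02', ..., 'X12'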
start = time.time() # Runtime tracking
# Determine data types for all features/attributes in training data (i.e. discrete or continuous)
C = D = False
# Examines each feature and applies discrete_threshold to determine variable type.
self.attr = self._get_attribute_info()
for key in self.attr.keys():
if self.attr[key][0] == 'discrete':
D = True
if self.attr[key][0] == 'continuous':
C = True
# For downstream computational efficiency, determine if dataset is comprised of all discrete, all continuous, or a mix of discrete/continuous features.
if C and D:
self.data_type = 'mixed'
elif D and not C:
self.data_type = 'discrete'
elif C and not D:
self.data_type = 'continuous'
else:
raise ValueError('Invalid data type in data set.')
#--------------------------------------------------------------------------------------------------------------------
# Compute the distance array between all data points ----------------------------------------------------------------
# For downstream efficiency, separate features in dataset by type (i.e. discrete/continuous)
diffs, cidx, didx = self._dtype_array()
cdiffs = diffs[cidx] # max/min continuous value difference for continuous features.
xc = self._X[:, cidx] # Subset of continuous-valued feature data
xd = self._X[:, didx] # Subset of discrete-valued feature data
""" For efficiency, the distance array is computed more efficiently for data with no missing values.
This distance array will only be used to identify nearest neighbors. """
if self._missing_data_count > 0:
if not isinstance(self._weights, np.ndarray):
self._distance_array = self._distarray_missing(xc, xd, cdiffs)
else:
self._distance_array = self._distarray_missing_iter(xc, xd, cdiffs, self._weights)
else:
if not isinstance(self._weights, np.ndarray):
self._distance_array = self._distarray_no_missing(xc, xd)
else:
self._distance_array = self._distarray_no_missing_iter(xc, xd, self._weights)
if self.verbose:
elapsed = time.time() - start
print('Created distance array in {} seconds.'.format(elapsed))
print('Feature scoring under way ...')
start = time.time()
#--------------------------------------------------------------------------------------------------------------------
# Run remainder of algorithm (i.e. identification of 'neighbors' for each instance, and feature scoring).------------
# Stores feature importance scores for ReliefF or respective Relief-based algorithm.
self.feature_importances_ = self._run_algorithm()
# Delete the internal distance array because it is no longer needed
del self._distance_array
if self.verbose:
elapsed = time.time() - start
print('Completed scoring in {} seconds.'.format(elapsed))
# Compute indices of top features
if self.rank_absolute:
self.top_features_ = np.argsort(np.absolute(self.feature_importances_))[::-1]
else:
self.top_features_ = np.argsort(self.feature_importances_)[::-1]
return self
#=========================================================================#
def transform(self, X):
"""Scikit-learn required: Reduces the feature set down to the top `n_features_to_select` features.
Parameters
----------
X: array-like {n_samples, n_features}
Feature matrix to perform feature selection on
Returns
-------
X_reduced: array-like {n_samples, n_features_to_select}
Reduced feature matrix
"""
if self._num_attributes < self.n_features_to_select:
raise ValueError('Number of features to select is larger than the number of features in the dataset.')
return X[:, self.top_features_[:self.n_features_to_select]]
#=========================================================================#
def fit_transform(self, X, y):
"""Scikit-learn required: Computes the feature importance scores from the training data, then reduces the feature set down to the top `n_features_to_select` features.
Parameters
----------
X: array-like {n_samples, n_features}
Training instances to compute the feature importance scores from
y: array-like {n_samples}
Training labels
Returns
-------
X_reduced: array-like {n_samples, n_features_to_select}
Reduced feature matrix
"""
self.fit(X, y)
return self.transform(X)
######################### SUPPORTING FUNCTIONS ###########################
def _getMultiClassMap(self):
""" Relief algorithms handle the scoring updates a little differently for data with multiclass outcomes. In ReBATE we implement multiclass scoring in line with
the strategy described by Kononenko 1994 within the RELIEF-F variant which was suggested to outperform the RELIEF-E multiclass variant. This strategy weights
score updates derived from misses of different classes by the class frequency observed in the training data. 'The idea is that the algorithm should estimate the
ability of attributes to separate each pair of classes regardless of which two classes are closest to each other'. In this method we prepare for this normalization
by creating a class dictionary, and storing respective class frequencies. This is needed for ReliefF multiclass score update normalizations. """
mcmap = dict()
for i in range(self._datalen):
            if self._y[i] not in mcmap:
                # Count the first occurrence as 1 so class frequencies are not off by one.
                mcmap[self._y[i]] = 1
            else:
                mcmap[self._y[i]] += 1
for each in self._label_list:
mcmap[each] = mcmap[each]/float(self._datalen)
return mcmap
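    # Worked example (illustrative): with y = [0, 0, 0, 1, 2, 2] the loop above
    # counts {0: 3, 1: 1, 2: 2}, so the returned map of class frequencies is
    # {0: 3/6, 1: 1/6, 2: 2/6}.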
def _get_attribute_info(self):
""" Preprocess the training dataset to identify which features/attributes are discrete vs. continuous valued. Ignores missing values in this determination."""
attr = dict()
d = 0
limit = self.discrete_threshold
w = self._X.transpose()
for idx in range(len(w)):
h = self._headers[idx]
z = w[idx]
if self._missing_data_count > 0:
z = z[np.logical_not(np.isnan(z))] # Exclude any missing values from consideration
zlen = len(np.unique(z))
if zlen <= limit:
attr[h] = ('discrete', 0, 0, 0, 0)
d += 1
else:
mx = np.max(z)
mn = np.min(z)
sd = np.std(z)
attr[h] = ('continuous', mx, mn, mx - mn, sd)
        # For each feature/attribute we store (type, max value, min value, max-min difference,
        # standard deviation) - the latter four values are set to zero if the feature is discrete.
return attr
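    # Worked example (illustrative): a continuous feature spanning [2.0, 8.0] with a
    # standard deviation of 1.5 maps to ('continuous', 8.0, 2.0, 6.0, 1.5), while a
    # discrete feature maps to ('discrete', 0, 0, 0, 0).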
def _distarray_no_missing(self, xc, xd):
"""Distance array calculation for data with no missing values. The 'pdist() function outputs a condense distance array, and squareform() converts this vector-form
distance vector to a square-form, redundant distance matrix.
*This could be a target for saving memory in the future, by not needing to expand to the redundant square-form matrix. """
from scipy.spatial.distance import pdist, squareform
#------------------------------------------#
def pre_normalize(x):
"""Normalizes continuous features so they are in the same range (0 to 1)"""
idx = 0
            # Goes through all named features (doesn't really need to); this method is
            # only applied to continuous features.
for i in sorted(self.attr.keys()):
if self.attr[i][0] == 'discrete':
continue
cmin = self.attr[i][2]
diff = self.attr[i][3]
x[:, idx] -= cmin
x[:, idx] /= diff
idx += 1
return x
#------------------------------------------#
if self.data_type == 'discrete': # discrete features only
return squareform(pdist(self._X, metric='hamming'))
elif self.data_type == 'mixed': # mix of discrete and continuous features
d_dist = squareform(pdist(xd, metric='hamming'))
# Cityblock is also known as Manhattan distance
c_dist = squareform(pdist(pre_normalize(xc), metric='cityblock'))
return np.add(d_dist * self._num_attributes, c_dist)
else: #continuous features only
#xc = pre_normalize(xc)
return squareform(pdist(pre_normalize(xc), metric='cityblock'))
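    # Illustrative sketch of the condensed vs. square distance forms used above
    # (standalone example; `a` is a made-up 3-instance, 2-feature matrix):
    #
    #     from scipy.spatial.distance import pdist, squareform
    #     import numpy as np
    #     a = np.array([[0., 0.], [1., 0.], [1., 1.]])
    #     pdist(a, metric='cityblock')              # array([1., 2., 1.])
    #     squareform(pdist(a, metric='cityblock'))  # redundant 3 x 3 symmetric matrix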
#==================================================================#
def _dtype_array(self):
"""Return mask for discrete(0)/continuous(1) attributes and their indices. Return array of max/min diffs of attributes."""
attrtype = []
attrdiff = []
for key in self._headers:
if self.attr[key][0] == 'continuous':
attrtype.append(1)
else:
attrtype.append(0)
attrdiff.append(self.attr[key][3])
attrtype = np.array(attrtype)
cidx = np.where(attrtype == 1)[0]
didx = np.where(attrtype == 0)[0]
attrdiff = np.array(attrdiff)
return attrdiff, cidx, didx
#==================================================================#
def _distarray_missing(self, xc, xd, cdiffs):
"""Distance array calculation for data with missing values"""
cindices = []
dindices = []
# Get Boolean mask locating missing values for continuous and discrete features separately. These correspond to xc and xd respectively.
for i in range(self._datalen):
cindices.append(np.where(np.isnan(xc[i]))[0])
dindices.append(np.where(np.isnan(xd[i]))[0])
if self.n_jobs != 1:
dist_array = Parallel(n_jobs=self.n_jobs)(delayed(get_row_missing)(
xc, xd, cdiffs, index, cindices, dindices) for index in range(self._datalen))
else:
# For each instance calculate distance from all other instances (in non-redundant manner) (i.e. computes triangle, and puts zeros in for rest to form square).
dist_array = [get_row_missing(xc, xd, cdiffs, index, cindices, dindices)
for index in range(self._datalen)]
return np.array(dist_array)
#==================================================================#
# For Iter Relief
def _distarray_no_missing_iter(self, xc, xd, weights):
"""Distance array calculation for data with no missing values. The 'pdist() function outputs a condense distance array, and squareform() converts this vector-form
distance vector to a square-form, redundant distance matrix.
*This could be a target for saving memory in the future, by not needing to expand to the redundant square-form matrix. """
from scipy.spatial.distance import pdist, squareform
# ------------------------------------------#
def pre_normalize(x):
"""Normalizes continuous features so they are in the same range (0 to 1)"""
idx = 0
            # Goes through all named features (doesn't really need to); this method is
            # only applied to continuous features.
for i in sorted(self.attr.keys()):
if self.attr[i][0] == 'discrete':
continue
cmin = self.attr[i][2]
diff = self.attr[i][3]
x[:, idx] -= cmin
x[:, idx] /= diff
idx += 1
return x
# ------------------------------------------#
if self.data_type == 'discrete': # discrete features only
return squareform(pdist(self._X, metric='hamming', w=weights))
elif self.data_type == 'mixed': # mix of discrete and continuous features
d_dist = squareform(pdist(xd, metric='hamming', w=weights))
# Cityblock is also known as Manhattan distance
c_dist = squareform(pdist(pre_normalize(xc), metric='cityblock', w=weights))
return np.add(d_dist, c_dist) / self._num_attributes
else: # continuous features only
# xc = pre_normalize(xc)
return squareform(pdist(pre_normalize(xc), metric='cityblock', w=weights))
# ==================================================================#
# For Iterrelief - get_row_missing_iter is called
def _distarray_missing_iter(self, xc, xd, cdiffs, weights):
"""Distance array calculation for data with missing values"""
cindices = []
dindices = []
# Get Boolean mask locating missing values for continuous and discrete features separately. These correspond to xc and xd respectively.
for i in range(self._datalen):
cindices.append(np.where(np.isnan(xc[i]))[0])
dindices.append(np.where(np.isnan(xd[i]))[0])
if self.n_jobs != 1:
dist_array = Parallel(n_jobs=self.n_jobs)(delayed(get_row_missing_iter)(
xc, xd, cdiffs, index, cindices, dindices, weights) for index in range(self._datalen))
else:
# For each instance calculate distance from all other instances (in non-redundant manner) (i.e. computes triangle, and puts zeros in for rest to form square).
dist_array = [get_row_missing_iter(xc, xd, cdiffs, index, cindices, dindices, weights)
for index in range(self._datalen)]
return np.array(dist_array)
# ==================================================================#
############################# ReliefF ############################################
def _find_neighbors(self, inst):
""" Identify k nearest hits and k nearest misses for given instance. This is accomplished differently based on the type of endpoint (i.e. binary, multiclass, and continuous). """
# Make a vector of distances between target instance (inst) and all others
dist_vect = []
for j in range(self._datalen):
if inst != j:
locator = [inst, j]
if inst < j:
locator.reverse()
dist_vect.append(self._distance_array[locator[0]][locator[1]])
else:
# Ensures that target instance is never selected as neighbor.
dist_vect.append(sys.maxsize)
dist_vect = np.array(dist_vect)
# Identify neighbors-------------------------------------------------------
""" NN for Binary Endpoints: """
if self._class_type == 'binary':
nn_list = []
match_count = 0
miss_count = 0
for nn_index in np.argsort(dist_vect):
if self._y[inst] == self._y[nn_index]: # Hit neighbor identified
if match_count >= self.n_neighbors:
continue
nn_list.append(nn_index)
match_count += 1
else: # Miss neighbor identified
if miss_count >= self.n_neighbors:
continue
nn_list.append(nn_index)
miss_count += 1
if match_count >= self.n_neighbors and miss_count >= self.n_neighbors:
break
elif self._class_type == 'multiclass':
nn_list = []
match_count = 0
miss_count = dict.fromkeys(self._label_list, 0)
for nn_index in np.argsort(dist_vect):
if self._y[inst] == self._y[nn_index]: # Hit neighbor identified
if match_count >= self.n_neighbors:
continue
nn_list.append(nn_index)
match_count += 1
                else:  # Miss neighbor identified
                    label = self._y[nn_index]
                    if miss_count[label] < self.n_neighbors:
                        nn_list.append(nn_index)
                        miss_count[label] += 1
if match_count >= self.n_neighbors and all(v >= self.n_neighbors for v in miss_count.values()):
break
else:
nn_list = []
match_count = 0
miss_count = 0
for nn_index in np.argsort(dist_vect):
if abs(self._y[inst]-self._y[nn_index]) < self._labels_std: # Hit neighbor identified
if match_count >= self.n_neighbors:
continue
nn_list.append(nn_index)
match_count += 1
else: # Miss neighbor identified
if miss_count >= self.n_neighbors:
continue
nn_list.append(nn_index)
miss_count += 1
if match_count >= self.n_neighbors and miss_count >= self.n_neighbors:
break
return np.array(nn_list)
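    # Note on the continuous-endpoint branch above: a neighbor counts as a 'hit'
    # when |y[inst] - y[neighbor]| < the label standard deviation, replacing the
    # same-class test used for discrete endpoints with a closeness test.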
def _run_algorithm(self):
""" Runs nearest neighbor (NN) identification and feature scoring to yield ReliefF scores. """
# Find nearest neighbors
NNlist = map(self._find_neighbors, range(self._datalen))
# Feature scoring - using identified nearest neighbors
nan_entries = np.isnan(self._X) # boolean mask for missing data values
        # Call the scoring method for the ReliefF algorithm
        if isinstance(self._weights, np.ndarray) and self.weight_final_scores:
            # Weighted scoring variant used by Iterative Relief (IRelief)
scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed(
ReliefF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap,
NN, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type, self._weights)
for instance_num, NN in zip(range(self._datalen), NNlist)), axis=0)
else:
            # Standard (unweighted) ReliefF scoring
scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed(
ReliefF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap,
NN, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type)
for instance_num, NN in zip(range(self._datalen), NNlist)), axis=0)
return np.array(scores)