Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Implemented performance speedup for binary ReliefF + bug fixes #79

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 69 additions & 4 deletions skrebate/relieff.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,42 @@ def find_neighbors_binary(
return np.array(nn_list)


# interestingly, this is sometimes faster without numba
# Author's review note: this is the core of the performance improvements for
# score computation - it expresses the computation as a sequence of numpy
# matrix operations.
def compute_score_binary(
    inst_x: np.ndarray,
    nearest_neighbors_x: np.ndarray,
    inst_y,
    nearest_neighbors_y: np.ndarray
) -> np.ndarray:
    """
    Compute the ReliefF score contribution of one instance for a binary
    dataset with discrete attributes.

    Every attribute on which the instance and a neighbor differ counts -1
    when the neighbor has the same class (a "hit") and +1 when it has a
    different class (a "miss"). Attribute values that are NaN (missing) in
    either the instance or the neighbor contribute 0.

    Input:
        inst_x - numpy array of instance values
        nearest_neighbors_x - numpy 2d array containing instance values for
            nearest neighbors (one row per neighbor)
        inst_y - class value for instance
        nearest_neighbors_y - numpy array of class values for nearest
            neighbors

    Returns:
        numpy integer array with a score for each attribute
    """
    inst_x = np.asarray(inst_x)
    nearest_neighbors_x = np.asarray(nearest_neighbors_x)

    # 1 where the instance and the neighbor disagree on an attribute
    diffs = (inst_x != nearest_neighbors_x).astype(int)

    # NaN != anything is always True, so missing values would otherwise be
    # counted as differences; summing with nansum afterwards cannot undo
    # that because `diffs` is already an integer array. Mask them here.
    missing = np.zeros(diffs.shape, dtype=bool)
    for values in (inst_x, nearest_neighbors_x):
        # np.isnan is only defined for numeric dtypes; integers have no NaN
        if np.issubdtype(values.dtype, np.floating):
            missing |= np.isnan(values)
    diffs = np.where(missing, 0, diffs)

    # compare classes to determine hits and misses (column vector so it
    # broadcasts across the attribute axis)
    hits = np.array(
        np.asarray(nearest_neighbors_y) == inst_y, ndmin=2
    ).T

    # -1 for hits (to penalize diffs) and +1 for misses (to reward them)
    signs = np.where(hits, -1, 1)

    return np.sum(diffs * signs, axis=0)


class ReliefF(BaseEstimator):
Expand Down Expand Up @@ -726,9 +762,38 @@ def _run_algorithm(self):
), axis=0)
else:
# Call the scoring method for the ReliefF algorithm
scores = np.sum(Parallel(n_jobs=self.n_jobs)(delayed(
ReliefF_compute_scores)(instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap,
NN, self._headers, self._class_type, self._X, self._y, self._labels_std, self.data_type)
for instance_num, NN in zip(range(self._datalen), NNlist)), axis=0)

if self._class_type == 'binary' and \
all([x[0] == 'discrete' for x in self.attr.values()]):
# use optimized function
scores = np.sum(Parallel(n_jobs=self.n_jobs)(
delayed(compute_score_binary)(
self._X[instance_num],
self._X[NNs],
self._y[instance_num],
self._y[NNs]
) for instance_num, NNs in zip(
range(self._datalen),
NNlist)
), axis=0) / (self._datalen * self.n_neighbors * 2)
else:
scores = np.sum(Parallel(n_jobs=self.n_jobs)(
delayed(ReliefF_compute_scores)(
instance_num,
self.attr,
nan_entries,
self._num_attributes,
self.mcmap,
NN,
self._headers,
self._class_type,
self._X,
self._y,
self._labels_std,
self.data_type
) for instance_num, NN in zip(
range(self._datalen),
NNlist)
), axis=0)

return np.array(scores)