joblibMerge branch 'development'

aadu · Jul 18, 2020 · a6c3421 · a6c3421
2 parents 1830056 + a5b05ef
commit a6c3421
Show file tree

Hide file tree

Showing 22 changed files with 3,109 additions and 49 deletions.
diff --git a/analysis_pipeline/.DS_Store b/analysis_pipeline/.DS_Store
diff --git a/analysis_pipeline/skrebatewip/__init__.py b/analysis_pipeline/skrebatewip/__init__.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+
+"""
+scikit-rebate was primarily developed at the University of Pennsylvania by:
+    - Randal S. Olson (rso@randalolson.com)
+    - Pete Schmitt (pschmitt@upenn.edu)
+    - Ryan J. Urbanowicz (ryanurb@upenn.edu)
+    - Weixuan Fu (weixuanf@upenn.edu)
+    - and many more generous open source contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+and associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+from ._version import __version__
+from .relieff import ReliefF
+from .surf import SURF
+from .surfstar import SURFstar
+from .multisurf import MultiSURF
+from .multisurfstar import MultiSURFstar
+from .turf import TURF
+from .vls import VLS
+from .iter import Iter
diff --git a/analysis_pipeline/skrebatewip/_version.py b/analysis_pipeline/skrebatewip/_version.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+
+"""
+scikit-rebate was primarily developed at the University of Pennsylvania by:
+    - Randal S. Olson (rso@randalolson.com)
+    - Pete Schmitt (pschmitt@upenn.edu)
+    - Ryan J. Urbanowicz (ryanurb@upenn.edu)
+    - Weixuan Fu (weixuanf@upenn.edu)
+    - and many more generous open source contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+and associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+__version__ = '0.7'  # package version string; re-exported by skrebatewip/__init__.py
diff --git a/analysis_pipeline/skrebatewip/iter.py b/analysis_pipeline/skrebatewip/iter.py
@@ -0,0 +1,139 @@
+from sklearn.base import BaseEstimator
+import copy
+import numpy as np
+
+class Iter(BaseEstimator):
+    """Iteratively re-fit a Relief-style scorer, feeding scores back as weights.
+
+    Every pass fits a fresh deep copy of ``relief_object``. From the second
+    pass on, the scores of the previous pass are rescaled into per-feature
+    weights and handed to ``fit`` through its ``weights`` keyword; from the
+    third pass on those weights are additionally blended with the weights
+    used in the previous pass by a Widrow-Hoff update. Iteration stops after
+    ``max_iter`` passes, or as soon as two consecutive score vectors differ
+    by less than ``convergence_threshold`` in every component.
+    """
+
+    def __init__(self, relief_object, max_iter=10, convergence_threshold=0.0001, beta=0.1):
+        '''
+        :param relief_object:           Must be an object that implements the standard sklearn fit function, and after fit, has attribute feature_importances_
+                                        that can be accessed. Scores must be a 1D np.ndarray of length # of features. The fit function must also be able to
+                                        take in an optional 1D np.ndarray 'weights' parameter of length num_features. Its rank_absolute and
+                                        n_features_to_select attributes are read by this wrapper as well.
+        :param max_iter:                Maximum number of iterations to run
+        :param convergence_threshold:   Difference between iteration feature weights to determine convergence
+        :param beta:                    Learning Rate for Widrow Hoff Weight Update
+        :raises ValueError:             If max_iter, convergence_threshold or beta fail validation
+        '''
+
+        # ValueError is a subclass of Exception, so callers that caught the
+        # previously raised bare Exception keep working.
+        if not self.check_is_int(max_iter) or max_iter < 0:
+            raise ValueError('max_iter must be a nonnegative integer')
+
+        if not self.check_is_float(convergence_threshold) or convergence_threshold < 0:
+            raise ValueError('convergence_threshold must be a nonnegative float')
+
+        if not self.check_is_float(beta):
+            raise ValueError('beta must be a float')
+
+        self.relief_object = relief_object
+        self.max_iter = max_iter
+        # Stored under the constructor parameter's own name so that
+        # BaseEstimator.get_params()/clone() can find it.
+        self.convergence_threshold = convergence_threshold
+        self.rank_absolute = self.relief_object.rank_absolute
+        self.beta = beta
+
+    @property
+    def converage_threshold(self):
+        """Misspelled legacy alias of ``convergence_threshold`` (kept for old callers)."""
+        return self.convergence_threshold
+
+    @converage_threshold.setter
+    def converage_threshold(self, value):
+        self.convergence_threshold = value
+
+    def fit(self, X, y):
+        """Scikit-learn required: Computes the feature importance scores from the training data.
+        Parameters
+        ----------
+        X: array-like {n_samples, n_features} Training instances to compute the feature importance scores from
+        y: array-like {n_samples}             Training labels
+
+        Returns
+        -------
+        self
+
+        Raises
+        ------
+        ValueError if max_iter is smaller than 1 (no pass would run, so no scores would exist)
+        """
+
+        # The constructor accepts integral floats such as 10.0; range() does not.
+        n_passes = int(self.max_iter)
+        if n_passes < 1:
+            raise ValueError('max_iter must be at least 1 to compute feature importances')
+
+        scores = None          # scores produced by the most recent pass
+        previous_weights = None  # weights that were fed into the most recent weighted pass
+        for _ in range(n_passes):
+            # Each pass works on a pristine copy so the wrapped object is never modified.
+            estimator = copy.deepcopy(self.relief_object)
+
+            if scores is None:
+                # First pass: plain, unweighted fit.
+                estimator.fit(X, y)
+                scores = estimator.feature_importances_
+                continue
+
+            weights = self._scores_to_weights(scores)
+            if previous_weights is not None:
+                # Third pass onwards: move only a fraction beta towards the new weights.
+                weights = self.widrow_hoff(previous_weights, weights, self.beta)
+            estimator.fit(X, y, weights=weights)
+
+            new_scores = estimator.feature_importances_
+            converged = self.has_converged(scores, new_scores)
+            scores = new_scores
+            if converged:
+                break
+            # Private copy: the wrapped fit() received this array and might alter it.
+            previous_weights = np.array(weights, dtype=float)
+
+        #Save final FI as feature_importances_
+        self.feature_importances_ = scores
+
+        if self.rank_absolute:
+            self.top_features_ = np.argsort(np.absolute(self.feature_importances_))[::-1]
+        else:
+            self.top_features_ = np.argsort(self.feature_importances_)[::-1]
+
+        return self
+
+    def _scores_to_weights(self, scores):
+        """Rescale one pass's raw scores into the weights for the next pass.
+
+        With ``rank_absolute`` the magnitudes are divided by the largest
+        magnitude; otherwise ``transform_weights`` is used. An all-zero score
+        vector yields all-zero weights in both modes instead of NaNs from 0/0.
+        The input array is never modified.
+        """
+        if not self.rank_absolute:
+            return self.transform_weights(scores)
+
+        magnitudes = np.absolute(np.asarray(scores, dtype=float))
+        if magnitudes.size == 0:
+            return magnitudes
+        peak = np.max(magnitudes)
+        if peak == 0:
+            return np.zeros_like(magnitudes)
+        return magnitudes / peak
+
+    def widrow_hoff(self, originalw, neww, beta):
+        """Return ``originalw`` moved a fraction ``beta`` of the way towards ``neww``."""
+        diff = neww - originalw
+        return originalw + (beta * diff)
+
+    def has_converged(self, weight1, weight2):
+        """Return True when no component differs by ``convergence_threshold`` or more."""
+        deltas = np.abs(np.asarray(weight1, dtype=float) - np.asarray(weight2, dtype=float))
+        # "not any(>=)" rather than "all(<)" keeps the original loop's treatment
+        # of NaN differences, which never counted as a violation.
+        return not np.any(deltas >= self.convergence_threshold)
+
+    def transform_weights(self, weights):
+        """Map scores to weights: negatives become 0, the rest are divided by the maximum.
+
+        Returns a new float array and leaves ``weights`` untouched. The
+        previous in-place version overwrote the caller's score vector, so the
+        following convergence check compared normalised weights against raw
+        scores. If the maximum is 0 every weight is 0.
+        """
+        values = np.asarray(weights, dtype=float)
+        if values.size == 0:
+            return values.copy()
+        max_val = np.max(values)
+        if max_val == 0:
+            return np.zeros_like(values)
+        # When max_val is negative every entry is negative and is therefore zeroed.
+        return np.where(values < 0, 0.0, values / max_val)
+
+    def check_is_int(self, num):
+        """Return True if ``num`` is a number with no fractional part (e.g. 3 or 3.0)."""
+        try:
+            float(num)
+            return bool(num - int(num) == 0)
+        except (TypeError, ValueError, OverflowError):
+            # Non-numeric input, NaN or infinity.
+            return False
+
+    def check_is_float(self, num):
+        """Return True if ``num`` can be converted with ``float()``."""
+        try:
+            float(num)
+            return True
+        except (TypeError, ValueError, OverflowError):
+            return False
+
+    def transform(self, X):
+        """Keep only the top-ranked columns of ``X``.
+
+        The number of columns kept is ``relief_object.n_features_to_select``;
+        the ranking is ``top_features_`` as computed by ``fit``.
+
+        Raises ValueError when ``X`` has fewer columns than are to be selected.
+        """
+        if X.shape[1] < self.relief_object.n_features_to_select:
+            raise ValueError('Number of features to select is larger than the number of features in the dataset.')
+
+        return X[:, self.top_features_[:self.relief_object.n_features_to_select]]
+
+    def fit_transform(self, X, y):
+        """Run ``fit(X, y)`` and return ``transform(X)``."""
+        self.fit(X, y)
+        return self.transform(X)
diff --git a/analysis_pipeline/skrebatewip/multisurf.py b/analysis_pipeline/skrebatewip/multisurf.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+"""
+scikit-rebate was primarily developed at the University of Pennsylvania by:
+    - Randal S. Olson (rso@randalolson.com)
+    - Pete Schmitt (pschmitt@upenn.edu)
+    - Ryan J. Urbanowicz (ryanurb@upenn.edu)
+    - Weixuan Fu (weixuanf@upenn.edu)
+    - and many more generous open source contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+and associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+from __future__ import print_function
+import numpy as np
+from .surfstar import SURFstar
+from joblib import Parallel, delayed
+from .scoring_utils import MultiSURF_compute_scores
+
+
+class MultiSURF(SURFstar):
+
+    """Feature selection using data-mined expert knowledge.
+
+    Based on the MultiSURF algorithm as introduced in:
+
+    Moore, Jason et al. Multiple Threshold Spatially Uniform ReliefF
+    for the Genetic Analysis of Complex Human Diseases.
+
+    """
+
+############################# MultiSURF ########################################
+    def _find_neighbors(self, inst):
+        """ Return the indices of all instances that lie closer to `inst` than its near threshold.
+        The threshold is the mean distance from `inst` to every other instance minus half the
+        standard deviation of those distances. Endpoint type plays no role here. """
+        others = [j for j in range(self._datalen) if j != inst]
+        # The distance array is always addressed with the larger instance index first.
+        pair_dists = [self._distance_array[max(inst, j)][min(inst, j)] for j in others]
+
+        dist_vect = np.array(pair_dists)
+        half_std = np.std(dist_vect) / 2.
+        near_threshold = np.average(dist_vect) - half_std
+
+        return np.array([j for j, dist in zip(others, pair_dists) if dist < near_threshold])
+
+    def _run_algorithm(self):
+        """ Finds the near neighbors of every instance, scores each instance in parallel and
+        returns the element-wise sum of the per-instance results as the MultiSURF scores. """
+        nan_entries = np.isnan(self._X)
+
+        neighbor_sets = [self._find_neighbors(idx) for idx in range(self._datalen)]
+
+        # The weight vector is appended as one extra positional argument only when weights
+        # were supplied as an ndarray and final-score weighting is switched on.
+        if isinstance(self._weights, np.ndarray) and self.weight_final_scores:
+            trailing = (self._weights,)
+        else:
+            trailing = ()
+
+        jobs = (
+            delayed(MultiSURF_compute_scores)(
+                instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap,
+                NN_near, self._headers, self._class_type, self._X, self._y,
+                self._labels_std, self.data_type, *trailing)
+            for instance_num, NN_near in enumerate(neighbor_sets))
+        per_instance = Parallel(n_jobs=self.n_jobs)(jobs)
+
+        return np.array(np.sum(per_instance, axis=0))
diff --git a/analysis_pipeline/skrebatewip/multisurfstar.py b/analysis_pipeline/skrebatewip/multisurfstar.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+"""
+scikit-rebate was primarily developed at the University of Pennsylvania by:
+    - Randal S. Olson (rso@randalolson.com)
+    - Pete Schmitt (pschmitt@upenn.edu)
+    - Ryan J. Urbanowicz (ryanurb@upenn.edu)
+    - Weixuan Fu (weixuanf@upenn.edu)
+    - and many more generous open source contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+and associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+
+from __future__ import print_function
+import numpy as np
+from .surfstar import SURFstar
+from .scoring_utils import MultiSURFstar_compute_scores
+from joblib import Parallel, delayed
+
+
+class MultiSURFstar(SURFstar):
+
+    """Feature selection using data-mined expert knowledge.
+
+    Based on the MultiSURF* algorithm as introduced in:
+
+    Moore, Jason et al. Multiple Threshold Spatially Uniform ReliefF
+    for the Genetic Analysis of Complex Human Diseases.
+
+    """
+
+############################# MultiSURF* ########################################
+    def _find_neighbors(self, inst):
+        """ Return two index arrays for `inst`: instances nearer than the near threshold and
+        instances farther than the far threshold. The thresholds are the mean distance from
+        `inst` to every other instance minus / plus half the standard deviation of those
+        distances. Endpoint type plays no role here. """
+        others = [j for j in range(self._datalen) if j != inst]
+        # The distance array is always addressed with the larger instance index first.
+        pair_dists = [self._distance_array[max(inst, j)][min(inst, j)] for j in others]
+
+        dist_vect = np.array(pair_dists)
+        center = np.average(dist_vect)
+        half_std = np.std(dist_vect) / 2.
+        near_threshold = center - half_std
+        far_threshold = center + half_std
+
+        near_idx, far_idx = [], []
+        for j, dist in zip(others, pair_dists):
+            if dist < near_threshold:
+                near_idx.append(j)
+            elif dist > far_threshold:
+                far_idx.append(j)
+
+        return np.array(near_idx), np.array(far_idx)
+
+    def _run_algorithm(self):
+        """ Finds the near and far neighbors of every instance, scores each instance in parallel
+        and returns the element-wise sum of the per-instance results as the MultiSURF* scores. """
+        nan_entries = np.isnan(self._X)
+
+        neighbor_pairs = [self._find_neighbors(idx) for idx in range(self._datalen)]
+        near_sets = [pair[0] for pair in neighbor_pairs]
+        far_sets = [pair[1] for pair in neighbor_pairs]
+
+        # The weight vector is appended as one extra positional argument only when weights
+        # were supplied as an ndarray and final-score weighting is switched on.
+        if isinstance(self._weights, np.ndarray) and self.weight_final_scores:
+            trailing = (self._weights,)
+        else:
+            trailing = ()
+
+        jobs = (
+            delayed(MultiSURFstar_compute_scores)(
+                instance_num, self.attr, nan_entries, self._num_attributes, self.mcmap,
+                NN_near, NN_far, self._headers, self._class_type, self._X, self._y,
+                self._labels_std, self.data_type, *trailing)
+            for instance_num, NN_near, NN_far in zip(range(self._datalen), near_sets, far_sets))
+        per_instance = Parallel(n_jobs=self.n_jobs)(jobs)
+
+        return np.array(np.sum(per_instance, axis=0))