
Commit

Merge pull request #9 from joachimwolff/dev
Dev
joachimwolff authored Dec 3, 2020
2 parents a9dd633 + 021d083 commit af057c2
Showing 19 changed files with 689 additions and 495 deletions.
111 changes: 78 additions & 33 deletions azure-pipelines.yml
@@ -1,36 +1,81 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
pr:
autoCancel: true
jobs:

trigger:
- master
- job: 'Linux'
timeoutInMinutes: 0
pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
Python36:
python.version: '3.6'
Python37:
python.version: '3.7'
Python38:
python.version: '3.8'

pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
Python27:
python.version: '2.7'
Python35:
python.version: '3.5'
Python36:
python.version: '3.6'
Python37:
python.version: '3.7'
steps:
- bash: |
echo "##vso[task.prependpath]$CONDA/bin"
hash -r
displayName: Add conda to PATH
- bash: |
conda config --set always_yes yes --set changeps1 no
conda info -a
conda create -n sparse-neighbors-search --yes -c conda-forge -c bioconda python=$(python.version) --file requirements_linux.txt
source activate sparse-neighbors-search
conda install --yes -c conda-forge -c bioconda pytest flake8 pytest-xdist pytest-forked
conda install --yes -c conda-forge -c bioconda nose
conda install --yes pathlib
conda install --yes -c defaults -c conda-forge -c bioconda configparser
python setup.py install
displayName: installing dependencies
- script: |
source activate sparse-neighbors-search
flake8 sparse_neighbors_search/cluster --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
flake8 sparse_neighbors_search/neighbors --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
displayName: linting
- script: |
source activate sparse-neighbors-search
py.test sparse_neighbors_search/test/
displayName: pytest
- job: 'OSX'
timeoutInMinutes: 0
pool:
vmImage: 'macOS-10.14'
strategy:
matrix:
Python36:
python.version: '3.6'
Python37:
python.version: '3.7'
Python38:
python.version: '3.8'

steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
displayName: 'Use Python $(python.version)'

- script: |
python -m pip install --upgrade pip
pip install -r requirements.txt
displayName: 'Install dependencies'

- script: |
pip install pytest pytest-azurepipelines
pytest
displayName: 'pytest'
steps:
- bash: |
echo "##vso[task.prependpath]$CONDA/bin"
hash -r
displayName: Add conda to PATH
- bash: |
conda config --set always_yes yes --set changeps1 no
conda info -a
conda create -n sparse-neighbors-search --yes -c conda-forge -c bioconda python=$(python.version) --file requirements_macos.txt
source activate sparse-neighbors-search
conda install --yes -c conda-forge -c bioconda pytest flake8 pytest-xdist pytest-forked
conda install --yes -c conda-forge -c bioconda nose
conda install --yes pathlib
conda install --yes -c defaults -c conda-forge -c bioconda configparser
python setup.py install
displayName: installing dependencies
- script: |
source activate sparse-neighbors-search
flake8 sparse_neighbors_search/cluster --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
flake8 sparse_neighbors_search/neighbors --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
displayName: linting
- script: |
source activate sparse-neighbors-search
py.test sparse_neighbors_search/test/
displayName: pytest
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,5 +3,6 @@ cython
numpy >=1.17
scipy >=1.3
scikit-learn >=0.21
gxx_linux-64
# gxx_linux-64
clangxx_osx-64
openmp
8 changes: 8 additions & 0 deletions requirements_linux.txt
@@ -0,0 +1,8 @@
python >=3.6
cython
numpy >=1.17
scipy >=1.3
scikit-learn >=0.21
gxx_linux-64
openmp
umap-learn
8 changes: 8 additions & 0 deletions requirements_macos.txt
@@ -0,0 +1,8 @@
python >=3.6
cython
numpy >=1.17
scipy >=1.3
scikit-learn >=0.21
clangxx_osx-64
openmp
umap-learn
2 changes: 1 addition & 1 deletion setup.py
@@ -32,7 +32,7 @@
__credits__ = ["Milad Miladi", "Fabrizio Costa"]
__license__ = "MIT"
__date__ = time.strftime("%d/%m/%Y")
__version__ = "0.6"
__version__ = "0.7"

from setuptools import setup, find_packages
import platform
7 changes: 6 additions & 1 deletion sparse_neighbors_search/__init__.py
@@ -20,4 +20,9 @@
# import cluster
from .cluster.minHashSpectralClustering import MinHashSpectralClustering
from .cluster.minHashDBSCAN import MinHashDBSCAN
from .cluster.minHashClustering import MinHashClustering
from .cluster.minHashClustering import MinHashClustering

import logging
# logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
logging.getLogger('numba').setLevel(logging.ERROR)
7 changes: 6 additions & 1 deletion sparse_neighbors_search/cluster/__init__.py
@@ -1,3 +1,8 @@
from .minHashSpectralClustering import MinHashSpectralClustering
from .minHashDBSCAN import MinHashDBSCAN
from .minHashClustering import MinHashClustering
from .minHashClustering import MinHashClustering

import logging
# logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
logging.getLogger('numba').setLevel(logging.ERROR)
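
Both __init__.py files now call logging.basicConfig(level=logging.INFO) at import time and silence numba, so downstream scripts inherit that configuration as soon as the package is imported. A minimal sketch (not part of this diff, standard-library logging only) of how a caller could raise the threshold again:

import logging

import sparse_neighbors_search  # importing the package now configures the root logger to INFO and silences numba

# Hypothetical downstream override: quiet the package's INFO output again.
logging.getLogger().setLevel(logging.WARNING)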
64 changes: 47 additions & 17 deletions sparse_neighbors_search/cluster/minHashClustering.py
@@ -15,55 +15,85 @@
import numpy as np
from scipy.sparse import vstack
from sklearn.decomposition import PCA
# from scanpy import tl, pp
# from anndata import AnnData
import umap


class MinHashClustering():
def __init__(self, minHashObject, clusteringObject):
self._minHashObject = minHashObject
self._clusteringObject = clusteringObject
self._precomputed_graph = None
def fit(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None):

def fit(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None, pUmap=None, pUmapDict=None):
if pSaveMemory is not None and pSaveMemory > 0:
if pSaveMemory > 1:
pSaveMemory = 1
number_of_elements = X.shape[0]
batch_size = int(np.floor(number_of_elements * pSaveMemory))
if batch_size < 1:
batch_size = 1
self._minHashObject.fit(X[0:batch_size, :])

if y is None:
self._minHashObject.fit(X[0:batch_size, :])
else:
self._minHashObject.fit(X[0:batch_size, :], y[0:batch_size])
if batch_size < number_of_elements:
for i in range(batch_size, X.shape[0], batch_size):
self._minHashObject.partial_fit(X[i:i+batch_size, :])
if y is None:
self._minHashObject.partial_fit(X[i:i + batch_size, :])
else:
self._minHashObject.partial_fit(X[i:i + batch_size, :], y[i:i + batch_size])
else:
self._minHashObject.fit(X)
self._minHashObject.fit(X, y=y)
self._precomputed_graph = self._minHashObject.kneighbors_graph(mode='distance')

if pPca:
pca = PCA(n_components = min(self._precomputed_graph.shape) - 1)
pca = PCA(n_components=min(self._precomputed_graph.shape) - 1)
self._precomputed_graph = pca.fit_transform(self._precomputed_graph.todense())

if pPcaDimensions:
pPcaDimensions = min(pPcaDimensions, self._precomputed_graph.shape[0])
self._clusteringObject.fit(self._precomputed_graph[:, :pPcaDimensions])
return
self._precomputed_graph = self._precomputed_graph[:, :pPcaDimensions]
if pUmap:

if pUmapDict is None:
reducer = umap.UMAP()
else:
reducer = umap.UMAP(n_neighbors=pUmapDict['umap_n_neighbors'], n_components=pUmapDict['umap_n_components'], metric=pUmapDict['umap_metric'],
n_epochs=pUmapDict['umap_n_epochs'],
learning_rate=pUmapDict['umap_learning_rate'], init=pUmapDict['umap_init'], min_dist=pUmapDict['umap_min_dist'], spread=pUmapDict['umap_spread'],
set_op_mix_ratio=pUmapDict['umap_set_op_mix_ratio'], local_connectivity=pUmapDict['umap_local_connectivity'],
repulsion_strength=pUmapDict['umap_repulsion_strength'], negative_sample_rate=pUmapDict['umap_negative_sample_rate'], transform_queue_size=pUmapDict['umap_transform_queue_size'],
a=pUmapDict['umap_a'], b=pUmapDict['umap_b'], angular_rp_forest=pUmapDict['umap_angular_rp_forest'],
target_n_neighbors=pUmapDict['umap_target_n_neighbors'], target_metric=pUmapDict['umap_target_metric'],
target_weight=pUmapDict['umap_target_weight'], random_state=pUmapDict['umap_random'],
force_approximation_algorithm=pUmapDict['umap_force_approximation_algorithm'], verbose=pUmapDict['umap_verbose'], unique=pUmapDict['umap_unique'])
self._precomputed_graph = reducer.fit_transform(self._precomputed_graph)
if pPca or pUmap:
self._clusteringObject.fit(self._precomputed_graph)
return
try:
self._clusteringObject.fit(self._precomputed_graph)
except:
except Exception:
self._clusteringObject.fit(self._precomputed_graph.todense())
return
def fit_predict(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None):

def fit_predict(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None, pUmap=None, **pUmapDict):

self.fit(X, y, pSaveMemory=pSaveMemory, pPca=pPca, pPcaDimensions=pPcaDimensions)

return self.predict(self._precomputed_graph, y, pPca=pPca, pPcaDimensions=pPcaDimensions )
def predict(self, X, y=None, pPca=None, pPcaDimensions=None):
return self.predict(self._precomputed_graph, y, pPca=pPca, pPcaDimensions=pPcaDimensions, pUmap=pUmap, pUmapDict=pUmapDict)

def predict(self, X, y=None, pPca=None, pPcaDimensions=None, pUmap=None, pUmapDict=None):
if hasattr(self._clusteringObject, 'labels_'):
return self._clusteringObject.labels_.astype(np.int)
else:
if pPca:
if pPcaDimensions:
pPcaDimensions = min(pPcaDimensions, self._precomputed_graph.shape[0])
return self._clusteringObject.fit(self._precomputed_graph[:, :pPcaDimensions])

return self._clusteringObject.predict(X)
self._clusteringObject.fit(self._precomputed_graph[:, :pPcaDimensions], pUmap, pUmapDict)
elif pUmap:
self._clusteringObject.fit(self._precomputed_graph, pUmap, pUmapDict)

return self._clusteringObject.predict(X)
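
A minimal usage sketch of the extended MinHashClustering interface (not part of this diff): the constructor arguments and fit_predict parameters come from the code above, while the MinHash import path, the KMeans back end, and the toy data are assumptions.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans

from sparse_neighbors_search import MinHash, MinHashClustering  # MinHash export path assumed

# Hypothetical sparse binary data: 100 samples, 500 features.
X = csr_matrix(np.random.binomial(1, 0.1, size=(100, 500)))

minhash = MinHash(n_neighbors=10, number_of_hash_functions=400)
kmeans = KMeans(n_clusters=3)

clustering = MinHashClustering(minHashObject=minhash, clusteringObject=kmeans)
# pSaveMemory builds the MinHash index in batches via fit/partial_fit;
# pPca reduces the precomputed distance graph before the clustering object is fitted.
labels = clustering.fit_predict(X, pSaveMemory=0.5, pPca=True, pPcaDimensions=20)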
42 changes: 22 additions & 20 deletions sparse_neighbors_search/cluster/minHashDBSCAN.py
@@ -16,14 +16,15 @@

import numpy as np


class MinHashDBSCAN():
def __init__(self, eps=0.5, min_samples=5,
algorithm='auto', leaf_size=30, p=None, random_state=None,
fast=False, n_neighbors=5, radius=1.0,
number_of_hash_functions=400,
max_bin_size = 50, minimal_blocks_in_common = 1,
shingle_size = 4, excess_factor = 5,
number_of_cores=None, chunk_size=None):
def __init__(self, eps=0.5, min_samples=5,
algorithm='auto', leaf_size=30, p=None, random_state=None,
fast=False, n_neighbors=5, radius=1.0,
number_of_hash_functions=400,
max_bin_size=50, minimal_blocks_in_common=1,
shingle_size=4, excess_factor=5,
number_of_cores=None, chunk_size=None):

self.eps = eps
self.min_samples = min_samples
@@ -44,20 +45,21 @@ def __init__(self, eps=0.5, min_samples=5,
self.n_neighbors = n_neighbors

self._dbscan = DBSCAN(eps=self.eps, min_samples=min_samples, metric='precomputed',
algorithm=self.algorithm, leaf_size=self.leaf_size, p=self.p)
algorithm=self.algorithm, leaf_size=self.leaf_size, p=self.p)
self.labels_ = None
self._precomputed_graph = None
# only for compatible issues

def fit(self, X, y=None, pSaveMemory=None):
minHashNeighbors = MinHash(n_neighbors = self.n_neighbors,
radius = self.radius, fast = self.fast,
number_of_hash_functions = self.number_of_hash_functions,
max_bin_size = self.max_bin_size,
minimal_blocks_in_common = self.minimal_blocks_in_common,
shingle_size = self.shingle_size,
excess_factor = self.excess_factor,
number_of_cores = self.number_of_cores,
chunk_size = self.chunk_size, similarity=False)
minHashNeighbors = MinHash(n_neighbors=self.n_neighbors,
radius=self.radius, fast=self.fast,
number_of_hash_functions=self.number_of_hash_functions,
max_bin_size=self.max_bin_size,
minimal_blocks_in_common=self.minimal_blocks_in_common,
shingle_size=self.shingle_size,
excess_factor=self.excess_factor,
number_of_cores=self.number_of_cores,
chunk_size=self.chunk_size, similarity=False)

if pSaveMemory is not None and pSaveMemory > 0:
if pSaveMemory > 1:
@@ -69,15 +71,15 @@ def fit(self, X, y=None, pSaveMemory=None):
minHashNeighbors.fit(X[0:batch_size, :])
if batch_size < number_of_elements:
for i in range(batch_size, X.shape[0], batch_size):
minHashNeighbors.partial_fit(X[i:i+batch_size, :])
minHashNeighbors.partial_fit(X[i:i + batch_size, :])
else:
minHashNeighbors.fit(X)


# minHashNeighbors.fit(X, y)
self._precomputed_graph = minHashNeighbors.kneighbors_graph(mode='distance')
self._dbscan.fit(self._precomputed_graph)
self.labels_ = self._dbscan.labels_

def fit_predict(self, X, y=None, pSaveMemory=None):
self.fit(X, y, pSaveMemory=None)
return self.labels_
return self.labels_
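
A similar minimal sketch for the MinHashDBSCAN wrapper (not part of this diff): the constructor parameters are taken from the signature above; the toy data is an assumption.

import numpy as np
from scipy.sparse import csr_matrix

from sparse_neighbors_search import MinHashDBSCAN  # exported by the package __init__ shown above

# Hypothetical sparse binary data: 200 samples, 1000 features.
X = csr_matrix(np.random.binomial(1, 0.1, size=(200, 1000)))

dbscan = MinHashDBSCAN(eps=0.5, min_samples=5, n_neighbors=5, number_of_hash_functions=400)
# fit_predict builds the approximate k-neighbors distance graph with MinHash,
# then runs DBSCAN with metric='precomputed' on it and returns the labels.
labels = dbscan.fit_predict(X)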