
Commit

Merge pull request #9 from joachimwolff/dev
Dev
joachimwolff authored Dec 3, 2020
2 parents a9dd633 + 021d083 commit af057c2
Showing 19 changed files with 689 additions and 495 deletions.
111 changes: 78 additions & 33 deletions azure-pipelines.yml
@@ -1,36 +1,81 @@
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
pr:
autoCancel: true
jobs:

trigger:
- master
- job: 'Linux'
timeoutInMinutes: 0
pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
Python36:
python.version: '3.6'
Python37:
python.version: '3.7'
Python38:
python.version: '3.8'

pool:
vmImage: 'ubuntu-latest'
strategy:
matrix:
Python27:
python.version: '2.7'
Python35:
python.version: '3.5'
Python36:
python.version: '3.6'
Python37:
python.version: '3.7'
steps:
- bash: |
echo "##vso[task.prependpath]$CONDA/bin"
hash -r
displayName: Add conda to PATH
- bash: |
conda config --set always_yes yes --set changeps1 no
conda info -a
conda create -n sparse-neighbors-search --yes -c conda-forge -c bioconda python=$(python.version) --file requirements_linux.txt
source activate sparse-neighbors-search
conda install --yes -c conda-forge -c bioconda pytest flake8 pytest-xdist pytest-forked
conda install --yes -c conda-forge -c bioconda nose
conda install --yes pathlib
conda install --yes -c defaults -c conda-forge -c bioconda configparser
python setup.py install
displayName: installing dependencies
- script: |
source activate sparse-neighbors-search
flake8 sparse_neighbors_search/cluster --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
flake8 sparse_neighbors_search/neighbors --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
displayName: linting
- script: |
source activate sparse-neighbors-search
py.test sparse_neighbors_search/test/
displayName: pytest
- job: 'OSX'
timeoutInMinutes: 0
pool:
vmImage: 'macOS-10.14'
strategy:
matrix:
Python36:
python.version: '3.6'
Python37:
python.version: '3.7'
Python38:
python.version: '3.8'

steps:
- task: UsePythonVersion@0
inputs:
versionSpec: '$(python.version)'
displayName: 'Use Python $(python.version)'

- script: |
python -m pip install --upgrade pip
pip install -r requirements.txt
displayName: 'Install dependencies'

- script: |
pip install pytest pytest-azurepipelines
pytest
displayName: 'pytest'
steps:
- bash: |
echo "##vso[task.prependpath]$CONDA/bin"
hash -r
displayName: Add conda to PATH
- bash: |
conda config --set always_yes yes --set changeps1 no
conda info -a
conda create -n sparse-neighbors-search --yes -c conda-forge -c bioconda python=$(python.version) --file requirements_macos.txt
source activate sparse-neighbors-search
conda install --yes -c conda-forge -c bioconda pytest flake8 pytest-xdist pytest-forked
conda install --yes -c conda-forge -c bioconda nose
conda install --yes pathlib
conda install --yes -c defaults -c conda-forge -c bioconda configparser
python setup.py install
displayName: installing dependencies
- script: |
source activate sparse-neighbors-search
flake8 sparse_neighbors_search/cluster --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
flake8 sparse_neighbors_search/neighbors --exclude=.venv,.build,planemo_test_env,build --ignore=E501,F401,F403,E402,F999,F405,E712,W504
displayName: linting
- script: |
source activate sparse-neighbors-search
py.test sparse_neighbors_search/test/
displayName: pytest
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,5 +3,6 @@ cython
numpy >=1.17
scipy >=1.3
scikit-learn >=0.21
gxx_linux-64
# gxx_linux-64
clangxx_osx-64
openmp
8 changes: 8 additions & 0 deletions requirements_linux.txt
@@ -0,0 +1,8 @@
python >=3.6
cython
numpy >=1.17
scipy >=1.3
scikit-learn >=0.21
gxx_linux-64
openmp
umap-learn
8 changes: 8 additions & 0 deletions requirements_macos.txt
@@ -0,0 +1,8 @@
python >=3.6
cython
numpy >=1.17
scipy >=1.3
scikit-learn >=0.21
clangxx_osx-64
openmp
umap-learn
2 changes: 1 addition & 1 deletion setup.py
@@ -32,7 +32,7 @@
__credits__ = ["Milad Miladi", "Fabrizio Costa"]
__license__ = "MIT"
__date__ = time.strftime("%d/%m/%Y")
__version__ = "0.6"
__version__ = "0.7"

from setuptools import setup, find_packages
import platform
7 changes: 6 additions & 1 deletion sparse_neighbors_search/__init__.py
@@ -20,4 +20,9 @@
# import cluster
from .cluster.minHashSpectralClustering import MinHashSpectralClustering
from .cluster.minHashDBSCAN import MinHashDBSCAN
from .cluster.minHashClustering import MinHashClustering
from .cluster.minHashClustering import MinHashClustering

import logging
# logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
logging.getLogger('numba').setLevel(logging.ERROR)
7 changes: 6 additions & 1 deletion sparse_neighbors_search/cluster/__init__.py
@@ -1,3 +1,8 @@
from .minHashSpectralClustering import MinHashSpectralClustering
from .minHashDBSCAN import MinHashDBSCAN
from .minHashClustering import MinHashClustering
from .minHashClustering import MinHashClustering

import logging
# logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)
logging.getLogger('numba').setLevel(logging.ERROR)
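
Both __init__.py files now call logging.basicConfig(level=logging.INFO) at import time and silence numba, so downstream scripts inherit that configuration as soon as the package is imported. A minimal sketch (not part of this diff, standard-library logging only) of how a caller could raise the threshold again:

import logging

import sparse_neighbors_search  # importing the package now configures the root logger to INFO and silences numba

# Hypothetical downstream override: quiet the package's INFO output again.
logging.getLogger().setLevel(logging.WARNING)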
64 changes: 47 additions & 17 deletions sparse_neighbors_search/cluster/minHashClustering.py
@@ -15,55 +15,85 @@
import numpy as np
from scipy.sparse import vstack
from sklearn.decomposition import PCA
# from scanpy import tl, pp
# from anndata import AnnData
import umap


class MinHashClustering():
def __init__(self, minHashObject, clusteringObject):
self._minHashObject = minHashObject
self._clusteringObject = clusteringObject
self._precomputed_graph = None
def fit(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None):

def fit(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None, pUmap=None, pUmapDict=None):
if pSaveMemory is not None and pSaveMemory > 0:
if pSaveMemory > 1:
pSaveMemory = 1
number_of_elements = X.shape[0]
batch_size = int(np.floor(number_of_elements * pSaveMemory))
if batch_size < 1:
batch_size = 1
self._minHashObject.fit(X[0:batch_size, :])

if y is None:
self._minHashObject.fit(X[0:batch_size, :])
else:
self._minHashObject.fit(X[0:batch_size, :], y[0:batch_size])
if batch_size < number_of_elements:
for i in range(batch_size, X.shape[0], batch_size):
self._minHashObject.partial_fit(X[i:i+batch_size, :])
if y is None:
self._minHashObject.partial_fit(X[i:i + batch_size, :])
else:
self._minHashObject.partial_fit(X[i:i + batch_size, :], y[i:i + batch_size])
else:
self._minHashObject.fit(X)
self._minHashObject.fit(X, y=y)
self._precomputed_graph = self._minHashObject.kneighbors_graph(mode='distance')

if pPca:
pca = PCA(n_components = min(self._precomputed_graph.shape) - 1)
pca = PCA(n_components=min(self._precomputed_graph.shape) - 1)
self._precomputed_graph = pca.fit_transform(self._precomputed_graph.todense())

if pPcaDimensions:
pPcaDimensions = min(pPcaDimensions, self._precomputed_graph.shape[0])
self._clusteringObject.fit(self._precomputed_graph[:, :pPcaDimensions])
return
self._precomputed_graph = self._precomputed_graph[:, :pPcaDimensions]
if pUmap:

if pUmapDict is None:
reducer = umap.UMAP()
else:
reducer = umap.UMAP(n_neighbors=pUmapDict['umap_n_neighbors'], n_components=pUmapDict['umap_n_components'], metric=pUmapDict['umap_metric'],
n_epochs=pUmapDict['umap_n_epochs'],
learning_rate=pUmapDict['umap_learning_rate'], init=pUmapDict['umap_init'], min_dist=pUmapDict['umap_min_dist'], spread=pUmapDict['umap_spread'],
set_op_mix_ratio=pUmapDict['umap_set_op_mix_ratio'], local_connectivity=pUmapDict['umap_local_connectivity'],
repulsion_strength=pUmapDict['umap_repulsion_strength'], negative_sample_rate=pUmapDict['umap_negative_sample_rate'], transform_queue_size=pUmapDict['umap_transform_queue_size'],
a=pUmapDict['umap_a'], b=pUmapDict['umap_b'], angular_rp_forest=pUmapDict['umap_angular_rp_forest'],
target_n_neighbors=pUmapDict['umap_target_n_neighbors'], target_metric=pUmapDict['umap_target_metric'],
target_weight=pUmapDict['umap_target_weight'], random_state=pUmapDict['umap_random'],
force_approximation_algorithm=pUmapDict['umap_force_approximation_algorithm'], verbose=pUmapDict['umap_verbose'], unique=pUmapDict['umap_unique'])
self._precomputed_graph = reducer.fit_transform(self._precomputed_graph)
if pPca or pUmap:
self._clusteringObject.fit(self._precomputed_graph)
return
try:
self._clusteringObject.fit(self._precomputed_graph)
except:
except Exception:
self._clusteringObject.fit(self._precomputed_graph.todense())
return
def fit_predict(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None):

def fit_predict(self, X, y=None, pSaveMemory=None, pPca=None, pPcaDimensions=None, pUmap=None, **pUmapDict):

self.fit(X, y, pSaveMemory=pSaveMemory, pPca=pPca, pPcaDimensions=pPcaDimensions)

return self.predict(self._precomputed_graph, y, pPca=pPca, pPcaDimensions=pPcaDimensions )
def predict(self, X, y=None, pPca=None, pPcaDimensions=None):
return self.predict(self._precomputed_graph, y, pPca=pPca, pPcaDimensions=pPcaDimensions, pUmap=pUmap, pUmapDict=pUmapDict)

def predict(self, X, y=None, pPca=None, pPcaDimensions=None, pUmap=None, pUmapDict=None):
if hasattr(self._clusteringObject, 'labels_'):
return self._clusteringObject.labels_.astype(np.int)
else:
if pPca:
if pPcaDimensions:
pPcaDimensions = min(pPcaDimensions, self._precomputed_graph.shape[0])
return self._clusteringObject.fit(self._precomputed_graph[:, :pPcaDimensions])

return self._clusteringObject.predict(X)
self._clusteringObject.fit(self._precomputed_graph[:, :pPcaDimensions], pUmap, pUmapDict)
elif pUmap:
self._clusteringObject.fit(self._precomputed_graph, pUmap, pUmapDict)

return self._clusteringObject.predict(X)
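
A minimal usage sketch of the extended MinHashClustering interface (not part of this diff): the constructor arguments and fit_predict parameters come from the code above, while the MinHash import path, the KMeans back end, and the toy data are assumptions.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans

from sparse_neighbors_search import MinHash, MinHashClustering  # MinHash export path assumed

# Hypothetical sparse binary data: 100 samples, 500 features.
X = csr_matrix(np.random.binomial(1, 0.1, size=(100, 500)))

minhash = MinHash(n_neighbors=10, number_of_hash_functions=400)
kmeans = KMeans(n_clusters=3)

clustering = MinHashClustering(minHashObject=minhash, clusteringObject=kmeans)
# pSaveMemory builds the MinHash index in batches via fit/partial_fit;
# pPca reduces the precomputed distance graph before the clustering object is fitted.
labels = clustering.fit_predict(X, pSaveMemory=0.5, pPca=True, pPcaDimensions=20)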
42 changes: 22 additions & 20 deletions sparse_neighbors_search/cluster/minHashDBSCAN.py
@@ -16,14 +16,15 @@

import numpy as np


class MinHashDBSCAN():
def __init__(self, eps=0.5, min_samples=5,
algorithm='auto', leaf_size=30, p=None, random_state=None,
fast=False, n_neighbors=5, radius=1.0,
number_of_hash_functions=400,
max_bin_size = 50, minimal_blocks_in_common = 1,
shingle_size = 4, excess_factor = 5,
number_of_cores=None, chunk_size=None):
def __init__(self, eps=0.5, min_samples=5,
algorithm='auto', leaf_size=30, p=None, random_state=None,
fast=False, n_neighbors=5, radius=1.0,
number_of_hash_functions=400,
max_bin_size=50, minimal_blocks_in_common=1,
shingle_size=4, excess_factor=5,
number_of_cores=None, chunk_size=None):

self.eps = eps
self.min_samples = min_samples
@@ -44,20 +45,21 @@ def __init__(self, eps=0.5, min_samples=5,
self.n_neighbors = n_neighbors

self._dbscan = DBSCAN(eps=self.eps, min_samples=min_samples, metric='precomputed',
algorithm=self.algorithm, leaf_size=self.leaf_size, p=self.p)
algorithm=self.algorithm, leaf_size=self.leaf_size, p=self.p)
self.labels_ = None
self._precomputed_graph = None
# only for compatible issues

def fit(self, X, y=None, pSaveMemory=None):
minHashNeighbors = MinHash(n_neighbors = self.n_neighbors,
radius = self.radius, fast = self.fast,
number_of_hash_functions = self.number_of_hash_functions,
max_bin_size = self.max_bin_size,
minimal_blocks_in_common = self.minimal_blocks_in_common,
shingle_size = self.shingle_size,
excess_factor = self.excess_factor,
number_of_cores = self.number_of_cores,
chunk_size = self.chunk_size, similarity=False)
minHashNeighbors = MinHash(n_neighbors=self.n_neighbors,
radius=self.radius, fast=self.fast,
number_of_hash_functions=self.number_of_hash_functions,
max_bin_size=self.max_bin_size,
minimal_blocks_in_common=self.minimal_blocks_in_common,
shingle_size=self.shingle_size,
excess_factor=self.excess_factor,
number_of_cores=self.number_of_cores,
chunk_size=self.chunk_size, similarity=False)

if pSaveMemory is not None and pSaveMemory > 0:
if pSaveMemory > 1:
@@ -69,15 +71,15 @@ def fit(self, X, y=None, pSaveMemory=None):
minHashNeighbors.fit(X[0:batch_size, :])
if batch_size < number_of_elements:
for i in range(batch_size, X.shape[0], batch_size):
minHashNeighbors.partial_fit(X[i:i+batch_size, :])
minHashNeighbors.partial_fit(X[i:i + batch_size, :])
else:
minHashNeighbors.fit(X)


# minHashNeighbors.fit(X, y)
self._precomputed_graph = minHashNeighbors.kneighbors_graph(mode='distance')
self._dbscan.fit(self._precomputed_graph)
self.labels_ = self._dbscan.labels_

def fit_predict(self, X, y=None, pSaveMemory=None):
self.fit(X, y, pSaveMemory=None)
return self.labels_
return self.labels_
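
A similar minimal sketch for the MinHashDBSCAN wrapper (not part of this diff): the constructor parameters are taken from the signature above; the toy data is an assumption.

import numpy as np
from scipy.sparse import csr_matrix

from sparse_neighbors_search import MinHashDBSCAN  # exported by the package __init__ shown above

# Hypothetical sparse binary data: 200 samples, 1000 features.
X = csr_matrix(np.random.binomial(1, 0.1, size=(200, 1000)))

dbscan = MinHashDBSCAN(eps=0.5, min_samples=5, n_neighbors=5, number_of_hash_functions=400)
# fit_predict builds the approximate k-neighbors distance graph with MinHash,
# then runs DBSCAN with metric='precomputed' on it and returns the labels.
labels = dbscan.fit_predict(X)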