diff --git a/.bettercodehub.yml b/.bettercodehub.yml index 5bfde9ec..55771255 100644 --- a/.bettercodehub.yml +++ b/.bettercodehub.yml @@ -4,3 +4,4 @@ languages: exclude: - /divik/core/gin_sklearn_configurables\.py - /divik/core/_gin_bugfix\.py +- /gamred_native diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index e1cd8943..bb2ef8d6 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -12,7 +12,7 @@ on: env: MAJOR: ${{ 2 }} MINOR: ${{ 5 }} - FIXUP: ${{ 10 }} + FIXUP: ${{ 11 }} PACKAGE_INIT_FILE: ${{ 'divik/__init__.py' }} PACKAGE_INIT_FILE_VERSION_LINE: ${{ 1 }} PACKAGE_SETUP_FILE: ${{ 'setup.py' }} diff --git a/README.md b/README.md index 2132b434..3ee7c23f 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ docker pull gmrukwa/divik To install specific version, you can specify it in the command, e.g.: ```bash -docker pull gmrukwa/divik:2.5.10 +docker pull gmrukwa/divik:2.5.11 ``` ## Python package @@ -79,7 +79,7 @@ pip install divik or any stable tagged version, e.g.: ```bash -pip install divik==2.5.10 +pip install divik==2.5.11 ``` If you want to have compatibility with @@ -92,27 +92,26 @@ pip install divik[gin] **Note:** Remember about `\` before `[` and `]` in `zsh` shell. +# High-Volume Data Considerations + +If you are using DiviK to run an analysis that may not fit into the RAM of your +computer, consider disabling the default parallelism and switching to +[dask](https://dask.org/). This is easy to achieve through configuration: + +- set all parameters named `n_jobs` to `1`; +- set all parameters named `allow_dask` to `True`. + +Never set `n_jobs>1` and `allow_dask=True` at the same time: the computations +will freeze due to how `multiprocessing` and `dask` handle parallelism. + # References This software is part of contribution made by [Data Mining Group of Silesian University of Technology](http://www.zaed.polsl.pl/), rest of which is published [here](https://github.com/ZAEDPolSl). -+ [P. Widlak, G. 
Mrukwa, M. Kalinowska, M. Pietrowska, M. Chekan, J. Wierzgon, M. -Gawin, G. Drazek and J. Polanska, "Detection of molecular signatures of oral -squamous cell carcinoma and normal epithelium - application of a novel -methodology for unsupervised segmentation of imaging mass spectrometry data," -Proteomics, vol. 16, no. 11-12, pp. 1613-21, 2016][1] -+ [M. Pietrowska, H. C. Diehl, G. Mrukwa, M. Kalinowska-Herok, M. Gawin, M. -Chekan, J. Elm, G. Drazek, A. Krawczyk, D. Lange, H. E. Meyer, J. Polanska, C. -Henkel, P. Widlak, "Molecular profiles of thyroid cancer subtypes: -Classification based on features of tissue revealed by mass spectrometry -imaging," Biochimica et Biophysica Acta (BBA)-Proteins and Proteomics, 2016][2] -+ [G. Mrukwa, G. Drazek, M. Pietrowska, P. Widlak and J. Polanska, "A Novel -Divisive iK-Means Algorithm with Region-Driven Feature Selection as a Tool for -Automated Detection of Tumour Heterogeneity in MALDI IMS Experiments," in -International Conference on Bioinformatics and Biomedical Engineering, 2016][3] - -[1]: http://onlinelibrary.wiley.com/doi/10.1002/pmic.201500458/pdf -[2]: http://www.sciencedirect.com/science/article/pii/S1570963916302175 -[3]: http://link.springer.com/chapter/10.1007/978-3-319-31744-1_11 ++ [Mrukwa, G. and Polanska, J., 2020. DiviK: Divisive intelligent K-means for +hands-free unsupervised clustering in biological big data. 
*arXiv preprint +arXiv:2009.10706.*][1] + +[1]: https://arxiv.org/abs/2009.10706 diff --git a/divik/__init__.py b/divik/__init__.py index cb94878b..f144a9fd 100644 --- a/divik/__init__.py +++ b/divik/__init__.py @@ -1,4 +1,4 @@ -__version__ = '2.5.10' +__version__ = '2.5.11' from ._summary import plot, reject_split diff --git a/divik/cluster/_kmeans/_core.py b/divik/cluster/_kmeans/_core.py index d154b314..441e6216 100644 --- a/divik/cluster/_kmeans/_core.py +++ b/divik/cluster/_kmeans/_core.py @@ -28,11 +28,13 @@ class Labeling(object): """Labels observations by closest centroids""" - def __init__(self, distance_metric: str): + def __init__(self, distance_metric: str, allow_dask: bool=False): """ @param distance_metric: distance metric for estimation of closest + @param allow_dask: should be False if `multiprocessing.Pool` is spawned """ self.distance_metric = distance_metric + self.allow_dask = allow_dask def __call__(self, data: Data, centroids: Centroids) -> IntLabels: """Find closest centroids @@ -47,7 +49,7 @@ def __call__(self, data: Data, centroids: Centroids) -> IntLabels: logging.error(msg) raise ValueError(msg) - if data.shape[0] > 10000 or data.shape[1] > 1000: + if self.allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000): X1 = da.from_array(data) X2 = da.from_array(centroids) distances = ddst.cdist(X1, X2, self.distance_metric) @@ -59,12 +61,13 @@ def __call__(self, data: Data, centroids: Centroids) -> IntLabels: def redefine_centroids(data: Data, labeling: IntLabels, - label_set: IntLabels) -> Centroids: + label_set: IntLabels, allow_dask: bool=False) -> Centroids: """Recompute centroids in data for given labeling @param data: observations @param labeling: partition of dataset into groups @param label_set: set of labels used for partitioning + @param allow_dask: should be False if `multiprocessing.Pool` is spawned @return: centroids """ if data.shape[0] != labeling.size: @@ -73,7 +76,7 @@ def redefine_centroids(data: Data, labeling: 
IntLabels, f"number of observations: {data.shape[0]}." logging.error(msg) raise ValueError(msg) - if data.shape[0] > 10000 or data.shape[1] > 1000: + if allow_dask and (data.shape[0] > 10000 or data.shape[1] > 1000): X = dd.from_array(data) y = dd.from_array(labeling) centroids = X.groupby(y).mean().compute().values @@ -106,17 +109,20 @@ def _validate_normalizable(data): class _KMeans(SegmentationMethod): """K-means clustering""" def __init__(self, labeling: Labeling, initialize: Initialization, - number_of_iterations: int=100, normalize_rows: bool=False): + number_of_iterations: int=100, normalize_rows: bool=False, + allow_dask: bool = False): """ @param labeling: labeling method @param initialize: initialization method @param number_of_iterations: number of iterations @param normalize_rows: sets mean of row to 0 and norm to 1 + @param allow_dask: should be False if `multiprocessing.Pool` is spawned """ self.labeling = labeling self.initialize = initialize self.number_of_iterations = number_of_iterations self.normalize_rows = normalize_rows + self.allow_dask = allow_dask def _fix_labels(self, data, centroids, labels, n_clusters, retries=10): logging.debug('A label vanished - fixing') @@ -169,7 +175,8 @@ def __call__(self, data: Data, number_of_clusters: int) \ logging.debug('Stability achieved.') break old_labels = labels - centroids = redefine_centroids(data, old_labels, label_set) + centroids = redefine_centroids( + data, old_labels, label_set, self.allow_dask) labels = self.labeling(data, centroids) return labels, centroids @@ -238,6 +245,11 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): normalize_rows : bool, default: False If True, rows are translated to mean of 0.0 and scaled to norm of 1.0. + + allow_dask : bool, default: False + If True, automatically selects dask as computations backend whenever + reasonable. Default `False` since it cannot be used together with + `multiprocessing.Pool` and everywhere `n_jobs` must be set to `1`. 
Attributes ---------- @@ -253,7 +265,8 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): def __init__(self, n_clusters: int, distance: str = 'euclidean', init: str = 'percentile', percentile: float = 95., leaf_size : Union[int, float] = 0.01, - max_iter: int = 100, normalize_rows: bool = False): + max_iter: int = 100, normalize_rows: bool = False, + allow_dask: bool = False): super().__init__() self.n_clusters = n_clusters self.distance = distance @@ -262,6 +275,7 @@ def __init__(self, n_clusters: int, distance: str = 'euclidean', self.leaf_size = leaf_size self.max_iter = max_iter self.normalize_rows = normalize_rows + self.allow_dask = allow_dask def fit(self, X, y=None): """Compute k-means clustering. @@ -280,10 +294,11 @@ def fit(self, X, y=None): initialize = _parse_initialization( self.init, self.distance, self.percentile, self.leaf_size) kmeans = _KMeans( - labeling=Labeling(self.distance), + labeling=Labeling(self.distance, allow_dask=self.allow_dask), initialize=initialize, number_of_iterations=self.max_iter, - normalize_rows=self.normalize_rows + normalize_rows=self.normalize_rows, + allow_dask=self.allow_dask, ) X = np.asanyarray(X) self.labels_, self.cluster_centers_ = kmeans( diff --git a/docs/instructions/installation.rst b/docs/instructions/installation.rst index 7d2ba331..49514156 100644 --- a/docs/instructions/installation.rst +++ b/docs/instructions/installation.rst @@ -14,7 +14,7 @@ To install latest stable version use:: To install specific version, you can specify it in the command, e.g.:: - docker pull gmrukwa/divik:2.5.10 + docker pull gmrukwa/divik:2.5.11 Python package -------------- @@ -31,7 +31,7 @@ package:: or any stable tagged version, e.g.:: - pip install divik==2.5.10 + pip install divik==2.5.11 If you want to have compatibility with `gin-config `_, you can install diff --git a/requirements-base.txt b/requirements-base.txt index 5ef60aa3..c9411d6d 100644 --- a/requirements-base.txt +++ b/requirements-base.txt @@ -14,6 
+14,7 @@ networkx numpy pandas parameterized +polyaxon==1.1.9 pylint scikit-image scikit-learn diff --git a/requirements.txt b/requirements.txt index 5a14b079..36d56b46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,27 @@ absl-py==0.9.0 astroid==2.3.3 attrs==19.3.0 +cachetools==4.1.1 +certifi==2020.6.20 +chardet==3.0.4 Click==7.0 +click-completion==0.5.2 cycler==0.10.0 dash==0.34.0 dash-core-components==0.42.0 dash-html-components==0.13.4 dash-renderer==0.17.0 dash-table==3.1.11 +dask==2.14.0 dask-distance==0.2.0 -dask[dataframe]==2.14.0 decorator==4.4.1 Flask==1.1.1 Flask-Compress==1.4.0 +fsspec==0.8.4 gin-config==0.3.0 +google-auth==1.22.1 h5py==2.8.0 +idna==2.10 imageio==2.6.1 importlib-metadata==1.3.0 isort==4.3.21 @@ -23,20 +30,31 @@ Jinja2==2.10.3 joblib==0.14.1 kiwisolver==1.1.0 kneed==0.5.1 +kubernetes==12.0.0 lazy-object-proxy==1.4.3 +locket==0.2.0 MarkupSafe==1.1.1 +marshmallow==3.7.1 matplotlib==3.1.2 mccabe==0.6.1 more-itertools==8.0.2 networkx==2.4 numpy==1.18.0 +nvidia-ml-py3==7.352.0 +oauthlib==3.1.0 packaging==19.2 pandas==0.25.3 parameterized==0.7.1 +partd==1.1.0 Pillow==6.2.1 plotly==4.4.1 pluggy==0.13.1 +polyaxon==1.1.9 +polyaxon-sdk==1.1.9 +psutil==5.7.2 py==1.8.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 pybind11==2.4.3 pylint==2.4.4 pyparsing==2.4.5 @@ -44,14 +62,26 @@ pytest==5.3.2 python-dateutil==2.8.1 pytz==2019.3 PyWavelets==1.1.1 +PyYAML==5.3.1 +requests==2.24.0 +requests-oauthlib==1.3.0 +requests-toolbelt==0.9.1 retrying==1.3.3 +rsa==4.6 scikit-image==0.16.2 scikit-learn==0.22 scipy==1.4.1 +sentry-sdk==0.19.1 +shellingham==1.3.2 six==1.13.0 +tabulate==0.8.7 +toolz==0.11.1 tqdm==4.41.0 typed-ast==1.4.0 +ujson==4.0.1 +urllib3==1.25.11 wcwidth==0.1.7 +websocket-client==0.57.0 Werkzeug==0.16.0 wrapt==1.11.2 zipp==0.6.0 diff --git a/setup.py b/setup.py index c1c06c35..b5d62aaa 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ import sys import numpy -__version__ = '2.5.10' +__version__ = '2.5.11' LINUX_OPTS = { 
'extra_link_args': [ @@ -108,10 +108,18 @@ 'numpy>=0.12.1', ], extras_require={ + 'all': [ + 'absl-py', + 'gin-config', + 'polyaxon', + ], 'gin': [ "absl-py", "gin-config", ], + 'polyaxon': [ + "polyaxon", + ], }, python_requires='>=3.6', package_data={