From f480264dde8a4e22b0aa3879b330c9eb7b2416a3 Mon Sep 17 00:00:00 2001 From: ktpolanski Date: Thu, 10 Jun 2021 10:42:28 +0100 Subject: [PATCH 1/3] update bbknn arguments and docstring --- scanpy/external/pp/_bbknn.py | 130 ++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 48 deletions(-) diff --git a/scanpy/external/pp/_bbknn.py b/scanpy/external/pp/_bbknn.py index 2e78e0b05f..42158404f1 100644 --- a/scanpy/external/pp/_bbknn.py +++ b/scanpy/external/pp/_bbknn.py @@ -1,4 +1,5 @@ from typing import Union, Optional +import types from anndata import AnnData import sklearn @@ -14,13 +15,20 @@ def bbknn( adata: AnnData, batch_key: str = 'batch', + use_rep: str = 'X_pca', approx: bool = True, - metric: Union[str, 'sklearn.neighbors.DistanceMetric'] = 'angular', + use_annoy: bool = True, + metric: Union[ + str, 'types.FunctionType', 'sklearn.neighbors.DistanceMetric' + ] = 'euclidean', copy: bool = False, *, + neighbors_within_batch: int = 3, n_pcs: int = 50, trim: Optional[int] = None, - n_trees: int = 10, + annoy_n_trees: int = 10, + pynndescent_n_neighbors: int = 30, + pynndescent_random_state: int = 0, use_faiss: bool = True, set_op_mix_ratio: float = 1.0, local_connectivity: int = 1, @@ -29,13 +37,12 @@ def bbknn( """\ Batch balanced kNN [Polanski19]_. - Batch balanced kNN alters the kNN procedure to identify each - cell's top neighbours in each batch separately instead of the - entire cell pool with no accounting for batch. Aligns batches in a - quick and lightweight manner. + Batch balanced kNN alters the kNN procedure to identify each cell's top neighbours in + each batch separately instead of the entire cell pool with no accounting for batch. + The nearest neighbours for each batch are then merged to create a final list of + neighbours for the cell. Aligns batches in a quick and lightweight manner. - For use in the scanpy workflow as an alternative to - :func:`~scanpy.pp.neighbors`. 
+ For use in the scanpy workflow as an alternative to :func:`~scanpy.pp.neighbors`. .. note:: @@ -45,57 +52,79 @@ def bbknn( Params ------ adata - Needs the PCA computed and stored in `adata.obsm["X_pca"]`. + Needs the PCA computed and stored in ``adata.obsm["X_pca"]``. batch_key - `adata.obs` column name discriminating between your batches. + ``adata.obs`` column name discriminating between your batches. + use_rep + The dimensionality reduction in ``.obsm`` to use for neighbour detection. Defaults to PCA. approx - If `True`, use annoy's approximate neighbour finding. - This results in a quicker run time for large datasets while also - potentially increasing the degree of batch correction. + If ``True``, use approximate neighbour finding - annoy or pyNNDescent. This results + in a quicker run time for large datasets while also potentially increasing the degree of + batch correction. + use_annoy + Only used when ``approx=True``. If ``True``, will use annoy for neighbour finding. If + ``False``, will use pyNNDescent instead. metric - What distance metric to use. If using `approx=True`, the options are - `'angular'`, `'euclidean'`, `'manhattan'`, and `'hamming'`. - Otherwise, the options are `"euclidean"`, - an element of :class:`sklearn.neighbors.KDTree`’s `valid_metrics`, - or parameterised :class:`sklearn.neighbors.DistanceMetric` objects: - - >>> from sklearn import neighbors - >>> neighbors.KDTree.valid_metrics + What distance metric to use. The options depend on the choice of neighbour algorithm. + + "euclidean", the default, is always available. + + Annoy supports "angular", "manhattan" and "hamming". + + PyNNDescent supports metrics listed in ``pynndescent.distances.named_distances`` + and custom functions, including compiled Numba code. 
+ + >>> pynndescent.distances.named_distances.keys() + dict_keys(['euclidean', 'l2', 'sqeuclidean', 'manhattan', 'taxicab', 'l1', 'chebyshev', 'linfinity', + 'linfty', 'linf', 'minkowski', 'seuclidean', 'standardised_euclidean', 'wminkowski', 'weighted_minkowski', + 'mahalanobis', 'canberra', 'cosine', 'dot', 'correlation', 'hellinger', 'haversine', 'braycurtis', 'spearmanr', + 'kantorovich', 'wasserstein', 'tsss', 'true_angular', 'hamming', 'jaccard', 'dice', 'matching', 'kulsinski', + 'rogerstanimoto', 'russellrao', 'sokalsneath', 'sokalmichener', 'yule']) + + KDTree supports members of the ``sklearn.neighbors.KDTree.valid_metrics`` list, or parameterised + ``sklearn.neighbors.DistanceMetric`` `objects + <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html>`_: + + >>> sklearn.neighbors.KDTree.valid_metrics ['p', 'chebyshev', 'cityblock', 'minkowski', 'infinity', 'l2', 'euclidean', 'manhattan', 'l1'] - >>> pass_this_as_metric = neighbors.DistanceMetric.get_metric('minkowski',p=3) copy - If `True`, return a copy instead of writing to the supplied adata. + If ``True``, return a copy instead of writing to the supplied adata. neighbors_within_batch - How many top neighbours to report for each batch; total number of neighbours - will be this number times the number of batches. + How many top neighbours to report for each batch; total number of neighbours in + the initial k-nearest-neighbours computation will be this number times the number + of batches. This then serves as the basis for the construction of a symmetrical + matrix of connectivities. n_pcs - How many principal components to use in the analysis. + How many dimensions (in case of PCA, principal components) to use in the analysis. trim - Trim the neighbours of each cell to these many top connectivities. - May help with population independence and improve the tidiness of clustering. - The lower the value the more independent the individual populations, - at the cost of more conserved batch effect. 
If `None`, - sets the parameter value automatically to 10 times the total number of - neighbours for each cell. Set to 0 to skip. - n_trees - Only used when `approx=True`. - The number of trees to construct in the annoy forest. - More trees give higher precision when querying, + Trim the neighbours of each cell to these many top connectivities. May help with + population independence and improve the tidiness of clustering. The lower the value the + more independent the individual populations, at the cost of more conserved batch effect. + If ``None``, sets the parameter value automatically to 10 times ``neighbors_within_batch`` + times the number of batches. Set to 0 to skip. + annoy_n_trees + Only used with annoy neighbour identification. The number of trees to construct in the + annoy forest. More trees give higher precision when querying, at the cost of increased + run time and resource intensity. + pynndescent_n_neighbors + Only used with pyNNDescent neighbour identification. The number of neighbours to include + in the approximate neighbour graph. More neighbours give higher precision when querying, at the cost of increased run time and resource intensity. + pynndescent_random_state + Only used with pyNNDescent neighbour identification. The RNG seed to use when creating + the graph. use_faiss - If `approx=False` and the metric is `"euclidean"`, - use the `faiss` package to compute nearest neighbours if installed. - This improves performance at a minor cost to numerical - precision as `faiss` operates on 32 bit floats. + If ``approx=False`` and the metric is "euclidean", use the faiss package to compute + nearest neighbours if installed. This improves performance at a minor cost to numerical + precision as faiss operates on float32. 
set_op_mix_ratio - UMAP connectivity computation parameter, float between 0 and 1, - controlling the blend between a connectivity matrix formed exclusively - from mutual nearest neighbour pairs (0) and a union of all observed - neighbour relationships with the mutual pairs emphasised (1) + UMAP connectivity computation parameter, float between 0 and 1, controlling the + blend between a connectivity matrix formed exclusively from mutual nearest neighbour + pairs (0) and a union of all observed neighbour relationships with the mutual pairs + emphasised (1) local_connectivity - UMAP connectivity computation parameter, - how many nearest neighbors per cell are assumed to be fully connected - (and given a connectivity value of 1) + UMAP connectivity computation parameter, how many nearest neighbors of each cell + are assumed to be fully connected (and given a connectivity value of 1) Returns ------- @@ -108,12 +137,17 @@ def bbknn( return bbknn( adata=adata, batch_key=batch_key, + use_rep=use_rep, approx=approx, + use_annoy=use_annoy, metric=metric, copy=copy, + neighbors_within_batch=neighbors_within_batch, n_pcs=n_pcs, trim=trim, - n_trees=n_trees, + annoy_n_trees=annoy_n_trees, + pynndescent_n_neighbors=pynndescent_n_neighbors, + pynndescent_random_state=pynndescent_random_state, use_faiss=use_faiss, set_op_mix_ratio=set_op_mix_ratio, local_connectivity=local_connectivity, From ecd68cb2e6c4cb3c44589217322bd95ca9ab7d69 Mon Sep 17 00:00:00 2001 From: ktpolanski Date: Tue, 15 Jun 2021 12:19:34 +0200 Subject: [PATCH 2/3] revert to single tick --- scanpy/external/pp/_bbknn.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scanpy/external/pp/_bbknn.py b/scanpy/external/pp/_bbknn.py index 42158404f1..388d1ab688 100644 --- a/scanpy/external/pp/_bbknn.py +++ b/scanpy/external/pp/_bbknn.py @@ -52,18 +52,18 @@ def bbknn( Params ------ adata - Needs the PCA computed and stored in ``adata.obsm["X_pca"]``. 
+ Needs the PCA computed and stored in `adata.obsm["X_pca"]`. batch_key - ``adata.obs`` column name discriminating between your batches. + `adata.obs` column name discriminating between your batches. use_rep - The dimensionality reduction in ``.obsm`` to use for neighbour detection. Defaults to PCA. + The dimensionality reduction in `.obsm` to use for neighbour detection. Defaults to PCA. approx - If ``True``, use approximate neighbour finding - annoy or pyNNDescent. This results + If `True`, use approximate neighbour finding - annoy or pyNNDescent. This results in a quicker run time for large datasets while also potentially increasing the degree of batch correction. use_annoy - Only used when ``approx=True``. If ``True``, will use annoy for neighbour finding. If - ``False``, will use pyNNDescent instead. + Only used when `approx=True`. If `True`, will use annoy for neighbour finding. If + `False`, will use pyNNDescent instead. metric What distance metric to use. The options depend on the choice of neighbour algorithm. @@ -71,7 +71,7 @@ def bbknn( Annoy supports "angular", "manhattan" and "hamming". - PyNNDescent supports metrics listed in ``pynndescent.distances.named_distances`` + PyNNDescent supports metrics listed in `pynndescent.distances.named_distances` and custom functions, including compiled Numba code. 
>>> pynndescent.distances.named_distances.keys() @@ -81,14 +81,14 @@ def bbknn( 'kantorovich', 'wasserstein', 'tsss', 'true_angular', 'hamming', 'jaccard', 'dice', 'matching', 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalsneath', 'sokalmichener', 'yule']) - KDTree supports members of the ``sklearn.neighbors.KDTree.valid_metrics`` list, or parameterised - ``sklearn.neighbors.DistanceMetric`` `objects + KDTree supports members of the `sklearn.neighbors.KDTree.valid_metrics` list, or parameterised + `sklearn.neighbors.DistanceMetric` `objects <https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html>`_: >>> sklearn.neighbors.KDTree.valid_metrics ['p', 'chebyshev', 'cityblock', 'minkowski', 'infinity', 'l2', 'euclidean', 'manhattan', 'l1'] copy - If ``True``, return a copy instead of writing to the supplied adata. + If `True`, return a copy instead of writing to the supplied adata. neighbors_within_batch How many top neighbours to report for each batch; total number of neighbours in the initial k-nearest-neighbours computation will be this number times the number @@ -100,7 +100,7 @@ def bbknn( Trim the neighbours of each cell to these many top connectivities. May help with population independence and improve the tidiness of clustering. The lower the value the more independent the individual populations, at the cost of more conserved batch effect. - If ``None``, sets the parameter value automatically to 10 times ``neighbors_within_batch`` + If `None`, sets the parameter value automatically to 10 times `neighbors_within_batch` times the number of batches. Set to 0 to skip. annoy_n_trees Only used with annoy neighbour identification. The number of trees to construct in the @@ -114,7 +114,7 @@ def bbknn( Only used with pyNNDescent neighbour identification. The RNG seed to use when creating the graph. use_faiss - If ``approx=False`` and the metric is "euclidean", use the faiss package to compute + If `approx=False` and the metric is "euclidean", use the faiss package to compute nearest neighbours if installed. 
This improves performance at a minor cost to numerical precision as faiss operates on float32. set_op_mix_ratio From 75266636c05497b8e16aac73f39b34def5bbd4ab Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 16 Jun 2021 20:27:08 +1000 Subject: [PATCH 3/3] types.FunctionType -> typing.Callable --- scanpy/external/pp/_bbknn.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scanpy/external/pp/_bbknn.py b/scanpy/external/pp/_bbknn.py index 388d1ab688..891124c61e 100644 --- a/scanpy/external/pp/_bbknn.py +++ b/scanpy/external/pp/_bbknn.py @@ -1,5 +1,4 @@ -from typing import Union, Optional -import types +from typing import Union, Optional, Callable from anndata import AnnData import sklearn @@ -18,9 +17,7 @@ def bbknn( use_rep: str = 'X_pca', approx: bool = True, use_annoy: bool = True, - metric: Union[ - str, 'types.FunctionType', 'sklearn.neighbors.DistanceMetric' - ] = 'euclidean', + metric: Union[str, Callable, 'sklearn.neighbors.DistanceMetric'] = 'euclidean', copy: bool = False, *, neighbors_within_batch: int = 3,