From 48c06bd91137e4755aecc56e48ff1e33bd9926ba Mon Sep 17 00:00:00 2001 From: Evan Rees <25933122+WiscEvan@users.noreply.github.com> Date: Tue, 23 Jun 2020 08:56:42 -0500 Subject: [PATCH] Rank-specific binning (#96) * :art: Add taxonomy specific splitting control :art::racehorse: Add reverse-ranks parameter :art::racehorse: Add starting-rank parameter * :memo: Update --reverse-ranks parameter help text * :memo: Update help text for --reverse-ranks parameter --- autometa/binning/recursive_dbscan.py | 58 +++++++++++++++++++++------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/autometa/binning/recursive_dbscan.py b/autometa/binning/recursive_dbscan.py index 8b28fb908..f85825a0f 100644 --- a/autometa/binning/recursive_dbscan.py +++ b/autometa/binning/recursive_dbscan.py @@ -505,8 +505,9 @@ def binning( completeness=20.0, purity=90.0, taxonomy=True, + starting_rank="superkingdom", method="dbscan", - reverse=True, + reverse_ranks=False, verbose=False, ): """Perform clustering of contigs by provided `method` and use metrics to @@ -522,23 +523,26 @@ def binning( i.e. [taxid,superkingdom,phylum,class,order,family,genus,species] markers : pd.DataFrame wide format, i.e. index=contig cols=[marker,marker,...] - domain : str + domain : str, optional Kingdom to determine metrics (the default is 'bacteria'). choices=['bacteria','archaea'] - completeness : float + completeness : float, optional Description of parameter `completeness` (the default is 20.). - purity : float + purity : float, optional Description of parameter `purity` (the default is 90.). - taxonomy : bool + taxonomy : bool, optional Split canonical ranks and subset based on rank then attempt to find clusters (the default is True). taxonomic_levels = [superkingdom,phylum,class,order,family,genus,species] - method : str + starting_rank : str, optional + Starting canonical rank at which to begin subsetting taxonomy (the default is superkingdom). + Choices are superkingdom, phylum, class, order, family, genus, species. + method : str, optional Clustering `method` (the default is 'dbscan'). choices = ['dbscan','hdbscan'] - reverse : bool - True - [superkingdom,phylum,class,order,family,genus,species] - False - [species,genus,family,order,class,phylum,superkingdom] - verbose : bool + reverse_ranks : bool, optional + False - [superkingdom,phylum,class,order,family,genus,species] (Default) + True - [species,genus,family,order,class,phylum,superkingdom] + verbose : bool, optional log stats for each recursive_dbscan clustering iteration Returns @@ -570,13 +574,16 @@ def binning( ) # Use taxonomy method - if reverse: - # superkingdom, phylum, class, order, family, genus, species - ranks = [rank for rank in reversed(NCBI.CANONICAL_RANKS)] - else: + if reverse_ranks: # species, genus, family, order, class, phylum, superkingdom ranks = [rank for rank in NCBI.CANONICAL_RANKS] + else: + # superkingdom, phylum, class, order, family, genus, species + ranks = [rank for rank in reversed(NCBI.CANONICAL_RANKS)] ranks.remove("root") + starting_rank_index = ranks.index(starting_rank) + ranks = ranks[starting_rank_index:] + logger.debug(f"Using ranks: {', '.join(ranks)}") clustered_contigs = set() num_clusters = 0 clusters = [] @@ -678,6 +685,27 @@ def main(): ) parser.add_argument("--purity", help="", default=90.0, type=float) parser.add_argument("--taxonomy", help="") + parser.add_argument( + "--starting-rank", + help="Canonical rank at which to begin subsetting taxonomy", + default="superkingdom", + choices=[ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ], + ) + parser.add_argument( + "--reverse-ranks", + action="store_true", + default=False, + help="Reverse order at which to split taxonomy by canonical-rank." + " When --reverse-ranks given, contigs will be split in order of species, genus, family, order, class, phylum, superkingdom.", + ) parser.add_argument( "--domain", help="Kingdom to consider (archaea|bacteria)", @@ -719,6 +747,8 @@ def main(): master=master_df, markers=markers_df, taxonomy=taxa_present, + starting_rank=args.starting_rank, + reverse_ranks=args.reverse_ranks, domain=args.domain, completeness=args.completeness, purity=args.purity,