From 7f9f1c8dcc78b002d34ae0e02fa797d38e9a3141 Mon Sep 17 00:00:00 2001 From: Valentin Ambroise <113367796+vaamb@users.noreply.github.com> Date: Tue, 12 Dec 2023 09:38:23 +0100 Subject: [PATCH] Reduce the memory footprint of `find_consensus_annotation` (#189) --- .../_consensus_assignment.py | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/q2_feature_classifier/_consensus_assignment.py b/q2_feature_classifier/_consensus_assignment.py index 3b614be..0538dcd 100644 --- a/q2_feature_classifier/_consensus_assignment.py +++ b/q2_feature_classifier/_consensus_assignment.py @@ -32,19 +32,19 @@ def find_consensus_annotation(search_results: pd.DataFrame, unassignable_label: str = DEFAULTUNASSIGNABLELABEL ) -> pd.DataFrame: - '''Find consensus taxonomy from BLAST6Format alignment summary. + """Find consensus taxonomy from BLAST6Format alignment summary. search_results: pd.dataframe BLAST6Format search results with canonical headers attached. reference_taxonomy: pd.Series Annotations of reference database used for original search. min_consensus : float - The minimum fraction of the annotations that a specfic annotation + The minimum fraction of the annotations that a specific annotation must be present in for that annotation to be accepted. Current lower boundary is 0.51. unassignable_label : str The label to apply if no acceptable annotations are identified. - ''' + """ # load and convert blast6format results to dict of taxa hits obs_taxa = _blast6format_df_to_series_of_lists( search_results, reference_taxonomy, @@ -85,9 +85,11 @@ def find_consensus_annotation(search_results: pd.DataFrame, def _blast6format_df_to_series_of_lists( - assignments, ref_taxa, - unassignable_label=DEFAULTUNASSIGNABLELABEL): - '''import observed assignments in blast6 format to series of lists. + assignments: pd.DataFrame, + ref_taxa: pd.Series, + unassignable_label: str = DEFAULTUNASSIGNABLELABEL +) -> pd.Series: + """import observed assignments in blast6 format to series of lists. assignments: pd.DataFrame Taxonomy observation map in blast format 6. Each line consists of @@ -99,13 +101,12 @@ def _blast6format_df_to_series_of_lists( Annotation The accession IDs in this taxonomy should match the subject-seq-ids in the "assignment" input. - ''' - taxa_hits = assignments.set_index('qseqid')['sseqid'] - + """ # validate that assignments are present in reference taxonomy # (i.e., that the correct reference taxonomy was used). # Note that we drop unassigned labels from this set. - missing_ids = set(taxa_hits.values) - set(ref_taxa.index) - {'*', ''} + missing_ids = \ + set(assignments['sseqid'].values) - set(ref_taxa.index) - {'*', ''} if len(missing_ids) > 0: raise KeyError('Reference taxonomy and search results do not match. ' 'The following identifiers were reported in the search ' @@ -115,9 +116,12 @@ def _blast6format_df_to_series_of_lists( # if vsearch fails to find assignment, it reports '*' as the # accession ID, so we will add this mapping to the reference taxonomy. ref_taxa['*'] = unassignable_label - # map accession IDs to taxonomy - taxa_hits.replace(ref_taxa, inplace=True) + assignments_copy = assignments.copy(deep=True) + for index, value in assignments_copy.iterrows(): + sseqid = assignments_copy.iloc[index]['sseqid'] + assignments_copy.at[index, 'sseqid'] = ref_taxa.at[sseqid] # convert to dict of {accession_id: [annotations]} + taxa_hits: pd.Series = assignments_copy.set_index('qseqid')['sseqid'] taxa_hits = taxa_hits.groupby(taxa_hits.index).apply(list) return taxa_hits @@ -131,7 +135,7 @@ def _compute_consensus_annotations( ---------- query_annotations : pd.Series of lists Indices are query identifiers, and values are lists of all - taxonomic annotations associated with that identfier. + taxonomic annotations associated with that identifier. Returns ------- pd.DataFrame @@ -191,7 +195,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label): annotations : list of lists Taxonomic annotations to form consensus. min_consensus : float - The minimum fraction of the annotations that a specfic annotation + The minimum fraction of the annotations that a specific annotation must be present in for that annotation to be accepted. Current lower boundary is 0.51. unassignable_label : str @@ -211,7 +215,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label): # This assumes that a hierarchical taxonomy with even numbers of # ranks was used. taxa_comparison = [Counter(rank) for rank in zip(*annotations)] - # interate rank comparisons in reverse + # iterate rank comparisons in reverse # to find rank with consensus count > threshold for rank in taxa_comparison[::-1]: # grab most common label and its count