From 7f9f1c8dcc78b002d34ae0e02fa797d38e9a3141 Mon Sep 17 00:00:00 2001
From: Valentin Ambroise <113367796+vaamb@users.noreply.github.com>
Date: Tue, 12 Dec 2023 09:38:23 +0100
Subject: [PATCH] Reduce the memory footprint of `find_consensus_annotation`
 (#189)

---
 .../_consensus_assignment.py                  | 34 +++++++++++--------
 1 file changed, 19 insertions(+), 15 deletions(-)
diff --git a/q2_feature_classifier/_consensus_assignment.py b/q2_feature_classifier/_consensus_assignment.py
index 3b614be..0538dcd 100644
--- a/q2_feature_classifier/_consensus_assignment.py
+++ b/q2_feature_classifier/_consensus_assignment.py
@@ -32,19 +32,19 @@ def find_consensus_annotation(search_results: pd.DataFrame,
                               unassignable_label: str =
                               DEFAULTUNASSIGNABLELABEL
                               ) -> pd.DataFrame:
-    '''Find consensus taxonomy from BLAST6Format alignment summary.
+    """Find consensus taxonomy from BLAST6Format alignment summary.
 
     search_results: pd.dataframe
         BLAST6Format search results with canonical headers attached.
     reference_taxonomy: pd.Series
         Annotations of reference database used for original search.
     min_consensus : float
-        The minimum fraction of the annotations that a specfic annotation
+        The minimum fraction of the annotations that a specific annotation
         must be present in for that annotation to be accepted. Current
         lower boundary is 0.51.
     unassignable_label : str
         The label to apply if no acceptable annotations are identified.
-    '''
+    """
     # load and convert blast6format results to dict of taxa hits
     obs_taxa = _blast6format_df_to_series_of_lists(
         search_results, reference_taxonomy,
@@ -85,9 +85,11 @@ def find_consensus_annotation(search_results: pd.DataFrame,
 
 
 def _blast6format_df_to_series_of_lists(
-        assignments, ref_taxa,
-        unassignable_label=DEFAULTUNASSIGNABLELABEL):
-    '''import observed assignments in blast6 format to series of lists.
+        assignments: pd.DataFrame,
+        ref_taxa: pd.Series,
+        unassignable_label: str = DEFAULTUNASSIGNABLELABEL
+) -> pd.Series:
+    """Import observed assignments in blast6 format to a series of lists.
 
     assignments: pd.DataFrame
         Taxonomy observation map in blast format 6. Each line consists of
@@ -99,13 +101,12 @@ def _blast6format_df_to_series_of_lists(
             <accession ID>  Annotation
         The accession IDs in this taxonomy should match the subject-seq-ids in
         the "assignment" input.
-    '''
-    taxa_hits = assignments.set_index('qseqid')['sseqid']
-
+    """
     # validate that assignments are present in reference taxonomy
     # (i.e., that the correct reference taxonomy was used).
     # Note that we drop unassigned labels from this set.
-    missing_ids = set(taxa_hits.values) - set(ref_taxa.index) - {'*', ''}
+    missing_ids = \
+        set(assignments['sseqid'].values) - set(ref_taxa.index) - {'*', ''}
     if len(missing_ids) > 0:
         raise KeyError('Reference taxonomy and search results do not match. '
                        'The following identifiers were reported in the search '
@@ -115,9 +116,12 @@ def _blast6format_df_to_series_of_lists(
     # if vsearch fails to find assignment, it reports '*' as the
     # accession ID, so we will add this mapping to the reference taxonomy.
     ref_taxa['*'] = unassignable_label
-    # map accession IDs to taxonomy
-    taxa_hits.replace(ref_taxa, inplace=True)
+    assignments_copy = assignments.copy(deep=True)
+    for index, value in assignments_copy.iterrows():
+        sseqid = assignments_copy.iloc[index]['sseqid']
+        assignments_copy.at[index, 'sseqid'] = ref_taxa.at[sseqid]
     # convert to dict of {accession_id: [annotations]}
+    taxa_hits: pd.Series = assignments_copy.set_index('qseqid')['sseqid']
     taxa_hits = taxa_hits.groupby(taxa_hits.index).apply(list)
 
     return taxa_hits
@@ -131,7 +135,7 @@ def _compute_consensus_annotations(
         ----------
         query_annotations : pd.Series of lists
             Indices are query identifiers, and values are lists of all
-            taxonomic annotations associated with that identfier.
+            taxonomic annotations associated with that identifier.
         Returns
         -------
         pd.DataFrame
@@ -191,7 +195,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
         annotations : list of lists
             Taxonomic annotations to form consensus.
         min_consensus : float
-            The minimum fraction of the annotations that a specfic annotation
+            The minimum fraction of the annotations that a specific annotation
             must be present in for that annotation to be accepted. Current
             lower boundary is 0.51.
         unassignable_label : str
@@ -211,7 +215,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
     # This assumes that a hierarchical taxonomy with even numbers of
     # ranks was used.
     taxa_comparison = [Counter(rank) for rank in zip(*annotations)]
-    # interate rank comparisons in reverse
+    # iterate rank comparisons in reverse
     # to find rank with consensus count > threshold
     for rank in taxa_comparison[::-1]:
         # grab most common label and its count