Skip to content

Commit

Permalink
Reduce the memory footprint of find_consensus_annotation (#189)
Browse files (browse the repository at this point in the history)
  • Loading branch information
vaamb authored Dec 12, 2023
1 parent 787c45f commit 7f9f1c8
Showing 1 changed file with 19 additions and 15 deletions.
34 changes: 19 additions & 15 deletions q2_feature_classifier/_consensus_assignment.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,19 +32,19 @@ def find_consensus_annotation(search_results: pd.DataFrame,
unassignable_label: str =
DEFAULTUNASSIGNABLELABEL
) -> pd.DataFrame:
'''Find consensus taxonomy from BLAST6Format alignment summary.
"""Find consensus taxonomy from BLAST6Format alignment summary.
search_results: pd.dataframe
BLAST6Format search results with canonical headers attached.
reference_taxonomy: pd.Series
Annotations of reference database used for original search.
min_consensus : float
The minimum fraction of the annotations that a specfic annotation
The minimum fraction of the annotations that a specific annotation
must be present in for that annotation to be accepted. Current
lower boundary is 0.51.
unassignable_label : str
The label to apply if no acceptable annotations are identified.
'''
"""
# load and convert blast6format results to dict of taxa hits
obs_taxa = _blast6format_df_to_series_of_lists(
search_results, reference_taxonomy,
Expand Down Expand Up @@ -85,9 +85,11 @@ def find_consensus_annotation(search_results: pd.DataFrame,


def _blast6format_df_to_series_of_lists(
assignments, ref_taxa,
unassignable_label=DEFAULTUNASSIGNABLELABEL):
'''import observed assignments in blast6 format to series of lists.
assignments: pd.DataFrame,
ref_taxa: pd.Series,
unassignable_label: str = DEFAULTUNASSIGNABLELABEL
) -> pd.Series:
"""import observed assignments in blast6 format to series of lists.
assignments: pd.DataFrame
Taxonomy observation map in blast format 6. Each line consists of
Expand All @@ -99,13 +101,12 @@ def _blast6format_df_to_series_of_lists(
<accession ID> Annotation
The accession IDs in this taxonomy should match the subject-seq-ids in
the "assignment" input.
'''
taxa_hits = assignments.set_index('qseqid')['sseqid']

"""
# validate that assignments are present in reference taxonomy
# (i.e., that the correct reference taxonomy was used).
# Note that we drop unassigned labels from this set.
missing_ids = set(taxa_hits.values) - set(ref_taxa.index) - {'*', ''}
missing_ids = \
set(assignments['sseqid'].values) - set(ref_taxa.index) - {'*', ''}
if len(missing_ids) > 0:
raise KeyError('Reference taxonomy and search results do not match. '
'The following identifiers were reported in the search '
Expand All @@ -115,9 +116,12 @@ def _blast6format_df_to_series_of_lists(
# if vsearch fails to find assignment, it reports '*' as the
# accession ID, so we will add this mapping to the reference taxonomy.
ref_taxa['*'] = unassignable_label
# map accession IDs to taxonomy
taxa_hits.replace(ref_taxa, inplace=True)
assignments_copy = assignments.copy(deep=True)
for index, value in assignments_copy.iterrows():
sseqid = assignments_copy.iloc[index]['sseqid']
assignments_copy.at[index, 'sseqid'] = ref_taxa.at[sseqid]
# convert to dict of {accession_id: [annotations]}
taxa_hits: pd.Series = assignments_copy.set_index('qseqid')['sseqid']
taxa_hits = taxa_hits.groupby(taxa_hits.index).apply(list)

return taxa_hits
Expand All @@ -131,7 +135,7 @@ def _compute_consensus_annotations(
----------
query_annotations : pd.Series of lists
Indices are query identifiers, and values are lists of all
taxonomic annotations associated with that identfier.
taxonomic annotations associated with that identifier.
Returns
-------
pd.DataFrame
Expand Down Expand Up @@ -191,7 +195,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
annotations : list of lists
Taxonomic annotations to form consensus.
min_consensus : float
The minimum fraction of the annotations that a specfic annotation
The minimum fraction of the annotations that a specific annotation
must be present in for that annotation to be accepted. Current
lower boundary is 0.51.
unassignable_label : str
Expand All @@ -211,7 +215,7 @@ def _lca_consensus(annotations, min_consensus, unassignable_label):
# This assumes that a hierarchical taxonomy with even numbers of
# ranks was used.
taxa_comparison = [Counter(rank) for rank in zip(*annotations)]
# interate rank comparisons in reverse
# iterate rank comparisons in reverse
# to find rank with consensus count > threshold
for rank in taxa_comparison[::-1]:
# grab most common label and its count
Expand Down

0 comments on commit 7f9f1c8

Please sign in to comment.