Fix empty output files when no targets are found in inputs (#51)

* adds 1 row DF for null subtyping result; refactoring Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca> * Add unit tests, refactoring, Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca> * Update main.py organize imports * Update checks.py to use import pandas as pd for consistency Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca> * Update subtyper.py; fix docstring, renaming a few vars, refactoring Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
phac-nml · Jul 13, 2018 · 3d97c26 · 3d97c26
1 parent ebfb547
commit 3d97c26
Show file tree

Hide file tree

Showing 9 changed files with 580 additions and 264 deletions.
diff --git a/bio_hansel/main.py b/bio_hansel/main.py
@@ -4,18 +4,20 @@
 import argparse
 import logging
 import sys
-import os
 import re
+import os
 from typing import Optional, List, Any, Tuple
+
+import attr
 import pandas as pd
 
 from . import program_desc, __version__
 from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
 from .subtype import Subtype
 from .subtype_stats import subtype_counts
 from .subtyper import \
-    query_contigs_ac, \
-    query_reads_ac
+    subtype_contigs_samples, \
+    subtype_reads_samples
 from .metadata import read_metadata_table, merge_metadata_with_summary_results
 from .utils import \
     genome_name_from_fasta_path, \
@@ -192,36 +194,36 @@ def main():
     scheme_subtype_counts = subtype_counts(scheme_fasta)
     logging.debug(args)
     subtyping_params = init_subtyping_params(args, scheme)
-    input_genomes, reads = collect_inputs(args)
-    if len(input_genomes) == 0 and len(reads) == 0:
+    input_contigs, input_reads = collect_inputs(args)
+    if len(input_contigs) == 0 and len(input_reads) == 0:
         raise Exception('No input files specified!')
     df_md = None
     if args.scheme_metadata:
         df_md = read_metadata_table(args.scheme_metadata)
     n_threads = args.threads
 
-    subtype_results = []  # type: List[Subtype]
-    dfs = []  # type: List[pd.DataFrame]
-    if len(input_genomes) > 0:
-        query_contigs_ac(subtype_results=subtype_results,
-                         dfs=dfs,
-                         input_genomes=input_genomes,
-                         scheme=scheme,
-                         scheme_name=scheme_name,
-                         subtyping_params=subtyping_params,
-                         scheme_subtype_counts=scheme_subtype_counts,
-                         n_threads=n_threads)
-    if len(reads) > 0:
-        query_reads_ac(subtype_results=subtype_results,
-                       dfs=dfs,
-                       reads=reads,
-                       scheme=scheme,
-                       scheme_name=scheme_name,
-                       subtyping_params=subtyping_params,
-                       scheme_subtype_counts=scheme_subtype_counts,
-                       n_threads=n_threads)
+    subtype_results = []  # type: List[Tuple[Subtype, pd.DataFrame]]
+    if len(input_contigs) > 0:
+        contigs_results = subtype_contigs_samples(input_genomes=input_contigs,
+                                                  scheme=scheme,
+                                                  scheme_name=scheme_name,
+                                                  subtyping_params=subtyping_params,
+                                                  scheme_subtype_counts=scheme_subtype_counts,
+                                                  n_threads=n_threads)
+        logging.info('Generated %s subtyping results from %s contigs samples', len(contigs_results), len(input_contigs))
+        subtype_results += contigs_results
+    if len(input_reads) > 0:
+        reads_results = subtype_reads_samples(reads=input_reads,
+                                              scheme=scheme,
+                                              scheme_name=scheme_name,
+                                              subtyping_params=subtyping_params,
+                                              scheme_subtype_counts=scheme_subtype_counts,
+                                              n_threads=n_threads)
+        logging.info('Generated %s subtyping results from %s contigs samples', len(reads_results), len(input_reads))
+        subtype_results += reads_results
 
-    dfsummary = pd.DataFrame(subtype_results)
+    dfs = [df for st, df in subtype_results]  # type: List[pd.DataFrame]
+    dfsummary = pd.DataFrame([attr.asdict(st) for st, df in subtype_results])
     dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]
 
     if dfsummary['avg_tile_coverage'].isnull().all():

diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py
@@ -2,15 +2,15 @@
 
 from typing import Tuple, Optional
 
-from pandas import DataFrame
+import pandas as pd
 
-from ..subtyping_params import SubtypingParams
 from ..qc.const import QC
 from ..qc.utils import get_conflicting_tiles, get_num_pos_neg_tiles, get_mixed_subtype_tile_counts
 from ..subtype import Subtype
+from ..subtyping_params import SubtypingParams
 
 
-def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
+def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
     if not st.are_subtypes_consistent \
             or st.subtype is None \
             or not st.is_fastq_input():
@@ -21,7 +21,7 @@ def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> T
     return None, None
 
 
-def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
+def is_missing_tiles(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
     """Are there more missing tiles than tolerated?
 
     Note:
@@ -39,7 +39,7 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op
 
     if st.are_subtypes_consistent:
         return check_for_missing_tiles(is_fastq=st.is_fastq_input(),
-                                       curr_subtype=st.subtype,
+                                       subtype_result=st.subtype,
                                        scheme=st.scheme,
                                        df=df,
                                        exp=int(st.n_tiles_matching_all_expected),
@@ -55,51 +55,75 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op
         mixed_subtype_counts = get_mixed_subtype_tile_counts(dfpos=dfpos,
                                                              subtype_list=subtype_list)
 
+        tiles_matching_negative = st.n_tiles_matching_negative
         for curr_subtype, exp in zip(subtype_list, n_tiles_matching_expected):
             # We can omit the status because there will be a fail status already from non consistent subtypes.
+
+            obs = mixed_subtype_counts.get(curr_subtype) + int(tiles_matching_negative)
             _, curr_messages = check_for_missing_tiles(is_fastq=st.is_fastq_input(),
-                                                       curr_subtype=curr_subtype,
+                                                       subtype_result=curr_subtype,
                                                        scheme=st.scheme,
                                                        df=df,
                                                        exp=int(exp),
-                                                       obs=(mixed_subtype_counts.get(curr_subtype) +
-                                                            int(st.n_tiles_matching_negative)),
+                                                       obs=obs,
                                                        p=p)
 
             message_list.append(curr_messages)
 
         error_messages = ' | '.join(filter(None.__ne__, message_list))
 
-        return _, error_messages
+        return QC.FAIL, error_messages
+
+
+def check_for_missing_tiles(is_fastq: bool,
+                            subtype_result: str,
+                            scheme: str,
+                            df: pd.DataFrame,
+                            exp: int,
+                            obs: int,
+                            p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
+    """Check if there are too many missing tiles
 
+    Also check if the mean tile coverage depth is above the low coverage threshold.
 
-def check_for_missing_tiles(is_fastq: bool, curr_subtype: str, scheme: str,
-                            df: DataFrame, exp: int, obs: int, p: SubtypingParams):
-    error_status = None
-    error_messages = None
+    Args:
+        is_fastq: Is input sample reads?
+        subtype_result: Single subtype designation
+        scheme: Scheme name
+        df: Subtyping results dataframe
+        exp: Expected number of tiles that should be found
+        obs: Actual observed number of tiles found
+        p: Subtyping parameters
+
+    Returns:
+        Tuple of QC status and any QC messages
+    """
+    status = None
+    messages = None
 
+    # proportion of missing tiles
     p_missing = (exp - obs) / exp  # type: float
     if p_missing > p.max_perc_missing_tiles:
-        error_status = QC.FAIL
+        status = QC.FAIL
         if is_fastq:
-            tiles_with_hits = df[df['is_kmer_freq_okay']]  # type: DataFrame
+            tiles_with_hits = df[df['is_kmer_freq_okay']]  # type: pd.DataFrame
             depth = tiles_with_hits['freq'].mean()
             if depth < p.low_coverage_depth_freq:
                 coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \
                                f'you may need more WGS data.'
             else:
                 coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \
                                f'but this may be the wrong serovar or species for scheme "{scheme}"'
-            error_messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
-                             f'tiles threshold. {coverage_msg}'
+            messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
+                       f'tiles threshold. {coverage_msg}'
         else:
-            error_messages = f'{p_missing:.2%} missing tiles for subtype "{curr_subtype}"; more than ' \
-                             f'{p.max_perc_missing_tiles:.2%} missing tile threshold'
+            messages = f'{p_missing:.2%} missing tiles for subtype "{subtype_result}"; more than ' \
+                       f'{p.max_perc_missing_tiles:.2%} missing tile threshold'
 
-    return error_status, error_messages
+    return status, messages
 
 
-def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
+def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
     """Is the subtype result mixed?
 
     Note:
@@ -126,7 +150,7 @@ def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str],
                     f'the same target site{s} {positions} for subtype "{st.subtype}".'
 
 
-def is_missing_too_many_target_sites(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
+def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
     Optional[str], Optional[str]]:
     """Are there too many missing target sites for an expected subtype?
 
@@ -180,7 +204,7 @@ def is_missing_downstream_targets(st: Subtype, *args) -> Tuple[Optional[str], Op
     return None, None
 
 
-def is_maybe_intermediate_subtype(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
+def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
     Optional[str], Optional[str]]:
     """Is the result a possible intermediate subtype?
 

diff --git a/bio_hansel/qc/const.py b/bio_hansel/qc/const.py
@@ -7,3 +7,4 @@ class QC:
     AMBIGUOUS_RESULTS_ERROR_3 = 'Ambiguous Results Error 3'
     UNCONFIDENT_RESULTS_ERROR_4 = 'Unconfident Results Error 4'
     NO_SUBTYPE_RESULT = 'No subtype result!'
+    NO_TARGETS_FOUND = 'No tiles/targets were found in this sample.'