Skip to content

Commit

Permalink
Fix empty output files when no targets are found in inputs (#51)
Browse files Browse the repository at this point in the history
* adds 1 row DF for null subtyping result; refactoring

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>

* Add unit tests, refactoring,

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>

* Update main.py

organize imports

* Update checks.py to use import pandas as pd for consistency

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>

* Update subtyper.py; fix docstring, renaming a few vars, refactoring

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
  • Loading branch information
peterk87 authored and gcttong committed Jul 13, 2018
1 parent ebfb547 commit 3d97c26
Show file tree
Hide file tree
Showing 9 changed files with 580 additions and 264 deletions.
54 changes: 28 additions & 26 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,20 @@
import argparse
import logging
import sys
import os
import re
import os
from typing import Optional, List, Any, Tuple

import attr
import pandas as pd

from . import program_desc, __version__
from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from .subtype import Subtype
from .subtype_stats import subtype_counts
from .subtyper import \
query_contigs_ac, \
query_reads_ac
subtype_contigs_samples, \
subtype_reads_samples
from .metadata import read_metadata_table, merge_metadata_with_summary_results
from .utils import \
genome_name_from_fasta_path, \
Expand Down Expand Up @@ -192,36 +194,36 @@ def main():
scheme_subtype_counts = subtype_counts(scheme_fasta)
logging.debug(args)
subtyping_params = init_subtyping_params(args, scheme)
input_genomes, reads = collect_inputs(args)
if len(input_genomes) == 0 and len(reads) == 0:
input_contigs, input_reads = collect_inputs(args)
if len(input_contigs) == 0 and len(input_reads) == 0:
raise Exception('No input files specified!')
df_md = None
if args.scheme_metadata:
df_md = read_metadata_table(args.scheme_metadata)
n_threads = args.threads

subtype_results = [] # type: List[Subtype]
dfs = [] # type: List[pd.DataFrame]
if len(input_genomes) > 0:
query_contigs_ac(subtype_results=subtype_results,
dfs=dfs,
input_genomes=input_genomes,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
if len(reads) > 0:
query_reads_ac(subtype_results=subtype_results,
dfs=dfs,
reads=reads,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
subtype_results = [] # type: List[Tuple[Subtype, pd.DataFrame]]
if len(input_contigs) > 0:
contigs_results = subtype_contigs_samples(input_genomes=input_contigs,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
logging.info('Generated %s subtyping results from %s contigs samples', len(contigs_results), len(input_contigs))
subtype_results += contigs_results
if len(input_reads) > 0:
reads_results = subtype_reads_samples(reads=input_reads,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
logging.info('Generated %s subtyping results from %s contigs samples', len(reads_results), len(input_reads))
subtype_results += reads_results

dfsummary = pd.DataFrame(subtype_results)
dfs = [df for st, df in subtype_results] # type: List[pd.DataFrame]
dfsummary = pd.DataFrame([attr.asdict(st) for st, df in subtype_results])
dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]

if dfsummary['avg_tile_coverage'].isnull().all():
Expand Down
70 changes: 47 additions & 23 deletions bio_hansel/qc/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

from typing import Tuple, Optional

from pandas import DataFrame
import pandas as pd

from ..subtyping_params import SubtypingParams
from ..qc.const import QC
from ..qc.utils import get_conflicting_tiles, get_num_pos_neg_tiles, get_mixed_subtype_tile_counts
from ..subtype import Subtype
from ..subtyping_params import SubtypingParams


def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
if not st.are_subtypes_consistent \
or st.subtype is None \
or not st.is_fastq_input():
Expand All @@ -21,7 +21,7 @@ def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> T
return None, None


def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
def is_missing_tiles(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
"""Are there more missing tiles than tolerated?
Note:
Expand All @@ -39,7 +39,7 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op

if st.are_subtypes_consistent:
return check_for_missing_tiles(is_fastq=st.is_fastq_input(),
curr_subtype=st.subtype,
subtype_result=st.subtype,
scheme=st.scheme,
df=df,
exp=int(st.n_tiles_matching_all_expected),
Expand All @@ -55,51 +55,75 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op
mixed_subtype_counts = get_mixed_subtype_tile_counts(dfpos=dfpos,
subtype_list=subtype_list)

tiles_matching_negative = st.n_tiles_matching_negative
for curr_subtype, exp in zip(subtype_list, n_tiles_matching_expected):
# We can omit the status because there will be a fail status already from non consistent subtypes.

obs = mixed_subtype_counts.get(curr_subtype) + int(tiles_matching_negative)
_, curr_messages = check_for_missing_tiles(is_fastq=st.is_fastq_input(),
curr_subtype=curr_subtype,
subtype_result=curr_subtype,
scheme=st.scheme,
df=df,
exp=int(exp),
obs=(mixed_subtype_counts.get(curr_subtype) +
int(st.n_tiles_matching_negative)),
obs=obs,
p=p)

message_list.append(curr_messages)

error_messages = ' | '.join(filter(None.__ne__, message_list))

return _, error_messages
return QC.FAIL, error_messages


def check_for_missing_tiles(is_fastq: bool,
subtype_result: str,
scheme: str,
df: pd.DataFrame,
exp: int,
obs: int,
p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
"""Check if there are too many missing tiles
Also check if the mean tile coverage depth is above the low coverage threshold.
def check_for_missing_tiles(is_fastq: bool, curr_subtype: str, scheme: str,
df: DataFrame, exp: int, obs: int, p: SubtypingParams):
error_status = None
error_messages = None
Args:
is_fastq: Is input sample reads?
subtype_result: Single subtype designation
scheme: Scheme name
df: Subtyping results dataframe
exp: Expected number of tiles that should be found
obs: Actual observed number of tiles found
p: Subtyping parameters
Returns:
Tuple of QC status and any QC messages
"""
status = None
messages = None

# proportion of missing tiles
p_missing = (exp - obs) / exp # type: float
if p_missing > p.max_perc_missing_tiles:
error_status = QC.FAIL
status = QC.FAIL
if is_fastq:
tiles_with_hits = df[df['is_kmer_freq_okay']] # type: DataFrame
tiles_with_hits = df[df['is_kmer_freq_okay']] # type: pd.DataFrame
depth = tiles_with_hits['freq'].mean()
if depth < p.low_coverage_depth_freq:
coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \
f'you may need more WGS data.'
else:
coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \
f'but this may be the wrong serovar or species for scheme "{scheme}"'
error_messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
f'tiles threshold. {coverage_msg}'
messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
f'tiles threshold. {coverage_msg}'
else:
error_messages = f'{p_missing:.2%} missing tiles for subtype "{curr_subtype}"; more than ' \
f'{p.max_perc_missing_tiles:.2%} missing tile threshold'
messages = f'{p_missing:.2%} missing tiles for subtype "{subtype_result}"; more than ' \
f'{p.max_perc_missing_tiles:.2%} missing tile threshold'

return error_status, error_messages
return status, messages


def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
"""Is the subtype result mixed?
Note:
Expand All @@ -126,7 +150,7 @@ def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str],
f'the same target site{s} {positions} for subtype "{st.subtype}".'


def is_missing_too_many_target_sites(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
Optional[str], Optional[str]]:
"""Are there too many missing target sites for an expected subtype?
Expand Down Expand Up @@ -180,7 +204,7 @@ def is_missing_downstream_targets(st: Subtype, *args) -> Tuple[Optional[str], Op
return None, None


def is_maybe_intermediate_subtype(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
Optional[str], Optional[str]]:
"""Is the result a possible intermediate subtype?
Expand Down
1 change: 1 addition & 0 deletions bio_hansel/qc/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ class QC:
AMBIGUOUS_RESULTS_ERROR_3 = 'Ambiguous Results Error 3'
UNCONFIDENT_RESULTS_ERROR_4 = 'Unconfident Results Error 4'
NO_SUBTYPE_RESULT = 'No subtype result!'
NO_TARGETS_FOUND = 'No tiles/targets were found in this sample.'
Loading

0 comments on commit 3d97c26

Please sign in to comment.