Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix empty output files when no targets are found in inputs #51

Merged
merged 7 commits into from
Jul 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 28 additions & 26 deletions bio_hansel/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,20 @@
import argparse
import logging
import sys
import os
import re
import os
from typing import Optional, List, Any, Tuple

import attr
import pandas as pd

from . import program_desc, __version__
from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
from .subtype import Subtype
from .subtype_stats import subtype_counts
from .subtyper import \
query_contigs_ac, \
query_reads_ac
subtype_contigs_samples, \
subtype_reads_samples
from .metadata import read_metadata_table, merge_metadata_with_summary_results
from .utils import \
genome_name_from_fasta_path, \
Expand Down Expand Up @@ -192,36 +194,36 @@ def main():
scheme_subtype_counts = subtype_counts(scheme_fasta)
logging.debug(args)
subtyping_params = init_subtyping_params(args, scheme)
input_genomes, reads = collect_inputs(args)
if len(input_genomes) == 0 and len(reads) == 0:
input_contigs, input_reads = collect_inputs(args)
if len(input_contigs) == 0 and len(input_reads) == 0:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be an "or" statement, that if either of these are empty, then it raises the error? Or is it ok if one of them is empty, that's still ok?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're allowing the user to specify a mix of file types. They can specify contigs (FASTA) and reads (FASTQ) in the same analysis. If both lists are empty then no input whatsoever has been specified so we can't do anything except let the user know they haven't specified anything to analyze.

raise Exception('No input files specified!')
df_md = None
if args.scheme_metadata:
df_md = read_metadata_table(args.scheme_metadata)
n_threads = args.threads

subtype_results = [] # type: List[Subtype]
dfs = [] # type: List[pd.DataFrame]
if len(input_genomes) > 0:
query_contigs_ac(subtype_results=subtype_results,
dfs=dfs,
input_genomes=input_genomes,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
if len(reads) > 0:
query_reads_ac(subtype_results=subtype_results,
dfs=dfs,
reads=reads,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
subtype_results = [] # type: List[Tuple[Subtype, pd.DataFrame]]
if len(input_contigs) > 0:
contigs_results = subtype_contigs_samples(input_genomes=input_contigs,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
logging.info('Generated %s subtyping results from %s contigs samples', len(contigs_results), len(input_contigs))
subtype_results += contigs_results
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is subtype_results iterable? did you mean subtype_results.append(contigs_results)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a nice thing you can do in Python where you can add 2 lists together with +:

list1 = [1, 2, 3] 
list1 += [4, 5]
assert list1 == [1,2,3,4,5]
#or
list1 = [1, 2, 3] 
list2 = [4, 5]
list3 = list1 + list2
assert list3 == [1,2,3,4,5]
# list1 and list2 are unmodified

Hopefully that makes sense!

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh ok I see, that makes sense, thanks!

if len(input_reads) > 0:
reads_results = subtype_reads_samples(reads=input_reads,
scheme=scheme,
scheme_name=scheme_name,
subtyping_params=subtyping_params,
scheme_subtype_counts=scheme_subtype_counts,
n_threads=n_threads)
logging.info('Generated %s subtyping results from %s contigs samples', len(reads_results), len(input_reads))
subtype_results += reads_results
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


dfsummary = pd.DataFrame(subtype_results)
dfs = [df for st, df in subtype_results] # type: List[pd.DataFrame]
dfsummary = pd.DataFrame([attr.asdict(st) for st, df in subtype_results])
dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]

if dfsummary['avg_tile_coverage'].isnull().all():
Expand Down
70 changes: 47 additions & 23 deletions bio_hansel/qc/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

from typing import Tuple, Optional

from pandas import DataFrame
import pandas as pd

from ..subtyping_params import SubtypingParams
from ..qc.const import QC
from ..qc.utils import get_conflicting_tiles, get_num_pos_neg_tiles, get_mixed_subtype_tile_counts
from ..subtype import Subtype
from ..subtyping_params import SubtypingParams


def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
if not st.are_subtypes_consistent \
or st.subtype is None \
or not st.is_fastq_input():
Expand All @@ -21,7 +21,7 @@ def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> T
return None, None


def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
def is_missing_tiles(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
"""Are there more missing tiles than tolerated?

Note:
Expand All @@ -39,7 +39,7 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op

if st.are_subtypes_consistent:
return check_for_missing_tiles(is_fastq=st.is_fastq_input(),
curr_subtype=st.subtype,
subtype_result=st.subtype,
scheme=st.scheme,
df=df,
exp=int(st.n_tiles_matching_all_expected),
Expand All @@ -55,51 +55,75 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op
mixed_subtype_counts = get_mixed_subtype_tile_counts(dfpos=dfpos,
subtype_list=subtype_list)

tiles_matching_negative = st.n_tiles_matching_negative
for curr_subtype, exp in zip(subtype_list, n_tiles_matching_expected):
# We can omit the status because there will be a fail status already from non consistent subtypes.

obs = mixed_subtype_counts.get(curr_subtype) + int(tiles_matching_negative)
_, curr_messages = check_for_missing_tiles(is_fastq=st.is_fastq_input(),
curr_subtype=curr_subtype,
subtype_result=curr_subtype,
scheme=st.scheme,
df=df,
exp=int(exp),
obs=(mixed_subtype_counts.get(curr_subtype) +
int(st.n_tiles_matching_negative)),
obs=obs,
p=p)

message_list.append(curr_messages)

error_messages = ' | '.join(filter(None.__ne__, message_list))

return _, error_messages
return QC.FAIL, error_messages


def check_for_missing_tiles(is_fastq: bool,
subtype_result: str,
scheme: str,
df: pd.DataFrame,
exp: int,
obs: int,
p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
"""Check if there are too many missing tiles

Also check if the mean tile coverage depth is above the low coverage threshold.

def check_for_missing_tiles(is_fastq: bool, curr_subtype: str, scheme: str,
df: DataFrame, exp: int, obs: int, p: SubtypingParams):
error_status = None
error_messages = None
Args:
is_fastq: Is input sample reads?
subtype_result: Single subtype designation
scheme: Scheme name
df: Subtyping results dataframe
exp: Expected number of tiles that should be found
obs: Actual observed number of tiles found
p: Subtyping parameters

Returns:
Tuple of QC status and any QC messages
"""
status = None
messages = None

# proportion of missing tiles
p_missing = (exp - obs) / exp # type: float
if p_missing > p.max_perc_missing_tiles:
error_status = QC.FAIL
status = QC.FAIL
if is_fastq:
tiles_with_hits = df[df['is_kmer_freq_okay']] # type: DataFrame
tiles_with_hits = df[df['is_kmer_freq_okay']] # type: pd.DataFrame
depth = tiles_with_hits['freq'].mean()
if depth < p.low_coverage_depth_freq:
coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \
f'you may need more WGS data.'
else:
coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \
f'but this may be the wrong serovar or species for scheme "{scheme}"'
error_messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
f'tiles threshold. {coverage_msg}'
messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
f'tiles threshold. {coverage_msg}'
else:
error_messages = f'{p_missing:.2%} missing tiles for subtype "{curr_subtype}"; more than ' \
f'{p.max_perc_missing_tiles:.2%} missing tile threshold'
messages = f'{p_missing:.2%} missing tiles for subtype "{subtype_result}"; more than ' \
f'{p.max_perc_missing_tiles:.2%} missing tile threshold'

return error_status, error_messages
return status, messages


def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
"""Is the subtype result mixed?

Note:
Expand All @@ -126,7 +150,7 @@ def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str],
f'the same target site{s} {positions} for subtype "{st.subtype}".'


def is_missing_too_many_target_sites(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
Optional[str], Optional[str]]:
"""Are there too many missing target sites for an expected subtype?

Expand Down Expand Up @@ -180,7 +204,7 @@ def is_missing_downstream_targets(st: Subtype, *args) -> Tuple[Optional[str], Op
return None, None


def is_maybe_intermediate_subtype(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
Optional[str], Optional[str]]:
"""Is the result a possible intermediate subtype?

Expand Down
1 change: 1 addition & 0 deletions bio_hansel/qc/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ class QC:
AMBIGUOUS_RESULTS_ERROR_3 = 'Ambiguous Results Error 3'
UNCONFIDENT_RESULTS_ERROR_4 = 'Unconfident Results Error 4'
NO_SUBTYPE_RESULT = 'No subtype result!'
NO_TARGETS_FOUND = 'No tiles/targets were found in this sample.'
Loading