From 73365e772e47b6d69429563ab1253e3f5225f26f Mon Sep 17 00:00:00 2001 From: pkruczkiewicz Date: Fri, 6 Jul 2018 17:03:11 -0500 Subject: [PATCH 1/5] adds 1 row DF for null subtyping result; refactoring Signed-off-by: pkruczkiewicz --- bio_hansel/qc/const.py | 1 + bio_hansel/subtyper.py | 260 ++++++++++++++++++++++++++++++----------- bio_hansel/utils.py | 6 +- 3 files changed, 193 insertions(+), 74 deletions(-) diff --git a/bio_hansel/qc/const.py b/bio_hansel/qc/const.py index f30a86f..b2effc9 100644 --- a/bio_hansel/qc/const.py +++ b/bio_hansel/qc/const.py @@ -7,3 +7,4 @@ class QC: AMBIGUOUS_RESULTS_ERROR_3 = 'Ambiguous Results Error 3' UNCONFIDENT_RESULTS_ERROR_4 = 'Unconfident Results Error 4' NO_SUBTYPE_RESULT = 'No subtype result!' + NO_TARGETS_FOUND = 'No tiles/targets were found in this sample.' diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index 41456f2..1ac345d 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- - +""" +Functions for subtyping of reads (e.g. FASTQ) and contigs (e.g. FASTA) using bio_hansel-compatible subtyping schemes. +""" import re import logging from typing import Optional, List, Dict, Union, Tuple @@ -9,7 +11,7 @@ from .aho_corasick import init_automaton, find_in_fasta, find_in_fastqs from .const import COLUMNS_TO_REMOVE -from .qc import perform_quality_check +from .qc import perform_quality_check, QC from .subtype import Subtype from .subtype_stats import SubtypeCounts from .subtype_stats import subtype_counts @@ -23,7 +25,21 @@ def subtype_contigs_ac(fasta_path: str, subtyping_params: Optional[SubtypingParams] = None, scheme_name: Optional[str] = None, scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[ - Subtype, Optional[pd.DataFrame]]: + Subtype, pd.DataFrame]: + """Subtype input contigs using a particular scheme. + + Args: + fasta_path: Input FASTA file path + genome_name: Input genome name + scheme: bio_hansel scheme FASTA path + subtyping_params: scheme specific subtyping parameters + scheme_name: optional scheme name + scheme_subtype_counts: summary information about scheme + + Returns: + - Subtype result + - pd.DataFrame of detailed subtyping results + """ scheme_fasta = get_scheme_fasta(scheme) if scheme_subtype_counts is None: scheme_subtype_counts = subtype_counts(scheme_fasta) @@ -41,48 +57,17 @@ def subtype_contigs_ac(fasta_path: str, if df is None or df.shape[0] == 0: logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', fasta_path, scheme) + st.qc_status = QC.FAIL + st.qc_message = QC.NO_SUBTYPE_RESULT st.are_subtypes_consistent = False - return st, None + return st, empty_results(st) refpositions = [x for x, y in df.tilename.str.split('-')] subtypes = [y for x, y in df.tilename.str.split('-')] df['refposition'] = [int(x.replace('negative', '')) for x in refpositions] df['subtype'] = subtypes df['is_pos_tile'] = ~df.tilename.str.contains('negative') - dfpos = df[df.is_pos_tile] - subtype_lens = dfpos.subtype.apply(len) - max_subtype_strlen = subtype_lens.max() - logging.debug('max substype str len: %s', max_subtype_strlen) - dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen] - logging.debug('dfpos_highest_res: %s', dfpos_highest_res) - pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()] - pos_subtypes.sort(key=lambda a: len(a)) - logging.debug('pos_subtypes: %s', pos_subtypes) - inconsistent_subtypes = find_inconsistent_subtypes(pos_subtypes) - logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes) - st.n_tiles_matching_all = df.tilename.unique().size - st.n_tiles_matching_positive = dfpos.tilename.unique().size - st.n_tiles_matching_subtype = dfpos_highest_res.tilename.unique().size - pos_subtypes_str = [x for x in dfpos.subtype.unique()] - pos_subtypes_str.sort(key=lambda x: len(x)) - st.all_subtypes = '; '.join(pos_subtypes_str) - subtype_list = [x for x in dfpos_highest_res.subtype.unique()] - st.subtype = '; '.join(subtype_list) - st.n_tiles_matching_all_expected = ';'.join([str(scheme_subtype_counts[x].all_tile_count) for x in subtype_list]) - st.n_tiles_matching_positive_expected = ';'.join( - [str(scheme_subtype_counts[x].positive_tile_count) for x in subtype_list]) - st.n_tiles_matching_subtype_expected = ';'.join( - [str(scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list]) - st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename.unique()]) - - if len(inconsistent_subtypes) > 0: - st.are_subtypes_consistent = False - st.inconsistent_subtypes = inconsistent_subtypes - - possible_downstream_subtypes = [s for s in scheme_subtype_counts - if re.search("^({})(\.)(\d)$".format(re.escape(st.subtype)), s)] - st.non_present_subtypes = [x for x in possible_downstream_subtypes - if not (df.subtype == x).any()] + process_contigs_subtyping_results(st, df, scheme_subtype_counts) st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params) logging.info(st) @@ -98,12 +83,29 @@ def subtype_contigs_ac(fasta_path: str, return st, df + + def parallel_query_contigs_ac(input_genomes: List[Tuple[str, str]], scheme: str, scheme_name: Optional[str] = None, subtyping_params: Optional[SubtypingParams] = None, scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, n_threads: int = 1): + """Parallel subtyping of input contigs + + Subtype and analyse each input in parallel using a multiprocessing thread pool. + + Args: + input_genomes: Input genome FASTA paths + scheme: bio_hansel scheme FASTA path to subtype with + scheme_name: optional scheme name + subtyping_params: scheme specific subtyping parameters + scheme_subtype_counts: scheme summary information + n_threads: Number of threads to use + + Returns: + A list of tuples of Subtype results and a pd.DataFrame of detailed subtyping results for each input + """ from multiprocessing import Pool logging.info('Initializing thread pool with %s threads', n_threads) pool = Pool(processes=n_threads) @@ -128,6 +130,21 @@ def query_contigs_ac(subtype_results: List[Subtype], subtyping_params: Optional[SubtypingParams] = None, scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, n_threads: int = 1) -> None: + """Subtype input genomes using a scheme. + + Args: + subtype_results: Subtyping results for each input genome + dfs: detailed subtyping results for each input genome + input_genomes: input genomes; tuple of FASTA file path and genome name + scheme: bio_hansel scheme FASTA path + scheme_name: optional scheme name + subtyping_params: scheme specific subtyping parameters + scheme_subtype_counts: summary information about scheme + n_threads: number of threads to use for subtyping analysis + + Returns: + `subtype_results` and `dfs` by reference + """ if n_threads == 1: logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes)) outputs = [subtype_contigs_ac(fasta_path=input_fasta, @@ -143,6 +160,9 @@ def query_contigs_ac(subtype_results: List[Subtype], for subtype, df in outputs: if df is not None: dfs.append(df) + else: + logging.error(f'Subtyping results DataFrame is "None" for {subtype}. Building empty results DF.') + dfs.append(empty_results(subtype)) subtype_results.append(attr.asdict(subtype)) @@ -152,6 +172,21 @@ def parallel_query_reads_ac(reads: List[Tuple[List[str], str]], subtyping_params: Optional[SubtypingParams] = None, scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]: + """Parallel subtyping of input reads + + Subtype and analyse each input in parallel using a multiprocessing thread pool. + + Args: + reads: Input reads; list of tuples of FASTQ file paths and genome names + scheme: bio_hansel scheme FASTA path + scheme_name: optional scheme name + subtyping_params: scheme specific subtyping parameters + scheme_subtype_counts: scheme summary information + n_threads: number of threads to use + + Returns: + A list of tuples of Subtype results and a pd.DataFrame of detailed subtyping results for each input + """ from multiprocessing import Pool logging.info('Initializing thread pool with %s threads', n_threads) pool = Pool(processes=n_threads) @@ -175,8 +210,20 @@ def subtype_reads_ac(reads: Union[str, List[str]], subtyping_params: Optional[SubtypingParams] = None, scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[ Subtype, Optional[pd.DataFrame]]: - logging.info("genome_name %s", genome_name) + """Subtype input reads using a particular scheme. + Args: + reads: Input FASTQ file path(s) + genome_name: Input genome name + scheme: bio_hansel scheme FASTA path + scheme_name: optional scheme name + subtyping_params: scheme specific subtyping parameters + scheme_subtype_counts: summary information about scheme + + Returns: + - Subtype result + - pd.DataFrame of detailed subtyping results + """ scheme_fasta = get_scheme_fasta(scheme) if scheme_subtype_counts is None: scheme_subtype_counts = subtype_counts(scheme_fasta) @@ -201,7 +248,9 @@ def subtype_reads_ac(reads: Union[str, List[str]], if df is None or df.shape[0] == 0: logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', reads, scheme) st.are_subtypes_consistent = False - return st, None + st.qc_status = QC.FAIL + st.qc_message = QC.NO_SUBTYPE_RESULT + return st, empty_results(st) refpositions = [x for x, y in df.tilename.str.split('-')] subtypes = [y for x, y in df.tilename.str.split('-')] @@ -209,49 +258,80 @@ def subtype_reads_ac(reads: Union[str, List[str]], df['subtype'] = subtypes df['is_pos_tile'] = ~df.tilename.str.contains('negative') df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq) - dfgood = df[df.is_kmer_freq_okay] - dfpos = dfgood[dfgood.is_pos_tile] - dfneg = dfgood[~dfgood.is_pos_tile] - subtype_lens = dfpos.subtype.apply(len) - max_subtype_strlen = subtype_lens.max() - logging.debug('max substype str len: %s', max_subtype_strlen) - dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen] - pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()] - pos_subtypes.sort(key=lambda a: len(a)) - logging.debug('pos_subtypes: %s', pos_subtypes) - inconsistent_subtypes = find_inconsistent_subtypes(pos_subtypes) - logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes) - st.n_tiles_matching_all = dfgood.shape[0] - st.n_tiles_matching_positive = dfpos.shape[0] - st.n_tiles_matching_negative = dfneg.shape[0] - st.n_tiles_matching_subtype = dfpos_highest_res.shape[0] + st.avg_tile_coverage = df['freq'].mean() + process_contigs_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts) + st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params) + df['sample'] = genome_name + df['scheme'] = scheme_name or scheme + df['scheme_version'] = scheme_version + df['qc_status'] = st.qc_status + df['qc_message'] = st.qc_message + df = df[df.columns[~df.columns.isin(COLUMNS_TO_REMOVE)]] + return st, df + + +def process_contigs_subtyping_results(st, df, scheme_subtype_counts): + dfpos = df[df.is_pos_tile] + dfpos_highest_res = highest_resolution_subtype_results(dfpos) + subtype_list = [x for x in dfpos_highest_res.subtype.unique()] + set_subtype_results(st, dfpos, subtype_list) + set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_positive_subtypes(dfpos))) + calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts) + set_non_present_subtypes(st, df, scheme_subtype_counts) + + +def set_subtype_results(st, dfpos, subtype_list): + st.subtype = '; '.join(subtype_list) + st.tiles_matching_subtype = '; '.join(subtype_list) pos_subtypes_str = [x for x in dfpos.subtype.unique()] pos_subtypes_str.sort(key=lambda x: len(x)) st.all_subtypes = '; '.join(pos_subtypes_str) - subtype_list = [x for x in dfpos_highest_res.subtype.unique()] - st.subtype = '; '.join(subtype_list) + + +def calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts): + st.n_tiles_matching_all = df.tilename.unique().size + st.n_tiles_matching_positive = dfpos.tilename.unique().size + st.n_tiles_matching_negative = df[~df.is_pos_tile] + st.n_tiles_matching_subtype = len(subtype_list) st.n_tiles_matching_all_expected = ';'.join([str(scheme_subtype_counts[x].all_tile_count) for x in subtype_list]) st.n_tiles_matching_positive_expected = ';'.join( [str(scheme_subtype_counts[x].positive_tile_count) for x in subtype_list]) st.n_tiles_matching_subtype_expected = ';'.join( [str(scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list]) - st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename.unique()]) - if len(inconsistent_subtypes) > 0: - st.are_subtypes_consistent = False - st.inconsistent_subtypes = inconsistent_subtypes + + +def count_periods(subtype_with_periods: str) -> pd.Series: + return (pd.Series(list(subtype_with_periods)) == '.').sum() + + +def highest_resolution_subtype_results(dfpos): + subtype_lens = dfpos.subtype.apply(count_periods) + max_subtype_strlen = subtype_lens.max() + logging.debug('max substype str len: %s', max_subtype_strlen) + dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen] + return dfpos_highest_res + + +def sorted_positive_subtypes(dfpos: pd.DataFrame) -> List[List[int]]: + pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()] + pos_subtypes.sort(key=lambda a: len(a)) + return pos_subtypes + + +def set_non_present_subtypes(st, df, scheme_subtype_counts): possible_downstream_subtypes = [s for s in scheme_subtype_counts if re.search("^({})(\.)(\d)$".format(re.escape(st.subtype)), s)] st.non_present_subtypes = [x for x in possible_downstream_subtypes if not (df.subtype == x).any()] - st.avg_tile_coverage = df['freq'].mean() - st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params) - df['sample'] = genome_name - df['scheme'] = scheme_name or scheme - df['scheme_version'] = scheme_version - df['qc_status'] = st.qc_status - df['qc_message'] = st.qc_message - df = df[df.columns[~df.columns.isin(COLUMNS_TO_REMOVE)]] - return st, df + + +def set_inconsistent_subtypes(st: Subtype, inconsistent_subtypes: List): + if len(inconsistent_subtypes) > 0: + st.are_subtypes_consistent = False + st.inconsistent_subtypes = inconsistent_subtypes + else: + st.are_subtypes_consistent = True + st.are_subtypes_consistent = None def query_reads_ac(subtype_results: List[Subtype], @@ -262,6 +342,21 @@ def query_reads_ac(subtype_results: List[Subtype], subtyping_params: Optional[SubtypingParams] = None, scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, n_threads: int = 1): + """Subtype input genomes using a scheme. + + Args: + subtype_results: Subtyping results for each input genome + dfs: detailed subtyping results for each input genome + input_genomes: input genomes; tuple of FASTA file path and genome name + scheme: bio_hansel scheme FASTA path + scheme_name: optional scheme name + subtyping_params: scheme specific subtyping parameters + scheme_subtype_counts: summary information about scheme + n_threads: number of threads to use for subtyping analysis + + Returns: + `subtype_results` and `dfs` by reference + """ if n_threads == 1: logging.info('Serial single threaded run mode on %s input genomes', len(reads)) outputs = [subtype_reads_ac(reads=fastq_files, @@ -282,3 +377,26 @@ def query_reads_ac(subtype_results: List[Subtype], if df is not None: dfs.append(df) subtype_results.append(attr.asdict(subtype)) + + +def empty_results(st: Subtype) -> pd.DataFrame: + """Get a results DataFrame with 1 entry when no results are retrieved. + + When the output DataFrame for subtyping is empty, generate a "dummy" DataFrame with one row showing that subtyping + was performed on the sample, but no results were generated because nothing was found. + + Args: + st: Subtype results; should be a null result + + Returns: + pd.DataFrame with 1 row with null result + """ + return pd.DataFrame({0: dict(sample=st.sample, + file_path=st.file_path, + subtype=st.subtype, + refposition=None, + is_pos_tile=None, + scheme=st.scheme, + scheme_version=st.scheme_version, + qc_status=st.qc_status, + qc_message=st.qc_message)}).transpose() \ No newline at end of file diff --git a/bio_hansel/utils.py b/bio_hansel/utils.py index 499c799..97124ed 100644 --- a/bio_hansel/utils.py +++ b/bio_hansel/utils.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import logging -from typing import List, Any, Optional, Tuple +from typing import List, Any, Optional, Tuple, Union import os import re @@ -51,7 +51,7 @@ def compare_subtypes(a: List[Any], b: List[Any]) -> bool: return True -def find_inconsistent_subtypes(subtypes: List[Any]) -> List[str]: +def find_inconsistent_subtypes(subtypes: List[List[int]]) -> List[str]: from collections import Counter incon = [] for i in range(len(subtypes) - 1): @@ -98,7 +98,7 @@ def get_scheme_version(scheme: str) -> Optional[str]: return None -def collect_fastq_from_dir(input_directory): +def collect_fastq_from_dir(input_directory: str) -> List[Union[str, Tuple[List[str], str]]]: fastqs = [] for x in os.listdir(input_directory): full_file_path = os.path.abspath(os.path.join(input_directory, x)) From 0159bd4b270b7928fb501f84a722b01c47e1f861 Mon Sep 17 00:00:00 2001 From: pkruczkiewicz Date: Tue, 10 Jul 2018 13:20:34 -0500 Subject: [PATCH 2/5] Add unit tests, refactoring, Signed-off-by: pkruczkiewicz --- bio_hansel/main.py | 56 ++-- bio_hansel/qc/checks.py | 52 ++-- bio_hansel/subtyper.py | 394 +++++++++++++++++------------ tests/test_qc.py | 31 ++- tests/test_subtyping_contigs.py | 45 ++-- tests/test_subtyping_processing.py | 97 +++++++ tests/test_subtyping_reads.py | 9 +- 7 files changed, 438 insertions(+), 246 deletions(-) create mode 100644 tests/test_subtyping_processing.py diff --git a/bio_hansel/main.py b/bio_hansel/main.py index 1a851cd..837bd5c 100644 --- a/bio_hansel/main.py +++ b/bio_hansel/main.py @@ -4,18 +4,20 @@ import argparse import logging import sys -import os -import re from typing import Optional, List, Any, Tuple + +import attr +import os import pandas as pd +import re from . import program_desc, __version__ from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL from .subtype import Subtype from .subtype_stats import subtype_counts from .subtyper import \ - query_contigs_ac, \ - query_reads_ac + subtype_contigs_samples, \ + subtype_reads_samples from .utils import \ genome_name_from_fasta_path, \ get_scheme_fasta, \ @@ -189,34 +191,34 @@ def main(): scheme_subtype_counts = subtype_counts(scheme_fasta) logging.debug(args) subtyping_params = init_subtyping_params(args, scheme) - input_genomes, reads = collect_inputs(args) - if len(input_genomes) == 0 and len(reads) == 0: + input_contigs, input_reads = collect_inputs(args) + if len(input_contigs) == 0 and len(input_reads) == 0: raise Exception('No input files specified!') n_threads = args.threads - subtype_results = [] # type: List[Subtype] - dfs = [] # type: List[pd.DataFrame] - if len(input_genomes) > 0: - query_contigs_ac(subtype_results=subtype_results, - dfs=dfs, - input_genomes=input_genomes, - scheme=scheme, - scheme_name=scheme_name, - subtyping_params=subtyping_params, - scheme_subtype_counts=scheme_subtype_counts, - n_threads=n_threads) - if len(reads) > 0: - query_reads_ac(subtype_results=subtype_results, - dfs=dfs, - reads=reads, - scheme=scheme, - scheme_name=scheme_name, - subtyping_params=subtyping_params, - scheme_subtype_counts=scheme_subtype_counts, - n_threads=n_threads) + subtype_results = [] # type: List[Tuple[Subtype, pd.DataFrame]] + if len(input_contigs) > 0: + contigs_results = subtype_contigs_samples(input_genomes=input_contigs, + scheme=scheme, + scheme_name=scheme_name, + subtyping_params=subtyping_params, + scheme_subtype_counts=scheme_subtype_counts, + n_threads=n_threads) + logging.info('Generated %s subtyping results from %s contigs samples', len(contigs_results), len(input_contigs)) + subtype_results += contigs_results + if len(input_reads) > 0: + reads_results = subtype_reads_samples(reads=input_reads, + scheme=scheme, + scheme_name=scheme_name, + subtyping_params=subtyping_params, + scheme_subtype_counts=scheme_subtype_counts, + n_threads=n_threads) + logging.info('Generated %s subtyping results from %s contigs samples', len(reads_results), len(input_reads)) + subtype_results += reads_results - dfsummary = pd.DataFrame(subtype_results) + dfs = [df for st, df in subtype_results] # type: List[pd.DataFrame] + dfsummary = pd.DataFrame([attr.asdict(st) for st, df in subtype_results]) dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS] if dfsummary['avg_tile_coverage'].isnull().all(): diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py index 650a518..22fb5c2 100644 --- a/bio_hansel/qc/checks.py +++ b/bio_hansel/qc/checks.py @@ -4,10 +4,10 @@ from pandas import DataFrame -from ..subtyping_params import SubtypingParams from ..qc.const import QC from ..qc.utils import get_conflicting_tiles, get_num_pos_neg_tiles, get_mixed_subtype_tile_counts from ..subtype import Subtype +from ..subtyping_params import SubtypingParams def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]: @@ -39,7 +39,7 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op if st.are_subtypes_consistent: return check_for_missing_tiles(is_fastq=st.is_fastq_input(), - curr_subtype=st.subtype, + subtype_result=st.subtype, scheme=st.scheme, df=df, exp=int(st.n_tiles_matching_all_expected), @@ -55,32 +55,52 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op mixed_subtype_counts = get_mixed_subtype_tile_counts(dfpos=dfpos, subtype_list=subtype_list) + tiles_matching_negative = st.n_tiles_matching_negative for curr_subtype, exp in zip(subtype_list, n_tiles_matching_expected): # We can omit the status because there will be a fail status already from non consistent subtypes. + + obs = mixed_subtype_counts.get(curr_subtype) + int(tiles_matching_negative) _, curr_messages = check_for_missing_tiles(is_fastq=st.is_fastq_input(), - curr_subtype=curr_subtype, + subtype_result=curr_subtype, scheme=st.scheme, df=df, exp=int(exp), - obs=(mixed_subtype_counts.get(curr_subtype) + - int(st.n_tiles_matching_negative)), + obs=obs, p=p) message_list.append(curr_messages) error_messages = ' | '.join(filter(None.__ne__, message_list)) - return _, error_messages + return QC.FAIL, error_messages + + +def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str, + df: DataFrame, exp: int, obs: int, p: SubtypingParams) -> Tuple[ + Optional[str], Optional[str]]: + """Check if there are too many missing tiles + Also check if the mean tile coverage depth is above the low coverage threshold. -def check_for_missing_tiles(is_fastq: bool, curr_subtype: str, scheme: str, - df: DataFrame, exp: int, obs: int, p: SubtypingParams): - error_status = None - error_messages = None + Args: + is_fastq: Is input sample reads? + subtype_result: Single subtype designation + scheme: Scheme name + df: Subtyping results dataframe + exp: Expected number of tiles that should be found + obs: Actual observed number of tiles found + p: Subtyping parameters + + Returns: + Tuple of QC status and any QC messages + """ + status = None + messages = None + # proportion of missing tiles p_missing = (exp - obs) / exp # type: float if p_missing > p.max_perc_missing_tiles: - error_status = QC.FAIL + status = QC.FAIL if is_fastq: tiles_with_hits = df[df['is_kmer_freq_okay']] # type: DataFrame depth = tiles_with_hits['freq'].mean() @@ -90,13 +110,13 @@ def check_for_missing_tiles(is_fastq: bool, curr_subtype: str, scheme: str, else: coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \ f'but this may be the wrong serovar or species for scheme "{scheme}"' - error_messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \ - f'tiles threshold. {coverage_msg}' + messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \ + f'tiles threshold. {coverage_msg}' else: - error_messages = f'{p_missing:.2%} missing tiles for subtype "{curr_subtype}"; more than ' \ - f'{p.max_perc_missing_tiles:.2%} missing tile threshold' + messages = f'{p_missing:.2%} missing tiles for subtype "{subtype_result}"; more than ' \ + f'{p.max_perc_missing_tiles:.2%} missing tile threshold' - return error_status, error_messages + return status, messages def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], Optional[str]]: diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index 1ac345d..db44d42 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -2,12 +2,11 @@ """ Functions for subtyping of reads (e.g. FASTQ) and contigs (e.g. FASTA) using bio_hansel-compatible subtyping schemes. """ -import re import logging from typing import Optional, List, Dict, Union, Tuple -import attr import pandas as pd +import re from .aho_corasick import init_automaton, find_in_fasta, find_in_fastqs from .const import COLUMNS_TO_REMOVE @@ -19,12 +18,84 @@ from .utils import find_inconsistent_subtypes, get_scheme_fasta, get_scheme_version, init_subtyping_params -def subtype_contigs_ac(fasta_path: str, - genome_name: str, - scheme: str, - subtyping_params: Optional[SubtypingParams] = None, - scheme_name: Optional[str] = None, - scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[ +def subtype_reads_samples(reads: List[Tuple[List[str], str]], + scheme: str, + scheme_name: Optional[str] = None, + subtyping_params: Optional[SubtypingParams] = None, + scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, + n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]: + """Subtype input genomes using a scheme. + + Args: + input_genomes: input genomes; tuple of FASTA file path and genome name + scheme: bio_hansel scheme FASTA path + scheme_name: optional scheme name + subtyping_params: scheme specific subtyping parameters + scheme_subtype_counts: summary information about scheme + n_threads: number of threads to use for subtyping analysis + + Returns: + List of tuple of Subtype and detailed subtyping results for each sample + """ + if n_threads == 1: + logging.info('Serial single threaded run mode on %s input genomes', len(reads)) + outputs = [subtype_reads(reads=fastq_files, + genome_name=genome_name, + scheme=scheme, + scheme_name=scheme_name, + subtyping_params=subtyping_params, + scheme_subtype_counts=scheme_subtype_counts) + for fastq_files, genome_name in reads] + else: + outputs = parallel_query_reads(reads=reads, + scheme=scheme, + scheme_name=scheme_name, + subtyping_params=subtyping_params, + scheme_subtype_counts=scheme_subtype_counts, + n_threads=n_threads) + return outputs + + +def subtype_contigs_samples(input_genomes: List[Tuple[str, str]], + scheme: str, + scheme_name: Optional[str] = None, + subtyping_params: Optional[SubtypingParams] = None, + scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, + n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]: + """Subtype input genomes using a scheme. + + Args: + input_genomes: input genomes; tuple of FASTA file path and genome name + scheme: bio_hansel scheme FASTA path + scheme_name: optional scheme name + subtyping_params: scheme specific subtyping parameters + scheme_subtype_counts: summary information about scheme + n_threads: number of threads to use for subtyping analysis + + Returns: + List of tuple of Subtype and detailed subtyping results for each sample + """ + if n_threads == 1: + logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes)) + outputs = [subtype_contigs(fasta_path=input_fasta, + genome_name=genome_name, + scheme=scheme, + scheme_name=scheme_name, + subtyping_params=subtyping_params, + scheme_subtype_counts=scheme_subtype_counts) + for input_fasta, genome_name in input_genomes] + else: + outputs = parallel_query_contigs(input_genomes, scheme, scheme_name, subtyping_params, scheme_subtype_counts, + n_threads) + return outputs + + +def subtype_contigs(fasta_path: str, + genome_name: str, + scheme: str, + subtyping_params: Optional[SubtypingParams] = None, + scheme_name: Optional[str] = None, + scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[ Subtype, pd.DataFrame]: """Subtype input contigs using a particular scheme. @@ -58,7 +129,7 @@ def subtype_contigs_ac(fasta_path: str, if df is None or df.shape[0] == 0: logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', fasta_path, scheme) st.qc_status = QC.FAIL - st.qc_message = QC.NO_SUBTYPE_RESULT + st.qc_message = QC.NO_TARGETS_FOUND st.are_subtypes_consistent = False return st, empty_results(st) @@ -67,7 +138,7 @@ def subtype_contigs_ac(fasta_path: str, df['refposition'] = [int(x.replace('negative', '')) for x in refpositions] df['subtype'] = subtypes df['is_pos_tile'] = ~df.tilename.str.contains('negative') - process_contigs_subtyping_results(st, df, scheme_subtype_counts) + process_subtyping_results(st, df, scheme_subtype_counts) st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params) logging.info(st) @@ -83,14 +154,12 @@ def subtype_contigs_ac(fasta_path: str, return st, df - - -def parallel_query_contigs_ac(input_genomes: List[Tuple[str, str]], - scheme: str, - scheme_name: Optional[str] = None, - subtyping_params: Optional[SubtypingParams] = None, - scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, - n_threads: int = 1): +def parallel_query_contigs(input_genomes: List[Tuple[str, str]], + scheme: str, + scheme_name: Optional[str] = None, + subtyping_params: Optional[SubtypingParams] = None, + scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, + n_threads: int = 1): """Parallel subtyping of input contigs Subtype and analyse each input in parallel using a multiprocessing thread pool. @@ -110,68 +179,24 @@ def parallel_query_contigs_ac(input_genomes: List[Tuple[str, str]], logging.info('Initializing thread pool with %s threads', n_threads) pool = Pool(processes=n_threads) logging.info('Running analysis asynchronously on %s input genomes', len(input_genomes)) - res = [pool.apply_async(subtype_contigs_ac, (input_fasta, - genome_name, - scheme, - subtyping_params, - scheme_name, - scheme_subtype_counts)) + res = [pool.apply_async(subtype_contigs, (input_fasta, + genome_name, + scheme, + subtyping_params, + scheme_name, + scheme_subtype_counts)) for input_fasta, genome_name in input_genomes] logging.info('Parallel analysis complete! Retrieving analysis results') outputs = [x.get() for x in res] return outputs -def query_contigs_ac(subtype_results: List[Subtype], - dfs: List[pd.DataFrame], - input_genomes: List[Tuple[str, str]], - scheme: str, - scheme_name: Optional[str] = None, - subtyping_params: Optional[SubtypingParams] = None, - scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, - n_threads: int = 1) -> None: - """Subtype input genomes using a scheme. - - Args: - subtype_results: Subtyping results for each input genome - dfs: detailed subtyping results for each input genome - input_genomes: input genomes; tuple of FASTA file path and genome name - scheme: bio_hansel scheme FASTA path - scheme_name: optional scheme name - subtyping_params: scheme specific subtyping parameters - scheme_subtype_counts: summary information about scheme - n_threads: number of threads to use for subtyping analysis - - Returns: - `subtype_results` and `dfs` by reference - """ - if n_threads == 1: - logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes)) - outputs = [subtype_contigs_ac(fasta_path=input_fasta, - genome_name=genome_name, - scheme=scheme, - scheme_name=scheme_name, - subtyping_params=subtyping_params, - scheme_subtype_counts=scheme_subtype_counts) - for input_fasta, genome_name in input_genomes] - else: - outputs = parallel_query_contigs_ac(input_genomes, scheme, scheme_name, subtyping_params, scheme_subtype_counts, - n_threads) - for subtype, df in outputs: - if df is not None: - dfs.append(df) - else: - logging.error(f'Subtyping results DataFrame is "None" for {subtype}. Building empty results DF.') - dfs.append(empty_results(subtype)) - subtype_results.append(attr.asdict(subtype)) - - -def parallel_query_reads_ac(reads: List[Tuple[List[str], str]], - scheme: str, - scheme_name: Optional[str] = None, - subtyping_params: Optional[SubtypingParams] = None, - scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, - n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]: +def parallel_query_reads(reads: List[Tuple[List[str], str]], + scheme: str, + scheme_name: Optional[str] = None, + subtyping_params: Optional[SubtypingParams] = None, + scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, + n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]: """Parallel subtyping of input reads Subtype and analyse each input in parallel using a multiprocessing thread pool. @@ -191,24 +216,24 @@ def parallel_query_reads_ac(reads: List[Tuple[List[str], str]], logging.info('Initializing thread pool with %s threads', n_threads) pool = Pool(processes=n_threads) logging.info('Running analysis asynchronously on %s input genomes', len(reads)) - res = [pool.apply_async(subtype_reads_ac, (fastqs, - genome_name, - scheme, - scheme_name, - subtyping_params, - scheme_subtype_counts)) + res = [pool.apply_async(subtype_reads, (fastqs, + genome_name, + scheme, + scheme_name, + subtyping_params, + scheme_subtype_counts)) for fastqs, genome_name in reads] logging.info('Parallel analysis complete! Retrieving analysis results') outputs = [x.get() for x in res] return outputs -def subtype_reads_ac(reads: Union[str, List[str]], - genome_name: str, - scheme: str, - scheme_name: Optional[str] = None, - subtyping_params: Optional[SubtypingParams] = None, - scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[ +def subtype_reads(reads: Union[str, List[str]], + genome_name: str, + scheme: str, + scheme_name: Optional[str] = None, + subtyping_params: Optional[SubtypingParams] = None, + scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[ Subtype, Optional[pd.DataFrame]]: """Subtype input reads using a particular scheme. @@ -249,7 +274,7 @@ def subtype_reads_ac(reads: Union[str, List[str]], logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', reads, scheme) st.are_subtypes_consistent = False st.qc_status = QC.FAIL - st.qc_message = QC.NO_SUBTYPE_RESULT + st.qc_message = QC.NO_TARGETS_FOUND return st, empty_results(st) refpositions = [x for x, y in df.tilename.str.split('-')] @@ -259,8 +284,9 @@ def subtype_reads_ac(reads: Union[str, List[str]], df['is_pos_tile'] = ~df.tilename.str.contains('negative') df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq) st.avg_tile_coverage = df['freq'].mean() - process_contigs_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts) + st, df = process_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts) st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params) + df['file_path'] = str(st.file_path) df['sample'] = genome_name df['scheme'] = scheme_name or scheme df['scheme_version'] = scheme_version @@ -270,113 +296,163 @@ def subtype_reads_ac(reads: Union[str, List[str]], return st, df -def process_contigs_subtyping_results(st, df, scheme_subtype_counts): +def process_subtyping_results(st: Subtype, df: pd.DataFrame, scheme_subtype_counts: Dict[str, SubtypeCounts]) -> Tuple[ + Subtype, pd.DataFrame]: + """Process the subtyping results to get the final subtype result and summary stats + + Args: + st: Subtype result + df: Subtyping results + scheme_subtype_counts: Subtyping scheme summary info + Returns: + Tuple of `st` and `df` + """ dfpos = df[df.is_pos_tile] dfpos_highest_res = highest_resolution_subtype_results(dfpos) subtype_list = [x for x in dfpos_highest_res.subtype.unique()] - set_subtype_results(st, dfpos, subtype_list) - set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_positive_subtypes(dfpos))) - calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts) - set_non_present_subtypes(st, df, scheme_subtype_counts) + st = set_subtype_results(st, dfpos, subtype_list) + st = set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_subtype_ints(dfpos.subtype))) + st = set_subtyping_stats(st, df, dfpos, dfpos_highest_res, subtype_list, scheme_subtype_counts) + st.non_present_subtypes = absent_downstream_subtypes(st.subtype, df.subtype, list(scheme_subtype_counts.keys())) + return (st, df) -def set_subtype_results(st, dfpos, subtype_list): +def set_subtype_results(st: Subtype, df_positive: pd.DataFrame, subtype_list: List[str]) -> Subtype: + """Set subtype results + + Args: + st: Subtype result + df_positive: Positive tiles subtyping results + subtype_list: List of subtypes found + """ st.subtype = '; '.join(subtype_list) st.tiles_matching_subtype = '; '.join(subtype_list) - pos_subtypes_str = [x for x in dfpos.subtype.unique()] + pos_subtypes_str = [x for x in df_positive.subtype.unique()] pos_subtypes_str.sort(key=lambda x: len(x)) st.all_subtypes = '; '.join(pos_subtypes_str) + return st -def calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts): +def set_subtyping_stats(st: Subtype, + df: pd.DataFrame, + dfpos: pd.DataFrame, + dfpos_highest_res: pd.DataFrame, + subtype_list: List[str], + scheme_subtype_counts: Dict[str, SubtypeCounts]) -> Subtype: + """Set subtyping result stats + + - `n_tiles_matching_subtype`: # tiles matching final subtype + - `n_tiles_matching_all`: # tiles found + - `n_tiles_matching_positive`: # positive tiles found + - `n_tiles_matching_negative`: # negative tiles found + - `n_tiles_matching_all_expected`: expected # of all tiles found for each subtype + - `n_tiles_matching_positive_expected`: expected # of positive tiles found for each subtype + - `n_tiles_matching_subtype_expected`: expected # of subtype-specific tiles found for each subtype + + Args: + st: Subtype result + df: Subtyping results + dfpos: Positive tile subtyping results + dfpos_highest_res: Final subtype specific subtyping results + subtype_list: List of subtypes found + scheme_subtype_counts: Subtyping scheme summary info + """ + st.n_tiles_matching_subtype = dfpos_highest_res.shape[0] st.n_tiles_matching_all = df.tilename.unique().size st.n_tiles_matching_positive = dfpos.tilename.unique().size - st.n_tiles_matching_negative = df[~df.is_pos_tile] - st.n_tiles_matching_subtype = len(subtype_list) + st.n_tiles_matching_negative = df[~df.is_pos_tile].shape[0] st.n_tiles_matching_all_expected = ';'.join([str(scheme_subtype_counts[x].all_tile_count) for x in subtype_list]) st.n_tiles_matching_positive_expected = ';'.join( [str(scheme_subtype_counts[x].positive_tile_count) for x in subtype_list]) st.n_tiles_matching_subtype_expected = ';'.join( [str(scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list]) + return st -def count_periods(subtype_with_periods: str) -> pd.Series: - return (pd.Series(list(subtype_with_periods)) == '.').sum() +def count_periods(s: str) -> int: + """Count the number of periods in a string. + + Examples: + + count_periods("2.1.1") => 2 + count_periods("1") => 0 + + Args: + s: Some string with periods + Returns: + The number of periods found in the input string. + """ + return sum((1 for c in list(s) if c == '.')) + + +def highest_resolution_subtype_results(df: pd.DataFrame) -> pd.DataFrame: + """Get the highest resolution subtype results + + Where the highest resolution result has the most periods ('.') in its designation. + + Args: + df: positive subtyping results -def highest_resolution_subtype_results(dfpos): - subtype_lens = dfpos.subtype.apply(count_periods) + Returns: + Highest resolution (most periods in subtype) subtyping results + """ + subtype_lens = df.subtype.apply(count_periods) max_subtype_strlen = subtype_lens.max() - logging.debug('max substype str len: %s', max_subtype_strlen) - dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen] - return dfpos_highest_res + return df[subtype_lens == max_subtype_strlen] -def sorted_positive_subtypes(dfpos: pd.DataFrame) -> List[List[int]]: - pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()] - pos_subtypes.sort(key=lambda a: len(a)) - return pos_subtypes +def sorted_subtype_ints(subtypes: pd.Series) -> List[List[int]]: + """Get the list of subtypes as lists of integers sorted by subtype resolution + Where the subtype resolution is determined by the number of periods ('.') in the subtype. -def set_non_present_subtypes(st, df, scheme_subtype_counts): - possible_downstream_subtypes = [s for s in scheme_subtype_counts - if re.search("^({})(\.)(\d)$".format(re.escape(st.subtype)), s)] - st.non_present_subtypes = [x for x in possible_downstream_subtypes - if not (df.subtype == x).any()] + Args: + subtypes: pd.Series of subtype strings + Return: + list of subtypes as lists of integers sorted by subtype resolution + """ + subtypes_ints = [[int(y) for y in x.split('.')] for x in subtypes.unique()] + subtypes_ints.sort(key=lambda a: len(a)) + return subtypes_ints -def set_inconsistent_subtypes(st: Subtype, inconsistent_subtypes: List): - if len(inconsistent_subtypes) > 0: - st.are_subtypes_consistent = False - st.inconsistent_subtypes = inconsistent_subtypes - else: - st.are_subtypes_consistent = True - st.are_subtypes_consistent = None +def absent_downstream_subtypes(subtype: str, subtypes: pd.Series, scheme_subtypes: List[str]) -> Optional[List[str]]: + """Find the downstream subtypes that are not present in the results + + Args: + subtype: Final subtype result + subtypes: Subtypes found + scheme_subtypes: Possible subtyping scheme subtypes + + Returns: + List of downstream subtypes that are not present in the results or `None` if all downstream subtypes are present + """ + escaped_subtype = re.escape(subtype) + re_subtype = re.compile(r'^{}\.\d+$'.format(escaped_subtype)) + downstream_subtypes = [s for s in scheme_subtypes if re_subtype.search(s)] + absentees = [x for x in downstream_subtypes if not (subtypes == x).any()] + return absentees if len(absentees) > 0 else None -def query_reads_ac(subtype_results: List[Subtype], - dfs: List[pd.DataFrame], - reads: List[Tuple[List[str], str]], - scheme: str, - scheme_name: Optional[str] = None, - subtyping_params: Optional[SubtypingParams] = None, - scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None, - n_threads: int = 1): - """Subtype input genomes using a scheme. + +def set_inconsistent_subtypes(st: Subtype, inconsistent_subtypes: List[str]) -> Subtype: + """Save the list of inconsistent subtypes, if any, to the Subtype result Args: - subtype_results: Subtyping results for each input genome - dfs: detailed subtyping results for each input genome - input_genomes: input genomes; tuple of FASTA file path and genome name - scheme: bio_hansel scheme FASTA path - scheme_name: optional scheme name - subtyping_params: scheme specific subtyping parameters - scheme_subtype_counts: summary information about scheme - n_threads: number of threads to use for subtyping analysis + st: Subtype result + inconsistent_subtypes: List of inconsistent subtypes Returns: - `subtype_results` and `dfs` by reference + """ - if n_threads == 1: - logging.info('Serial single threaded run mode on %s input genomes', len(reads)) - outputs = [subtype_reads_ac(reads=fastq_files, - genome_name=genome_name, - scheme=scheme, - scheme_name=scheme_name, - subtyping_params=subtyping_params, - scheme_subtype_counts=scheme_subtype_counts) - for fastq_files, genome_name in reads] + if len(inconsistent_subtypes) > 0: + st.are_subtypes_consistent = False + st.inconsistent_subtypes = inconsistent_subtypes else: - outputs = parallel_query_reads_ac(reads=reads, - scheme=scheme, - scheme_name=scheme_name, - subtyping_params=subtyping_params, - scheme_subtype_counts=scheme_subtype_counts, - n_threads=n_threads) - for subtype, df in outputs: - if df is not None: - dfs.append(df) - subtype_results.append(attr.asdict(subtype)) + st.are_subtypes_consistent = True + st.inconsistent_subtypes = None + return st def empty_results(st: Subtype) -> pd.DataFrame: @@ -399,4 +475,4 @@ def empty_results(st: Subtype) -> pd.DataFrame: scheme=st.scheme, scheme_version=st.scheme_version, qc_status=st.qc_status, - qc_message=st.qc_message)}).transpose() \ No newline at end of file + qc_message=st.qc_message)}).transpose() diff --git a/tests/test_qc.py b/tests/test_qc.py index 9b7b914..55859fb 100644 --- a/tests/test_qc.py +++ b/tests/test_qc.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- -from bio_hansel.qc import is_maybe_intermediate_subtype -from bio_hansel.utils import init_subtyping_params -from pandas import DataFrame +import pandas as pd +from bio_hansel.qc import is_maybe_intermediate_subtype from bio_hansel.qc.const import QC from bio_hansel.subtype import Subtype -from bio_hansel.subtyper import subtype_reads_ac, subtype_contigs_ac -import pandas as pd +from bio_hansel.subtyper import subtype_reads, subtype_contigs +from bio_hansel.utils import init_subtyping_params genome_name = 'test' @@ -15,9 +14,9 @@ def test_low_coverage(): scheme = 'heidelberg' fastq = 'tests/data/SRR1696752/SRR1696752.fastq' - st, df = subtype_reads_ac(reads=fastq, genome_name=genome_name, scheme=scheme) + st, df = subtype_reads(reads=fastq, genome_name=genome_name, scheme=scheme) assert isinstance(st, Subtype) - assert isinstance(df, DataFrame) + assert isinstance(df, pd.DataFrame) assert st.is_fastq_input() assert st.scheme == scheme assert 'Low coverage for all tiles (7.439 < 20 expected)' in st.qc_message @@ -53,7 +52,7 @@ def test_intermediate_subtype(): p = init_subtyping_params(args=None, scheme=scheme) st.qc_status, st.qc_message = is_maybe_intermediate_subtype(st, df, p) assert isinstance(st, Subtype) - assert isinstance(df, DataFrame) + assert isinstance(df, pd.DataFrame) assert st.scheme == scheme assert "Total subtype matches observed (n=3) vs expected (n=6)" in st.qc_message assert st.qc_status == QC.WARNING @@ -62,9 +61,9 @@ def test_intermediate_subtype(): def test_missing_tiles(): scheme = 'heidelberg' fastq = 'tests/data/SRR1696752/SRR1696752.fastq' - st, df = subtype_reads_ac(reads=fastq, genome_name=genome_name, scheme=scheme) + st, df = subtype_reads(reads=fastq, genome_name=genome_name, scheme=scheme) assert isinstance(st, Subtype) - assert isinstance(df, DataFrame) + assert isinstance(df, pd.DataFrame) assert st.is_fastq_input() assert st.scheme == scheme assert 'Low coverage depth (10.9 < 20.0 expected)' in st.qc_message @@ -74,9 +73,9 @@ def test_missing_tiles(): def test_mixed_tiles(): scheme = 'heidelberg' fastqs = ['tests/data/SRR3392166/SRR3392166.fastq', 'tests/data/SRR3392166/SRR3392166.fastq'] - st, df = subtype_reads_ac(reads=fastqs, genome_name=genome_name, scheme=scheme) + st, df = subtype_reads(reads=fastqs, genome_name=genome_name, scheme=scheme) assert isinstance(st, Subtype) - assert isinstance(df, DataFrame) + assert isinstance(df, pd.DataFrame) assert st.scheme == scheme assert 'Mixed subtypes found: "1; 2; 2.1"' in st.qc_message assert st.qc_status == QC.FAIL @@ -85,9 +84,9 @@ def test_mixed_tiles(): def test_mixed_subtype_positive_negative_tiles_same_target(): scheme = 'heidelberg' fasta = 'tests/data/fail-qc-mixed-subtype-pos-neg-tiles.fasta' - st, df = subtype_contigs_ac(fasta_path=fasta, genome_name=genome_name, scheme=scheme) + st, df = subtype_contigs(fasta_path=fasta, genome_name=genome_name, scheme=scheme) assert isinstance(st, Subtype) - assert isinstance(df, DataFrame) + assert isinstance(df, pd.DataFrame) assert st.scheme == scheme assert st.qc_status == QC.FAIL expected_qc_msg = ('FAIL: Mixed subtype; the positive and negative tiles were found for the same ' @@ -103,9 +102,9 @@ def test_mixed_subtype_positive_negative_tiles_same_target(): def test_unconfident_subtype(): scheme = 'enteritidis' fasta = 'tests/data/fail-qc-unconfident-subtype.fasta' - st, df = subtype_contigs_ac(fasta_path=fasta, genome_name=genome_name, scheme=scheme) + st, df = subtype_contigs(fasta_path=fasta, genome_name=genome_name, scheme=scheme) assert isinstance(st, Subtype) - assert isinstance(df, DataFrame) + assert isinstance(df, pd.DataFrame) assert st.scheme == scheme assert st.qc_status == QC.FAIL assert QC.UNCONFIDENT_RESULTS_ERROR_4 in st.qc_message diff --git a/tests/test_subtyping_contigs.py b/tests/test_subtyping_contigs.py index 90c8194..3e1225b 100644 --- a/tests/test_subtyping_contigs.py +++ b/tests/test_subtyping_contigs.py @@ -1,11 +1,10 @@ import pytest -from pandas import DataFrame, Series -import numpy as np -from bio_hansel.subtype import Subtype -from bio_hansel.subtyper import subtype_contigs_ac +from pandas import DataFrame + from bio_hansel.const import SCHEME_FASTAS from bio_hansel.qc.const import QC - +from bio_hansel.subtype import Subtype +from bio_hansel.subtyper import subtype_contigs from . import check_subtype_attrs, check_df_fasta_cols genome_name = 'test' @@ -76,12 +75,12 @@ def subtype_enteritidis_fail(): def test_heidelberg_fasta_ac(subtype_heidelberg_pass): - st, df = subtype_contigs_ac(fasta_path=fasta_heidelberg_pass, - genome_name=genome_name, - scheme=scheme_heidelberg) - stgz, dfgz = subtype_contigs_ac(fasta_path=fasta_gz_heidelberg_pass, - genome_name=genome_name, - scheme=scheme_heidelberg) + st, df = subtype_contigs(fasta_path=fasta_heidelberg_pass, + genome_name=genome_name, + scheme=scheme_heidelberg) + stgz, dfgz = subtype_contigs(fasta_path=fasta_gz_heidelberg_pass, + genome_name=genome_name, + scheme=scheme_heidelberg) assert isinstance(st, Subtype) assert isinstance(df, DataFrame) assert isinstance(stgz, Subtype) @@ -92,12 +91,12 @@ def test_heidelberg_fasta_ac(subtype_heidelberg_pass): def test_enteritidis_scheme_vs_qc_failing_contigs_unconfident_ac(subtype_enteritidis_fail_unconfident): - st, df = subtype_contigs_ac(fasta_path=fasta_enteritidis_unconfident, - genome_name=genome_name, - scheme=scheme_enteritidis) - stgz, dfgz = subtype_contigs_ac(fasta_path=fasta_gz_enteritidis_unconfident, - genome_name=genome_name, - scheme=scheme_enteritidis) + st, df = subtype_contigs(fasta_path=fasta_enteritidis_unconfident, + genome_name=genome_name, + scheme=scheme_enteritidis) + stgz, dfgz = subtype_contigs(fasta_path=fasta_gz_enteritidis_unconfident, + genome_name=genome_name, + scheme=scheme_enteritidis) assert isinstance(st, Subtype) assert isinstance(df, DataFrame) assert isinstance(stgz, Subtype) @@ -110,12 +109,12 @@ def test_enteritidis_scheme_vs_qc_failing_contigs_unconfident_ac(subtype_enterit def test_ac_vs_bad_contigs(subtype_enteritidis_fail): - st, df = subtype_contigs_ac(fasta_path=fasta_enteritidis_fail, - genome_name=genome_name, - scheme=scheme_enteritidis) - stgz, dfgz = subtype_contigs_ac(fasta_path=fasta_gz_enteritidis_fail, - scheme=scheme_enteritidis, - genome_name=genome_name) + st, df = subtype_contigs(fasta_path=fasta_enteritidis_fail, + genome_name=genome_name, + scheme=scheme_enteritidis) + stgz, dfgz = subtype_contigs(fasta_path=fasta_gz_enteritidis_fail, + scheme=scheme_enteritidis, + genome_name=genome_name) assert isinstance(st, Subtype) assert isinstance(df, DataFrame) assert isinstance(stgz, Subtype) diff --git a/tests/test_subtyping_processing.py b/tests/test_subtyping_processing.py new file mode 100644 index 0000000..72fd527 --- /dev/null +++ b/tests/test_subtyping_processing.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +import pandas as pd + +from bio_hansel.qc import QC +from bio_hansel.subtype import Subtype +from bio_hansel.subtyper import absent_downstream_subtypes, sorted_subtype_ints, count_periods, empty_results +from bio_hansel.utils import find_inconsistent_subtypes + + +def test_count_periods(): + assert count_periods('1.1.1') == 2 + assert count_periods('1.1.10000') == 2 + assert count_periods('1.10000') == 1 + assert count_periods('10000') == 0 + + +def test_absent_downstream_subtypes(): + assert absent_downstream_subtypes('1', pd.Series(['1.1', '1.2', '1.3', '1']), ['1.1', '1.2', '1', '1.3']) is None + assert absent_downstream_subtypes('1', pd.Series(['1.1', '1.2', '1']), ['1.1', '1.2', '1', '1.3']) == ['1.3'] + assert absent_downstream_subtypes('1', pd.Series(['1']), ['1.1', '1.2', '1', '1.3']) == ['1.1', '1.2', '1.3'] + + +def test_sorted_subtype_ints(): + assert sorted_subtype_ints(pd.Series([])) == [] + assert sorted_subtype_ints(pd.Series(['1', '1.1', '1.1.1', '1.1.1.99'])) == [[1], [1, 1], [1, 1, 1], [1, 1, 1, 99]] + assert sorted_subtype_ints(pd.Series(['1', '1.1', '1.1.1', '1.1.1.99', '1.1', '1.1.1'])) == [[1], [1, 1], [1, 1, 1], + [1, 1, 1, 99]] + + +def test_empty_results(): + st = Subtype(sample='test', + file_path='tests/data/Retro1000data/10-1358.fastq', + scheme='enteritidis', + scheme_version='0.8.0', + subtype=None, + non_present_subtypes=None, + all_subtypes=None, + qc_status=QC.FAIL, + qc_message=QC.NO_TARGETS_FOUND) + df_empty = empty_results(st) + df_expected_empty = pd.DataFrame({0: dict(sample='test', + file_path='tests/data/Retro1000data/10-1358.fastq', + subtype=None, + refposition=None, + is_pos_tile=None, + scheme='enteritidis', + scheme_version='0.8.0', + qc_status=QC.FAIL, + qc_message=QC.NO_TARGETS_FOUND)}).transpose() + assert ((df_empty == df_expected_empty) | (df_empty.isnull() == df_expected_empty.isnull())).values.all(), \ + f'Empty result DataFrame should equal df_expected_empty: {df_expected_empty}' + + +def test_find_inconsistent_subtypes(): + consistent_subtypes = sorted_subtype_ints(pd.Series(''' +1 +1.1 +1.1.1 +1.1.1.1 + '''.strip().split('\n'))) + + assert find_inconsistent_subtypes(consistent_subtypes) == [], \ + 'Expecting all subtypes to be consistent with each other' + + inconsistent_subtypes = sorted_subtype_ints(pd.Series(''' +1 +1.1 +1.1.1 +1.1.1.1 +1.1.1.2 +1.1.1.3 + '''.strip().split('\n'))) + + exp_incon_subtypes = ''' +1.1.1.1 +1.1.1.2 +1.1.1.3 + '''.strip().split('\n') + + assert find_inconsistent_subtypes(inconsistent_subtypes) == exp_incon_subtypes, \ + f'Expecting subtypes {exp_incon_subtypes} to be inconsistent with each other' + + subtypes_list = ''' +1 +1.1 +1.1.1 +1.1.1.1 +1.1.1.2 +1.1.1.3 +1.1.2 +2 + '''.strip().split('\n') + + inconsistent_subtypes = sorted_subtype_ints(pd.Series(subtypes_list)) + assert set(find_inconsistent_subtypes(inconsistent_subtypes)) == set(subtypes_list), \ + f'All subtypes should be inconsistent with each other in {subtypes_list}' diff --git a/tests/test_subtyping_reads.py b/tests/test_subtyping_reads.py index 5e69bd4..7c3e510 100644 --- a/tests/test_subtyping_reads.py +++ b/tests/test_subtyping_reads.py @@ -1,11 +1,10 @@ import pytest from pandas import DataFrame -from bio_hansel.subtype import Subtype -from bio_hansel.subtyper import subtype_reads_ac from bio_hansel.const import SCHEME_FASTAS from bio_hansel.qc.const import QC - +from bio_hansel.subtype import Subtype +from bio_hansel.subtyper import subtype_reads from . import check_df_fastq_cols, check_subtype_attrs genome_name = 'test' @@ -53,8 +52,8 @@ def subtype_enteritidis_fail(): def test_heidelberg_scheme_vs_qc_passing_reads_with_ac(subtype_heidelberg_pass): - st, df = subtype_reads_ac(reads=fastq_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg) - stgz, dfgz = subtype_reads_ac(reads=fastq_gz_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg) + st, df = subtype_reads(reads=fastq_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg) + stgz, dfgz = subtype_reads(reads=fastq_gz_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg) assert isinstance(st, Subtype) assert isinstance(df, DataFrame) assert isinstance(stgz, Subtype) From acf4335341097e0583a4be7c271fa860cc7312de Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Fri, 13 Jul 2018 10:05:36 -0500 Subject: [PATCH 3/5] Update main.py organize imports --- bio_hansel/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bio_hansel/main.py b/bio_hansel/main.py index 01ca854..dd7062a 100644 --- a/bio_hansel/main.py +++ b/bio_hansel/main.py @@ -4,12 +4,12 @@ import argparse import logging import sys +import re +import os from typing import Optional, List, Any, Tuple import attr -import os import pandas as pd -import re from . import program_desc, __version__ from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL From 0a8fe89c63c55b00d0c4867f9999586d7980a51c Mon Sep 17 00:00:00 2001 From: pkruczkiewicz Date: Fri, 13 Jul 2018 10:45:54 -0500 Subject: [PATCH 4/5] Update checks.py to use import pandas as pd for consistency Signed-off-by: pkruczkiewicz --- bio_hansel/qc/checks.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py index 22fb5c2..5313bf5 100644 --- a/bio_hansel/qc/checks.py +++ b/bio_hansel/qc/checks.py @@ -2,7 +2,7 @@ from typing import Tuple, Optional -from pandas import DataFrame +import pandas as pd from ..qc.const import QC from ..qc.utils import get_conflicting_tiles, get_num_pos_neg_tiles, get_mixed_subtype_tile_counts @@ -10,7 +10,7 @@ from ..subtyping_params import SubtypingParams -def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]: +def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]: if not st.are_subtypes_consistent \ or st.subtype is None \ or not st.is_fastq_input(): @@ -21,7 +21,7 @@ def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> T return None, None -def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]: +def is_missing_tiles(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]: """Are there more missing tiles than tolerated? Note: @@ -75,9 +75,13 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op return QC.FAIL, error_messages -def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str, - df: DataFrame, exp: int, obs: int, p: SubtypingParams) -> Tuple[ - Optional[str], Optional[str]]: +def check_for_missing_tiles(is_fastq: bool, + subtype_result: str, + scheme: str, + df: pd.DataFrame, + exp: int, + obs: int, + p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]: """Check if there are too many missing tiles Also check if the mean tile coverage depth is above the low coverage threshold. @@ -102,7 +106,7 @@ def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str, if p_missing > p.max_perc_missing_tiles: status = QC.FAIL if is_fastq: - tiles_with_hits = df[df['is_kmer_freq_okay']] # type: DataFrame + tiles_with_hits = df[df['is_kmer_freq_okay']] # type: pd.DataFrame depth = tiles_with_hits['freq'].mean() if depth < p.low_coverage_depth_freq: coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \ @@ -119,7 +123,7 @@ def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str, return status, messages -def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], Optional[str]]: +def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str], Optional[str]]: """Is the subtype result mixed? Note: @@ -146,7 +150,7 @@ def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], f'the same target site{s} {positions} for subtype "{st.subtype}".' -def is_missing_too_many_target_sites(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[ +def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[ Optional[str], Optional[str]]: """Are there too many missing target sites for an expected subtype? @@ -200,7 +204,7 @@ def is_missing_downstream_targets(st: Subtype, *args) -> Tuple[Optional[str], Op return None, None -def is_maybe_intermediate_subtype(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[ +def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[ Optional[str], Optional[str]]: """Is the result a possible intermediate subtype? From 5eea6dc28c2a165bfbeaa36cd2e5cd0e62d3a4c6 Mon Sep 17 00:00:00 2001 From: pkruczkiewicz Date: Fri, 13 Jul 2018 10:53:32 -0500 Subject: [PATCH 5/5] Update subtyper.py; fix docstring, renaming a few vars, refactoring Signed-off-by: pkruczkiewicz --- bio_hansel/subtyper.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py index db44d42..7ebb4f4 100644 --- a/bio_hansel/subtyper.py +++ b/bio_hansel/subtyper.py @@ -27,7 +27,7 @@ def subtype_reads_samples(reads: List[Tuple[List[str], str]], """Subtype input genomes using a scheme. Args: - input_genomes: input genomes; tuple of FASTA file path and genome name + reads: input genomes; tuple of list of FASTQ file paths and genome name scheme: bio_hansel scheme FASTA path scheme_name: optional scheme name subtyping_params: scheme specific subtyping parameters @@ -123,8 +123,8 @@ def subtype_contigs(fasta_path: str, scheme_version=scheme_version, scheme_subtype_counts=scheme_subtype_counts) - A = init_automaton(scheme_fasta) - df = find_in_fasta(A, fasta_path) + automaton = init_automaton(scheme_fasta) + df = find_in_fasta(automaton, fasta_path) if df is None or df.shape[0] == 0: logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', fasta_path, scheme) @@ -262,11 +262,11 @@ def subtype_reads(reads: Union[str, List[str]], scheme_version=scheme_version, scheme_subtype_counts=scheme_subtype_counts) - A = init_automaton(scheme_fasta) + automaton = init_automaton(scheme_fasta) if isinstance(reads, str): - df = find_in_fastqs(A, reads) + df = find_in_fastqs(automaton, reads) elif isinstance(reads, list): - df = find_in_fastqs(A, *reads) + df = find_in_fastqs(automaton, *reads) else: raise ValueError('Unexpected type "{}" for "reads": {}'.format(type(reads), reads)) @@ -314,7 +314,7 @@ def process_subtyping_results(st: Subtype, df: pd.DataFrame, scheme_subtype_coun st = set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_subtype_ints(dfpos.subtype))) st = set_subtyping_stats(st, df, dfpos, dfpos_highest_res, subtype_list, scheme_subtype_counts) st.non_present_subtypes = absent_downstream_subtypes(st.subtype, df.subtype, list(scheme_subtype_counts.keys())) - return (st, df) + return st, df def set_subtype_results(st: Subtype, df_positive: pd.DataFrame, subtype_list: List[str]) -> Subtype: @@ -427,7 +427,8 @@ def absent_downstream_subtypes(subtype: str, subtypes: pd.Series, scheme_subtype scheme_subtypes: Possible subtyping scheme subtypes Returns: - List of downstream subtypes that are not present in the results or `None` if all downstream subtypes are present + List of downstream subtypes that are not present in the results or `None` if all immediately downstream + subtypes are present. """ escaped_subtype = re.escape(subtype) re_subtype = re.compile(r'^{}\.\d+$'.format(escaped_subtype))