From 73365e772e47b6d69429563ab1253e3f5225f26f Mon Sep 17 00:00:00 2001
From: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
Date: Fri, 6 Jul 2018 17:03:11 -0500
Subject: [PATCH 1/5] adds 1 row DF for null subtyping result; refactoring

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
---
 bio_hansel/qc/const.py |   1 +
 bio_hansel/subtyper.py | 260 ++++++++++++++++++++++++++++++-----------
 bio_hansel/utils.py    |   6 +-
 3 files changed, 193 insertions(+), 74 deletions(-)

diff --git a/bio_hansel/qc/const.py b/bio_hansel/qc/const.py
index f30a86f..b2effc9 100644
--- a/bio_hansel/qc/const.py
+++ b/bio_hansel/qc/const.py
@@ -7,3 +7,4 @@ class QC:
     AMBIGUOUS_RESULTS_ERROR_3 = 'Ambiguous Results Error 3'
     UNCONFIDENT_RESULTS_ERROR_4 = 'Unconfident Results Error 4'
     NO_SUBTYPE_RESULT = 'No subtype result!'
+    NO_TARGETS_FOUND = 'No tiles/targets were found in this sample.'
diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py
index 41456f2..1ac345d 100644
--- a/bio_hansel/subtyper.py
+++ b/bio_hansel/subtyper.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
-
+"""
+Functions for subtyping of reads (e.g. FASTQ) and contigs (e.g. FASTA) using bio_hansel-compatible subtyping schemes.
+"""
 import re
 import logging
 from typing import Optional, List, Dict, Union, Tuple
@@ -9,7 +11,7 @@
 
 from .aho_corasick import init_automaton, find_in_fasta, find_in_fastqs
 from .const import COLUMNS_TO_REMOVE
-from .qc import perform_quality_check
+from .qc import perform_quality_check, QC
 from .subtype import Subtype
 from .subtype_stats import SubtypeCounts
 from .subtype_stats import subtype_counts
@@ -23,7 +25,21 @@ def subtype_contigs_ac(fasta_path: str,
                        subtyping_params: Optional[SubtypingParams] = None,
                        scheme_name: Optional[str] = None,
                        scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[
-    Subtype, Optional[pd.DataFrame]]:
+    Subtype, pd.DataFrame]:
+    """Subtype input contigs using a particular scheme.
+
+    Args:
+        fasta_path: Input FASTA file path
+        genome_name: Input genome name
+        scheme: bio_hansel scheme FASTA path
+        subtyping_params: scheme specific subtyping parameters
+        scheme_name: optional scheme name
+        scheme_subtype_counts: summary information about scheme
+
+    Returns:
+        - Subtype result
+        - pd.DataFrame of detailed subtyping results
+    """
     scheme_fasta = get_scheme_fasta(scheme)
     if scheme_subtype_counts is None:
         scheme_subtype_counts = subtype_counts(scheme_fasta)
@@ -41,48 +57,17 @@ def subtype_contigs_ac(fasta_path: str,
 
     if df is None or df.shape[0] == 0:
         logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', fasta_path, scheme)
+        st.qc_status = QC.FAIL
+        st.qc_message = QC.NO_SUBTYPE_RESULT
         st.are_subtypes_consistent = False
-        return st, None
+        return st, empty_results(st)
 
     refpositions = [x for x, y in df.tilename.str.split('-')]
     subtypes = [y for x, y in df.tilename.str.split('-')]
     df['refposition'] = [int(x.replace('negative', '')) for x in refpositions]
     df['subtype'] = subtypes
     df['is_pos_tile'] = ~df.tilename.str.contains('negative')
-    dfpos = df[df.is_pos_tile]
-    subtype_lens = dfpos.subtype.apply(len)
-    max_subtype_strlen = subtype_lens.max()
-    logging.debug('max substype str len: %s', max_subtype_strlen)
-    dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen]
-    logging.debug('dfpos_highest_res: %s', dfpos_highest_res)
-    pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()]
-    pos_subtypes.sort(key=lambda a: len(a))
-    logging.debug('pos_subtypes: %s', pos_subtypes)
-    inconsistent_subtypes = find_inconsistent_subtypes(pos_subtypes)
-    logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes)
-    st.n_tiles_matching_all = df.tilename.unique().size
-    st.n_tiles_matching_positive = dfpos.tilename.unique().size
-    st.n_tiles_matching_subtype = dfpos_highest_res.tilename.unique().size
-    pos_subtypes_str = [x for x in dfpos.subtype.unique()]
-    pos_subtypes_str.sort(key=lambda x: len(x))
-    st.all_subtypes = '; '.join(pos_subtypes_str)
-    subtype_list = [x for x in dfpos_highest_res.subtype.unique()]
-    st.subtype = '; '.join(subtype_list)
-    st.n_tiles_matching_all_expected = ';'.join([str(scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
-    st.n_tiles_matching_positive_expected = ';'.join(
-        [str(scheme_subtype_counts[x].positive_tile_count) for x in subtype_list])
-    st.n_tiles_matching_subtype_expected = ';'.join(
-        [str(scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
-    st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename.unique()])
-
-    if len(inconsistent_subtypes) > 0:
-        st.are_subtypes_consistent = False
-        st.inconsistent_subtypes = inconsistent_subtypes
-
-    possible_downstream_subtypes = [s for s in scheme_subtype_counts
-                                    if re.search("^({})(\.)(\d)$".format(re.escape(st.subtype)), s)]
-    st.non_present_subtypes = [x for x in possible_downstream_subtypes
-                               if not (df.subtype == x).any()]
+    process_contigs_subtyping_results(st, df, scheme_subtype_counts)
     st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
 
     logging.info(st)
@@ -98,12 +83,29 @@ def subtype_contigs_ac(fasta_path: str,
     return st, df
 
 
+
+
 def parallel_query_contigs_ac(input_genomes: List[Tuple[str, str]],
                               scheme: str,
                               scheme_name: Optional[str] = None,
                               subtyping_params: Optional[SubtypingParams] = None,
                               scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
                               n_threads: int = 1):
+    """Parallel subtyping of input contigs
+
+    Subtype and analyse each input in parallel using a multiprocessing thread pool.
+
+    Args:
+        input_genomes: Input genome FASTA paths
+        scheme: bio_hansel scheme FASTA path to subtype with
+        scheme_name: optional scheme name
+        subtyping_params: scheme specific subtyping parameters
+        scheme_subtype_counts: scheme summary information
+        n_threads: Number of threads to use
+
+    Returns:
+        A list of tuples of Subtype results and a pd.DataFrame of detailed subtyping results for each input
+    """
     from multiprocessing import Pool
     logging.info('Initializing thread pool with %s threads', n_threads)
     pool = Pool(processes=n_threads)
@@ -128,6 +130,21 @@ def query_contigs_ac(subtype_results: List[Subtype],
                      subtyping_params: Optional[SubtypingParams] = None,
                      scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
                      n_threads: int = 1) -> None:
+    """Subtype input genomes using a scheme.
+
+    Args:
+        subtype_results: Subtyping results for each input genome
+        dfs: detailed subtyping results for each input genome
+        input_genomes: input genomes; tuple of FASTA file path and genome name
+        scheme: bio_hansel scheme FASTA path
+        scheme_name: optional scheme name
+        subtyping_params: scheme specific subtyping parameters
+        scheme_subtype_counts: summary information about scheme
+        n_threads: number of threads to use for subtyping analysis
+
+    Returns:
+        `subtype_results` and `dfs` by reference
+    """
     if n_threads == 1:
         logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes))
         outputs = [subtype_contigs_ac(fasta_path=input_fasta,
@@ -143,6 +160,9 @@ def query_contigs_ac(subtype_results: List[Subtype],
     for subtype, df in outputs:
         if df is not None:
             dfs.append(df)
+        else:
+            logging.error(f'Subtyping results DataFrame is "None" for {subtype}. Building empty results DF.')
+            dfs.append(empty_results(subtype))
         subtype_results.append(attr.asdict(subtype))
 
 
@@ -152,6 +172,21 @@ def parallel_query_reads_ac(reads: List[Tuple[List[str], str]],
                             subtyping_params: Optional[SubtypingParams] = None,
                             scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
                             n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]:
+    """Parallel subtyping of input reads
+
+    Subtype and analyse each input in parallel using a multiprocessing thread pool.
+
+    Args:
+        reads: Input reads; list of tuples of FASTQ file paths and genome names
+        scheme: bio_hansel scheme FASTA path
+        scheme_name: optional scheme name
+        subtyping_params: scheme specific subtyping parameters
+        scheme_subtype_counts: scheme summary information
+        n_threads: number of threads to use
+
+    Returns:
+        A list of tuples of Subtype results and a pd.DataFrame of detailed subtyping results for each input
+    """
     from multiprocessing import Pool
     logging.info('Initializing thread pool with %s threads', n_threads)
     pool = Pool(processes=n_threads)
@@ -175,8 +210,20 @@ def subtype_reads_ac(reads: Union[str, List[str]],
                      subtyping_params: Optional[SubtypingParams] = None,
                      scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[
     Subtype, Optional[pd.DataFrame]]:
-    logging.info("genome_name %s", genome_name)
+    """Subtype input reads using a particular scheme.
 
+    Args:
+        reads: Input FASTQ file path(s)
+        genome_name: Input genome name
+        scheme: bio_hansel scheme FASTA path
+        scheme_name: optional scheme name
+        subtyping_params: scheme specific subtyping parameters
+        scheme_subtype_counts: summary information about scheme
+
+    Returns:
+        - Subtype result
+        - pd.DataFrame of detailed subtyping results
+    """
     scheme_fasta = get_scheme_fasta(scheme)
     if scheme_subtype_counts is None:
         scheme_subtype_counts = subtype_counts(scheme_fasta)
@@ -201,7 +248,9 @@ def subtype_reads_ac(reads: Union[str, List[str]],
     if df is None or df.shape[0] == 0:
         logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', reads, scheme)
         st.are_subtypes_consistent = False
-        return st, None
+        st.qc_status = QC.FAIL
+        st.qc_message = QC.NO_SUBTYPE_RESULT
+        return st, empty_results(st)
 
     refpositions = [x for x, y in df.tilename.str.split('-')]
     subtypes = [y for x, y in df.tilename.str.split('-')]
@@ -209,49 +258,80 @@ def subtype_reads_ac(reads: Union[str, List[str]],
     df['subtype'] = subtypes
     df['is_pos_tile'] = ~df.tilename.str.contains('negative')
     df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq)
-    dfgood = df[df.is_kmer_freq_okay]
-    dfpos = dfgood[dfgood.is_pos_tile]
-    dfneg = dfgood[~dfgood.is_pos_tile]
-    subtype_lens = dfpos.subtype.apply(len)
-    max_subtype_strlen = subtype_lens.max()
-    logging.debug('max substype str len: %s', max_subtype_strlen)
-    dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen]
-    pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()]
-    pos_subtypes.sort(key=lambda a: len(a))
-    logging.debug('pos_subtypes: %s', pos_subtypes)
-    inconsistent_subtypes = find_inconsistent_subtypes(pos_subtypes)
-    logging.debug('inconsistent_subtypes: %s', inconsistent_subtypes)
-    st.n_tiles_matching_all = dfgood.shape[0]
-    st.n_tiles_matching_positive = dfpos.shape[0]
-    st.n_tiles_matching_negative = dfneg.shape[0]
-    st.n_tiles_matching_subtype = dfpos_highest_res.shape[0]
+    st.avg_tile_coverage = df['freq'].mean()
+    process_contigs_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts)
+    st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
+    df['sample'] = genome_name
+    df['scheme'] = scheme_name or scheme
+    df['scheme_version'] = scheme_version
+    df['qc_status'] = st.qc_status
+    df['qc_message'] = st.qc_message
+    df = df[df.columns[~df.columns.isin(COLUMNS_TO_REMOVE)]]
+    return st, df
+
+
+def process_contigs_subtyping_results(st, df, scheme_subtype_counts):
+    dfpos = df[df.is_pos_tile]
+    dfpos_highest_res = highest_resolution_subtype_results(dfpos)
+    subtype_list = [x for x in dfpos_highest_res.subtype.unique()]
+    set_subtype_results(st, dfpos, subtype_list)
+    set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_positive_subtypes(dfpos)))
+    calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts)
+    set_non_present_subtypes(st, df, scheme_subtype_counts)
+
+
+def set_subtype_results(st, dfpos, subtype_list):
+    st.subtype = '; '.join(subtype_list)
+    st.tiles_matching_subtype = '; '.join(subtype_list)
     pos_subtypes_str = [x for x in dfpos.subtype.unique()]
     pos_subtypes_str.sort(key=lambda x: len(x))
     st.all_subtypes = '; '.join(pos_subtypes_str)
-    subtype_list = [x for x in dfpos_highest_res.subtype.unique()]
-    st.subtype = '; '.join(subtype_list)
+
+
+def calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts):
+    st.n_tiles_matching_all = df.tilename.unique().size
+    st.n_tiles_matching_positive = dfpos.tilename.unique().size
+    st.n_tiles_matching_negative = df[~df.is_pos_tile]
+    st.n_tiles_matching_subtype = len(subtype_list)
     st.n_tiles_matching_all_expected = ';'.join([str(scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
     st.n_tiles_matching_positive_expected = ';'.join(
         [str(scheme_subtype_counts[x].positive_tile_count) for x in subtype_list])
     st.n_tiles_matching_subtype_expected = ';'.join(
         [str(scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
-    st.tiles_matching_subtype = '; '.join([x for x in dfpos_highest_res.tilename.unique()])
-    if len(inconsistent_subtypes) > 0:
-        st.are_subtypes_consistent = False
-        st.inconsistent_subtypes = inconsistent_subtypes
+
+
+def count_periods(subtype_with_periods: str) -> pd.Series:
+    return (pd.Series(list(subtype_with_periods)) == '.').sum()
+
+
+def highest_resolution_subtype_results(dfpos):
+    subtype_lens = dfpos.subtype.apply(count_periods)
+    max_subtype_strlen = subtype_lens.max()
+    logging.debug('max substype str len: %s', max_subtype_strlen)
+    dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen]
+    return dfpos_highest_res
+
+
+def sorted_positive_subtypes(dfpos: pd.DataFrame) -> List[List[int]]:
+    pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()]
+    pos_subtypes.sort(key=lambda a: len(a))
+    return pos_subtypes
+
+
+def set_non_present_subtypes(st, df, scheme_subtype_counts):
     possible_downstream_subtypes = [s for s in scheme_subtype_counts
                                     if re.search("^({})(\.)(\d)$".format(re.escape(st.subtype)), s)]
     st.non_present_subtypes = [x for x in possible_downstream_subtypes
                                if not (df.subtype == x).any()]
-    st.avg_tile_coverage = df['freq'].mean()
-    st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
-    df['sample'] = genome_name
-    df['scheme'] = scheme_name or scheme
-    df['scheme_version'] = scheme_version
-    df['qc_status'] = st.qc_status
-    df['qc_message'] = st.qc_message
-    df = df[df.columns[~df.columns.isin(COLUMNS_TO_REMOVE)]]
-    return st, df
+
+
+def set_inconsistent_subtypes(st: Subtype, inconsistent_subtypes: List):
+    if len(inconsistent_subtypes) > 0:
+        st.are_subtypes_consistent = False
+        st.inconsistent_subtypes = inconsistent_subtypes
+    else:
+        st.are_subtypes_consistent = True
+        st.are_subtypes_consistent = None
 
 
 def query_reads_ac(subtype_results: List[Subtype],
@@ -262,6 +342,21 @@ def query_reads_ac(subtype_results: List[Subtype],
                    subtyping_params: Optional[SubtypingParams] = None,
                    scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
                    n_threads: int = 1):
+    """Subtype input genomes using a scheme.
+
+    Args:
+        subtype_results: Subtyping results for each input genome
+        dfs: detailed subtyping results for each input genome
+        input_genomes: input genomes; tuple of FASTA file path and genome name
+        scheme: bio_hansel scheme FASTA path
+        scheme_name: optional scheme name
+        subtyping_params: scheme specific subtyping parameters
+        scheme_subtype_counts: summary information about scheme
+        n_threads: number of threads to use for subtyping analysis
+
+    Returns:
+        `subtype_results` and `dfs` by reference
+    """
     if n_threads == 1:
         logging.info('Serial single threaded run mode on %s input genomes', len(reads))
         outputs = [subtype_reads_ac(reads=fastq_files,
@@ -282,3 +377,26 @@ def query_reads_ac(subtype_results: List[Subtype],
         if df is not None:
             dfs.append(df)
         subtype_results.append(attr.asdict(subtype))
+
+
+def empty_results(st: Subtype) -> pd.DataFrame:
+    """Get a results DataFrame with 1 entry when no results are retrieved.
+
+    When the output DataFrame for subtyping is empty, generate a "dummy" DataFrame with one row showing that subtyping
+    was performed on the sample, but no results were generated because nothing was found.
+
+    Args:
+        st: Subtype results; should be a null result
+
+    Returns:
+        pd.DataFrame with 1 row with null result
+    """
+    return pd.DataFrame({0: dict(sample=st.sample,
+                                 file_path=st.file_path,
+                                 subtype=st.subtype,
+                                 refposition=None,
+                                 is_pos_tile=None,
+                                 scheme=st.scheme,
+                                 scheme_version=st.scheme_version,
+                                 qc_status=st.qc_status,
+                                 qc_message=st.qc_message)}).transpose()
\ No newline at end of file
diff --git a/bio_hansel/utils.py b/bio_hansel/utils.py
index 499c799..97124ed 100644
--- a/bio_hansel/utils.py
+++ b/bio_hansel/utils.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import logging
-from typing import List, Any, Optional, Tuple
+from typing import List, Any, Optional, Tuple, Union
 
 import os
 import re
@@ -51,7 +51,7 @@ def compare_subtypes(a: List[Any], b: List[Any]) -> bool:
     return True
 
 
-def find_inconsistent_subtypes(subtypes: List[Any]) -> List[str]:
+def find_inconsistent_subtypes(subtypes: List[List[int]]) -> List[str]:
     from collections import Counter
     incon = []
     for i in range(len(subtypes) - 1):
@@ -98,7 +98,7 @@ def get_scheme_version(scheme: str) -> Optional[str]:
     return None
 
 
-def collect_fastq_from_dir(input_directory):
+def collect_fastq_from_dir(input_directory: str) -> List[Union[str, Tuple[List[str], str]]]:
     fastqs = []
     for x in os.listdir(input_directory):
         full_file_path = os.path.abspath(os.path.join(input_directory, x))

From 0159bd4b270b7928fb501f84a722b01c47e1f861 Mon Sep 17 00:00:00 2001
From: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
Date: Tue, 10 Jul 2018 13:20:34 -0500
Subject: [PATCH 2/5] Add unit tests, refactoring,

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
---
 bio_hansel/main.py                 |  56 ++--
 bio_hansel/qc/checks.py            |  52 ++--
 bio_hansel/subtyper.py             | 394 +++++++++++++++++------------
 tests/test_qc.py                   |  31 ++-
 tests/test_subtyping_contigs.py    |  45 ++--
 tests/test_subtyping_processing.py |  97 +++++++
 tests/test_subtyping_reads.py      |   9 +-
 7 files changed, 438 insertions(+), 246 deletions(-)
 create mode 100644 tests/test_subtyping_processing.py

diff --git a/bio_hansel/main.py b/bio_hansel/main.py
index 1a851cd..837bd5c 100644
--- a/bio_hansel/main.py
+++ b/bio_hansel/main.py
@@ -4,18 +4,20 @@
 import argparse
 import logging
 import sys
-import os
-import re
 from typing import Optional, List, Any, Tuple
+
+import attr
+import os
 import pandas as pd
+import re
 
 from . import program_desc, __version__
 from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL
 from .subtype import Subtype
 from .subtype_stats import subtype_counts
 from .subtyper import \
-    query_contigs_ac, \
-    query_reads_ac
+    subtype_contigs_samples, \
+    subtype_reads_samples
 from .utils import \
     genome_name_from_fasta_path, \
     get_scheme_fasta, \
@@ -189,34 +191,34 @@ def main():
     scheme_subtype_counts = subtype_counts(scheme_fasta)
     logging.debug(args)
     subtyping_params = init_subtyping_params(args, scheme)
-    input_genomes, reads = collect_inputs(args)
-    if len(input_genomes) == 0 and len(reads) == 0:
+    input_contigs, input_reads = collect_inputs(args)
+    if len(input_contigs) == 0 and len(input_reads) == 0:
         raise Exception('No input files specified!')
 
     n_threads = args.threads
 
-    subtype_results = []  # type: List[Subtype]
-    dfs = []  # type: List[pd.DataFrame]
-    if len(input_genomes) > 0:
-        query_contigs_ac(subtype_results=subtype_results,
-                         dfs=dfs,
-                         input_genomes=input_genomes,
-                         scheme=scheme,
-                         scheme_name=scheme_name,
-                         subtyping_params=subtyping_params,
-                         scheme_subtype_counts=scheme_subtype_counts,
-                         n_threads=n_threads)
-    if len(reads) > 0:
-        query_reads_ac(subtype_results=subtype_results,
-                       dfs=dfs,
-                       reads=reads,
-                       scheme=scheme,
-                       scheme_name=scheme_name,
-                       subtyping_params=subtyping_params,
-                       scheme_subtype_counts=scheme_subtype_counts,
-                       n_threads=n_threads)
+    subtype_results = []  # type: List[Tuple[Subtype, pd.DataFrame]]
+    if len(input_contigs) > 0:
+        contigs_results = subtype_contigs_samples(input_genomes=input_contigs,
+                                                  scheme=scheme,
+                                                  scheme_name=scheme_name,
+                                                  subtyping_params=subtyping_params,
+                                                  scheme_subtype_counts=scheme_subtype_counts,
+                                                  n_threads=n_threads)
+        logging.info('Generated %s subtyping results from %s contigs samples', len(contigs_results), len(input_contigs))
+        subtype_results += contigs_results
+    if len(input_reads) > 0:
+        reads_results = subtype_reads_samples(reads=input_reads,
+                                              scheme=scheme,
+                                              scheme_name=scheme_name,
+                                              subtyping_params=subtyping_params,
+                                              scheme_subtype_counts=scheme_subtype_counts,
+                                              n_threads=n_threads)
+        logging.info('Generated %s subtyping results from %s contigs samples', len(reads_results), len(input_reads))
+        subtype_results += reads_results
 
-    dfsummary = pd.DataFrame(subtype_results)
+    dfs = [df for st, df in subtype_results]  # type: List[pd.DataFrame]
+    dfsummary = pd.DataFrame([attr.asdict(st) for st, df in subtype_results])
     dfsummary = dfsummary[SUBTYPE_SUMMARY_COLS]
 
     if dfsummary['avg_tile_coverage'].isnull().all():
diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py
index 650a518..22fb5c2 100644
--- a/bio_hansel/qc/checks.py
+++ b/bio_hansel/qc/checks.py
@@ -4,10 +4,10 @@
 
 from pandas import DataFrame
 
-from ..subtyping_params import SubtypingParams
 from ..qc.const import QC
 from ..qc.utils import get_conflicting_tiles, get_num_pos_neg_tiles, get_mixed_subtype_tile_counts
 from ..subtype import Subtype
+from ..subtyping_params import SubtypingParams
 
 
 def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
@@ -39,7 +39,7 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op
 
     if st.are_subtypes_consistent:
         return check_for_missing_tiles(is_fastq=st.is_fastq_input(),
-                                       curr_subtype=st.subtype,
+                                       subtype_result=st.subtype,
                                        scheme=st.scheme,
                                        df=df,
                                        exp=int(st.n_tiles_matching_all_expected),
@@ -55,32 +55,52 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op
         mixed_subtype_counts = get_mixed_subtype_tile_counts(dfpos=dfpos,
                                                              subtype_list=subtype_list)
 
+        tiles_matching_negative = st.n_tiles_matching_negative
         for curr_subtype, exp in zip(subtype_list, n_tiles_matching_expected):
             # We can omit the status because there will be a fail status already from non consistent subtypes.
+
+            obs = mixed_subtype_counts.get(curr_subtype) + int(tiles_matching_negative)
             _, curr_messages = check_for_missing_tiles(is_fastq=st.is_fastq_input(),
-                                                       curr_subtype=curr_subtype,
+                                                       subtype_result=curr_subtype,
                                                        scheme=st.scheme,
                                                        df=df,
                                                        exp=int(exp),
-                                                       obs=(mixed_subtype_counts.get(curr_subtype) +
-                                                            int(st.n_tiles_matching_negative)),
+                                                       obs=obs,
                                                        p=p)
 
             message_list.append(curr_messages)
 
         error_messages = ' | '.join(filter(None.__ne__, message_list))
 
-        return _, error_messages
+        return QC.FAIL, error_messages
+
+
+def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str,
+                            df: DataFrame, exp: int, obs: int, p: SubtypingParams) -> Tuple[
+    Optional[str], Optional[str]]:
+    """Check if there are too many missing tiles
 
+    Also check if the mean tile coverage depth is above the low coverage threshold.
 
-def check_for_missing_tiles(is_fastq: bool, curr_subtype: str, scheme: str,
-                            df: DataFrame, exp: int, obs: int, p: SubtypingParams):
-    error_status = None
-    error_messages = None
+    Args:
+        is_fastq: Is input sample reads?
+        subtype_result: Single subtype designation
+        scheme: Scheme name
+        df: Subtyping results dataframe
+        exp: Expected number of tiles that should be found
+        obs: Actual observed number of tiles found
+        p: Subtyping parameters
+
+    Returns:
+        Tuple of QC status and any QC messages
+    """
+    status = None
+    messages = None
 
+    # proportion of missing tiles
     p_missing = (exp - obs) / exp  # type: float
     if p_missing > p.max_perc_missing_tiles:
-        error_status = QC.FAIL
+        status = QC.FAIL
         if is_fastq:
             tiles_with_hits = df[df['is_kmer_freq_okay']]  # type: DataFrame
             depth = tiles_with_hits['freq'].mean()
@@ -90,13 +110,13 @@ def check_for_missing_tiles(is_fastq: bool, curr_subtype: str, scheme: str,
             else:
                 coverage_msg = f'Okay coverage depth ({depth:.1f} >= {float(p.low_coverage_depth_freq):.1f} expected), ' \
                                f'but this may be the wrong serovar or species for scheme "{scheme}"'
-            error_messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
-                             f'tiles threshold. {coverage_msg}'
+            messages = f'{p_missing:.2%} missing tiles; more than {p.max_perc_missing_tiles:.2%} missing ' \
+                       f'tiles threshold. {coverage_msg}'
         else:
-            error_messages = f'{p_missing:.2%} missing tiles for subtype "{curr_subtype}"; more than ' \
-                             f'{p.max_perc_missing_tiles:.2%} missing tile threshold'
+            messages = f'{p_missing:.2%} missing tiles for subtype "{subtype_result}"; more than ' \
+                       f'{p.max_perc_missing_tiles:.2%} missing tile threshold'
 
-    return error_status, error_messages
+    return status, messages
 
 
 def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py
index 1ac345d..db44d42 100644
--- a/bio_hansel/subtyper.py
+++ b/bio_hansel/subtyper.py
@@ -2,12 +2,11 @@
 """
 Functions for subtyping of reads (e.g. FASTQ) and contigs (e.g. FASTA) using bio_hansel-compatible subtyping schemes.
 """
-import re
 import logging
 from typing import Optional, List, Dict, Union, Tuple
 
-import attr
 import pandas as pd
+import re
 
 from .aho_corasick import init_automaton, find_in_fasta, find_in_fastqs
 from .const import COLUMNS_TO_REMOVE
@@ -19,12 +18,84 @@
 from .utils import find_inconsistent_subtypes, get_scheme_fasta, get_scheme_version, init_subtyping_params
 
 
-def subtype_contigs_ac(fasta_path: str,
-                       genome_name: str,
-                       scheme: str,
-                       subtyping_params: Optional[SubtypingParams] = None,
-                       scheme_name: Optional[str] = None,
-                       scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[
+def subtype_reads_samples(reads: List[Tuple[List[str], str]],
+                          scheme: str,
+                          scheme_name: Optional[str] = None,
+                          subtyping_params: Optional[SubtypingParams] = None,
+                          scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
+                          n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]:
+    """Subtype input genomes using a scheme.
+
+    Args:
+        input_genomes: input genomes; tuple of FASTA file path and genome name
+        scheme: bio_hansel scheme FASTA path
+        scheme_name: optional scheme name
+        subtyping_params: scheme specific subtyping parameters
+        scheme_subtype_counts: summary information about scheme
+        n_threads: number of threads to use for subtyping analysis
+
+    Returns:
+        List of tuple of Subtype and detailed subtyping results for each sample
+    """
+    if n_threads == 1:
+        logging.info('Serial single threaded run mode on %s input genomes', len(reads))
+        outputs = [subtype_reads(reads=fastq_files,
+                                 genome_name=genome_name,
+                                 scheme=scheme,
+                                 scheme_name=scheme_name,
+                                 subtyping_params=subtyping_params,
+                                 scheme_subtype_counts=scheme_subtype_counts)
+                   for fastq_files, genome_name in reads]
+    else:
+        outputs = parallel_query_reads(reads=reads,
+                                       scheme=scheme,
+                                       scheme_name=scheme_name,
+                                       subtyping_params=subtyping_params,
+                                       scheme_subtype_counts=scheme_subtype_counts,
+                                       n_threads=n_threads)
+    return outputs
+
+
+def subtype_contigs_samples(input_genomes: List[Tuple[str, str]],
+                            scheme: str,
+                            scheme_name: Optional[str] = None,
+                            subtyping_params: Optional[SubtypingParams] = None,
+                            scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
+                            n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]:
+    """Subtype input genomes using a scheme.
+
+    Args:
+        input_genomes: input genomes; tuple of FASTA file path and genome name
+        scheme: bio_hansel scheme FASTA path
+        scheme_name: optional scheme name
+        subtyping_params: scheme specific subtyping parameters
+        scheme_subtype_counts: summary information about scheme
+        n_threads: number of threads to use for subtyping analysis
+
+    Returns:
+        List of tuple of Subtype and detailed subtyping results for each sample
+    """
+    if n_threads == 1:
+        logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes))
+        outputs = [subtype_contigs(fasta_path=input_fasta,
+                                   genome_name=genome_name,
+                                   scheme=scheme,
+                                   scheme_name=scheme_name,
+                                   subtyping_params=subtyping_params,
+                                   scheme_subtype_counts=scheme_subtype_counts)
+                   for input_fasta, genome_name in input_genomes]
+    else:
+        outputs = parallel_query_contigs(input_genomes, scheme, scheme_name, subtyping_params, scheme_subtype_counts,
+                                         n_threads)
+    return outputs
+
+
+def subtype_contigs(fasta_path: str,
+                    genome_name: str,
+                    scheme: str,
+                    subtyping_params: Optional[SubtypingParams] = None,
+                    scheme_name: Optional[str] = None,
+                    scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[
     Subtype, pd.DataFrame]:
     """Subtype input contigs using a particular scheme.
 
@@ -58,7 +129,7 @@ def subtype_contigs_ac(fasta_path: str,
     if df is None or df.shape[0] == 0:
         logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', fasta_path, scheme)
         st.qc_status = QC.FAIL
-        st.qc_message = QC.NO_SUBTYPE_RESULT
+        st.qc_message = QC.NO_TARGETS_FOUND
         st.are_subtypes_consistent = False
         return st, empty_results(st)
 
@@ -67,7 +138,7 @@ def subtype_contigs_ac(fasta_path: str,
     df['refposition'] = [int(x.replace('negative', '')) for x in refpositions]
     df['subtype'] = subtypes
     df['is_pos_tile'] = ~df.tilename.str.contains('negative')
-    process_contigs_subtyping_results(st, df, scheme_subtype_counts)
+    process_subtyping_results(st, df, scheme_subtype_counts)
     st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
 
     logging.info(st)
@@ -83,14 +154,12 @@ def subtype_contigs_ac(fasta_path: str,
     return st, df
 
 
-
-
-def parallel_query_contigs_ac(input_genomes: List[Tuple[str, str]],
-                              scheme: str,
-                              scheme_name: Optional[str] = None,
-                              subtyping_params: Optional[SubtypingParams] = None,
-                              scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
-                              n_threads: int = 1):
+def parallel_query_contigs(input_genomes: List[Tuple[str, str]],
+                           scheme: str,
+                           scheme_name: Optional[str] = None,
+                           subtyping_params: Optional[SubtypingParams] = None,
+                           scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
+                           n_threads: int = 1):
     """Parallel subtyping of input contigs
 
     Subtype and analyse each input in parallel using a multiprocessing thread pool.
@@ -110,68 +179,24 @@ def parallel_query_contigs_ac(input_genomes: List[Tuple[str, str]],
     logging.info('Initializing thread pool with %s threads', n_threads)
     pool = Pool(processes=n_threads)
     logging.info('Running analysis asynchronously on %s input genomes', len(input_genomes))
-    res = [pool.apply_async(subtype_contigs_ac, (input_fasta,
-                                                 genome_name,
-                                                 scheme,
-                                                 subtyping_params,
-                                                 scheme_name,
-                                                 scheme_subtype_counts))
+    res = [pool.apply_async(subtype_contigs, (input_fasta,
+                                              genome_name,
+                                              scheme,
+                                              subtyping_params,
+                                              scheme_name,
+                                              scheme_subtype_counts))
            for input_fasta, genome_name in input_genomes]
     logging.info('Parallel analysis complete! Retrieving analysis results')
     outputs = [x.get() for x in res]
     return outputs
 
 
-def query_contigs_ac(subtype_results: List[Subtype],
-                     dfs: List[pd.DataFrame],
-                     input_genomes: List[Tuple[str, str]],
-                     scheme: str,
-                     scheme_name: Optional[str] = None,
-                     subtyping_params: Optional[SubtypingParams] = None,
-                     scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
-                     n_threads: int = 1) -> None:
-    """Subtype input genomes using a scheme.
-
-    Args:
-        subtype_results: Subtyping results for each input genome
-        dfs: detailed subtyping results for each input genome
-        input_genomes: input genomes; tuple of FASTA file path and genome name
-        scheme: bio_hansel scheme FASTA path
-        scheme_name: optional scheme name
-        subtyping_params: scheme specific subtyping parameters
-        scheme_subtype_counts: summary information about scheme
-        n_threads: number of threads to use for subtyping analysis
-
-    Returns:
-        `subtype_results` and `dfs` by reference
-    """
-    if n_threads == 1:
-        logging.info('Serial single threaded run mode on %s input genomes', len(input_genomes))
-        outputs = [subtype_contigs_ac(fasta_path=input_fasta,
-                                      genome_name=genome_name,
-                                      scheme=scheme,
-                                      scheme_name=scheme_name,
-                                      subtyping_params=subtyping_params,
-                                      scheme_subtype_counts=scheme_subtype_counts)
-                   for input_fasta, genome_name in input_genomes]
-    else:
-        outputs = parallel_query_contigs_ac(input_genomes, scheme, scheme_name, subtyping_params, scheme_subtype_counts,
-                                            n_threads)
-    for subtype, df in outputs:
-        if df is not None:
-            dfs.append(df)
-        else:
-            logging.error(f'Subtyping results DataFrame is "None" for {subtype}. Building empty results DF.')
-            dfs.append(empty_results(subtype))
-        subtype_results.append(attr.asdict(subtype))
-
-
-def parallel_query_reads_ac(reads: List[Tuple[List[str], str]],
-                            scheme: str,
-                            scheme_name: Optional[str] = None,
-                            subtyping_params: Optional[SubtypingParams] = None,
-                            scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
-                            n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]:
+def parallel_query_reads(reads: List[Tuple[List[str], str]],
+                         scheme: str,
+                         scheme_name: Optional[str] = None,
+                         subtyping_params: Optional[SubtypingParams] = None,
+                         scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
+                         n_threads: int = 1) -> List[Tuple[Subtype, pd.DataFrame]]:
     """Parallel subtyping of input reads
 
     Subtype and analyse each input in parallel using a multiprocessing thread pool.
@@ -191,24 +216,24 @@ def parallel_query_reads_ac(reads: List[Tuple[List[str], str]],
     logging.info('Initializing thread pool with %s threads', n_threads)
     pool = Pool(processes=n_threads)
     logging.info('Running analysis asynchronously on %s input genomes', len(reads))
-    res = [pool.apply_async(subtype_reads_ac, (fastqs,
-                                               genome_name,
-                                               scheme,
-                                               scheme_name,
-                                               subtyping_params,
-                                               scheme_subtype_counts))
+    res = [pool.apply_async(subtype_reads, (fastqs,
+                                            genome_name,
+                                            scheme,
+                                            scheme_name,
+                                            subtyping_params,
+                                            scheme_subtype_counts))
            for fastqs, genome_name in reads]
     logging.info('Parallel analysis complete! Retrieving analysis results')
     outputs = [x.get() for x in res]
     return outputs
 
 
-def subtype_reads_ac(reads: Union[str, List[str]],
-                     genome_name: str,
-                     scheme: str,
-                     scheme_name: Optional[str] = None,
-                     subtyping_params: Optional[SubtypingParams] = None,
-                     scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[
+def subtype_reads(reads: Union[str, List[str]],
+                  genome_name: str,
+                  scheme: str,
+                  scheme_name: Optional[str] = None,
+                  subtyping_params: Optional[SubtypingParams] = None,
+                  scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None) -> Tuple[
     Subtype, Optional[pd.DataFrame]]:
     """Subtype input reads using a particular scheme.
 
@@ -249,7 +274,7 @@ def subtype_reads_ac(reads: Union[str, List[str]],
         logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', reads, scheme)
         st.are_subtypes_consistent = False
         st.qc_status = QC.FAIL
-        st.qc_message = QC.NO_SUBTYPE_RESULT
+        st.qc_message = QC.NO_TARGETS_FOUND
         return st, empty_results(st)
 
     refpositions = [x for x, y in df.tilename.str.split('-')]
@@ -259,8 +284,9 @@ def subtype_reads_ac(reads: Union[str, List[str]],
     df['is_pos_tile'] = ~df.tilename.str.contains('negative')
     df['is_kmer_freq_okay'] = (df.freq >= subtyping_params.min_kmer_freq) & (df.freq <= subtyping_params.max_kmer_freq)
     st.avg_tile_coverage = df['freq'].mean()
-    process_contigs_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts)
+    st, df = process_subtyping_results(st, df[df.is_kmer_freq_okay], scheme_subtype_counts)
     st.qc_status, st.qc_message = perform_quality_check(st, df, subtyping_params)
+    df['file_path'] = str(st.file_path)
     df['sample'] = genome_name
     df['scheme'] = scheme_name or scheme
     df['scheme_version'] = scheme_version
@@ -270,113 +296,163 @@ def subtype_reads_ac(reads: Union[str, List[str]],
     return st, df
 
 
-def process_contigs_subtyping_results(st, df, scheme_subtype_counts):
+def process_subtyping_results(st: Subtype, df: pd.DataFrame, scheme_subtype_counts: Dict[str, SubtypeCounts]) -> Tuple[
+    Subtype, pd.DataFrame]:
+    """Process the subtyping results to get the final subtype result and summary stats
+
+    Args:
+        st: Subtype result
+        df: Subtyping results
+        scheme_subtype_counts: Subtyping scheme summary info
+    Returns:
+        Tuple of `st` and `df`
+    """
     dfpos = df[df.is_pos_tile]
     dfpos_highest_res = highest_resolution_subtype_results(dfpos)
     subtype_list = [x for x in dfpos_highest_res.subtype.unique()]
-    set_subtype_results(st, dfpos, subtype_list)
-    set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_positive_subtypes(dfpos)))
-    calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts)
-    set_non_present_subtypes(st, df, scheme_subtype_counts)
+    st = set_subtype_results(st, dfpos, subtype_list)
+    st = set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_subtype_ints(dfpos.subtype)))
+    st = set_subtyping_stats(st, df, dfpos, dfpos_highest_res, subtype_list, scheme_subtype_counts)
+    st.non_present_subtypes = absent_downstream_subtypes(st.subtype, df.subtype, list(scheme_subtype_counts.keys()))
+    return (st, df)
 
 
-def set_subtype_results(st, dfpos, subtype_list):
+def set_subtype_results(st: Subtype, df_positive: pd.DataFrame, subtype_list: List[str]) -> Subtype:
+    """Set subtype results
+
+    Args:
+        st: Subtype result
+        df_positive: Positive tiles subtyping results
+        subtype_list: List of subtypes found
+    """
     st.subtype = '; '.join(subtype_list)
     st.tiles_matching_subtype = '; '.join(subtype_list)
-    pos_subtypes_str = [x for x in dfpos.subtype.unique()]
+    pos_subtypes_str = [x for x in df_positive.subtype.unique()]
     pos_subtypes_str.sort(key=lambda x: len(x))
     st.all_subtypes = '; '.join(pos_subtypes_str)
+    return st
 
 
-def calculate_subtyping_stats(st, df, dfpos, subtype_list, scheme_subtype_counts):
+def set_subtyping_stats(st: Subtype,
+                        df: pd.DataFrame,
+                        dfpos: pd.DataFrame,
+                        dfpos_highest_res: pd.DataFrame,
+                        subtype_list: List[str],
+                        scheme_subtype_counts: Dict[str, SubtypeCounts]) -> Subtype:
+    """Set subtyping result stats
+
+    - `n_tiles_matching_subtype`: # tiles matching final subtype
+    - `n_tiles_matching_all`: # tiles found
+    - `n_tiles_matching_positive`: # positive tiles found
+    - `n_tiles_matching_negative`:  # negative tiles found
+    - `n_tiles_matching_all_expected`: expected # of all tiles found for each subtype
+    - `n_tiles_matching_positive_expected`: expected # of positive tiles found for each subtype
+    - `n_tiles_matching_subtype_expected`: expected # of subtype-specific tiles found for each subtype
+
+    Args:
+        st: Subtype result
+        df: Subtyping results
+        dfpos: Positive tile subtyping results
+        dfpos_highest_res: Final subtype specific subtyping results
+        subtype_list: List of subtypes found
+        scheme_subtype_counts: Subtyping scheme summary info
+    """
+    st.n_tiles_matching_subtype = dfpos_highest_res.shape[0]
     st.n_tiles_matching_all = df.tilename.unique().size
     st.n_tiles_matching_positive = dfpos.tilename.unique().size
-    st.n_tiles_matching_negative = df[~df.is_pos_tile]
-    st.n_tiles_matching_subtype = len(subtype_list)
+    st.n_tiles_matching_negative = df[~df.is_pos_tile].shape[0]
     st.n_tiles_matching_all_expected = ';'.join([str(scheme_subtype_counts[x].all_tile_count) for x in subtype_list])
     st.n_tiles_matching_positive_expected = ';'.join(
         [str(scheme_subtype_counts[x].positive_tile_count) for x in subtype_list])
     st.n_tiles_matching_subtype_expected = ';'.join(
         [str(scheme_subtype_counts[x].subtype_tile_count) for x in subtype_list])
+    return st
 
 
-def count_periods(subtype_with_periods: str) -> pd.Series:
-    return (pd.Series(list(subtype_with_periods)) == '.').sum()
+def count_periods(s: str) -> int:
+    """Count the number of periods in a string.
+
+    Examples:
+
+        count_periods("2.1.1") => 2
+        count_periods("1") => 0
+
+    Args:
+        s: Some string with periods
 
+    Returns:
+        The number of periods found in the input string.
+    """
+    return sum((1 for c in list(s) if c == '.'))
+
+
+def highest_resolution_subtype_results(df: pd.DataFrame) -> pd.DataFrame:
+    """Get the highest resolution subtype results
+
+    Where the highest resolution result has the most periods ('.') in its designation.
+
+    Args:
+        df: positive subtyping results
 
-def highest_resolution_subtype_results(dfpos):
-    subtype_lens = dfpos.subtype.apply(count_periods)
+    Returns:
+        Highest resolution (most periods in subtype) subtyping results
+    """
+    subtype_lens = df.subtype.apply(count_periods)
     max_subtype_strlen = subtype_lens.max()
-    logging.debug('max substype str len: %s', max_subtype_strlen)
-    dfpos_highest_res = dfpos[subtype_lens == max_subtype_strlen]
-    return dfpos_highest_res
+    return df[subtype_lens == max_subtype_strlen]
 
 
-def sorted_positive_subtypes(dfpos: pd.DataFrame) -> List[List[int]]:
-    pos_subtypes = [[int(y) for y in x.split('.')] for x in dfpos.subtype.unique()]
-    pos_subtypes.sort(key=lambda a: len(a))
-    return pos_subtypes
+def sorted_subtype_ints(subtypes: pd.Series) -> List[List[int]]:
+    """Get the list of subtypes as lists of integers sorted by subtype resolution
 
+    Where the subtype resolution is determined by the number of periods ('.') in the subtype.
 
-def set_non_present_subtypes(st, df, scheme_subtype_counts):
-    possible_downstream_subtypes = [s for s in scheme_subtype_counts
-                                    if re.search("^({})(\.)(\d)$".format(re.escape(st.subtype)), s)]
-    st.non_present_subtypes = [x for x in possible_downstream_subtypes
-                               if not (df.subtype == x).any()]
+    Args:
+        subtypes: pd.Series of subtype strings
 
+    Return:
+        list of subtypes as lists of integers sorted by subtype resolution
+    """
+    subtypes_ints = [[int(y) for y in x.split('.')] for x in subtypes.unique()]
+    subtypes_ints.sort(key=lambda a: len(a))
+    return subtypes_ints
 
-def set_inconsistent_subtypes(st: Subtype, inconsistent_subtypes: List):
-    if len(inconsistent_subtypes) > 0:
-        st.are_subtypes_consistent = False
-        st.inconsistent_subtypes = inconsistent_subtypes
-    else:
-        st.are_subtypes_consistent = True
-        st.are_subtypes_consistent = None
 
+def absent_downstream_subtypes(subtype: str, subtypes: pd.Series, scheme_subtypes: List[str]) -> Optional[List[str]]:
+    """Find the downstream subtypes that are not present in the results
+
+    Args:
+        subtype: Final subtype result
+        subtypes: Subtypes found
+        scheme_subtypes: Possible subtyping scheme subtypes
+
+    Returns:
+         List of downstream subtypes that are not present in the results or `None` if all downstream subtypes are present
+    """
+    escaped_subtype = re.escape(subtype)
+    re_subtype = re.compile(r'^{}\.\d+$'.format(escaped_subtype))
+    downstream_subtypes = [s for s in scheme_subtypes if re_subtype.search(s)]
+    absentees = [x for x in downstream_subtypes if not (subtypes == x).any()]
+    return absentees if len(absentees) > 0 else None
 
-def query_reads_ac(subtype_results: List[Subtype],
-                   dfs: List[pd.DataFrame],
-                   reads: List[Tuple[List[str], str]],
-                   scheme: str,
-                   scheme_name: Optional[str] = None,
-                   subtyping_params: Optional[SubtypingParams] = None,
-                   scheme_subtype_counts: Optional[Dict[str, SubtypeCounts]] = None,
-                   n_threads: int = 1):
-    """Subtype input genomes using a scheme.
+
+def set_inconsistent_subtypes(st: Subtype, inconsistent_subtypes: List[str]) -> Subtype:
+    """Save the list of inconsistent subtypes, if any, to the Subtype result
 
     Args:
-        subtype_results: Subtyping results for each input genome
-        dfs: detailed subtyping results for each input genome
-        input_genomes: input genomes; tuple of FASTA file path and genome name
-        scheme: bio_hansel scheme FASTA path
-        scheme_name: optional scheme name
-        subtyping_params: scheme specific subtyping parameters
-        scheme_subtype_counts: summary information about scheme
-        n_threads: number of threads to use for subtyping analysis
+        st: Subtype result
+        inconsistent_subtypes: List of inconsistent subtypes
 
     Returns:
-        `subtype_results` and `dfs` by reference
+
     """
-    if n_threads == 1:
-        logging.info('Serial single threaded run mode on %s input genomes', len(reads))
-        outputs = [subtype_reads_ac(reads=fastq_files,
-                                    genome_name=genome_name,
-                                    scheme=scheme,
-                                    scheme_name=scheme_name,
-                                    subtyping_params=subtyping_params,
-                                    scheme_subtype_counts=scheme_subtype_counts)
-                   for fastq_files, genome_name in reads]
+    if len(inconsistent_subtypes) > 0:
+        st.are_subtypes_consistent = False
+        st.inconsistent_subtypes = inconsistent_subtypes
     else:
-        outputs = parallel_query_reads_ac(reads=reads,
-                                          scheme=scheme,
-                                          scheme_name=scheme_name,
-                                          subtyping_params=subtyping_params,
-                                          scheme_subtype_counts=scheme_subtype_counts,
-                                          n_threads=n_threads)
-    for subtype, df in outputs:
-        if df is not None:
-            dfs.append(df)
-        subtype_results.append(attr.asdict(subtype))
+        st.are_subtypes_consistent = True
+        st.inconsistent_subtypes = None
+    return st
 
 
 def empty_results(st: Subtype) -> pd.DataFrame:
@@ -399,4 +475,4 @@ def empty_results(st: Subtype) -> pd.DataFrame:
                                  scheme=st.scheme,
                                  scheme_version=st.scheme_version,
                                  qc_status=st.qc_status,
-                                 qc_message=st.qc_message)}).transpose()
\ No newline at end of file
+                                 qc_message=st.qc_message)}).transpose()
diff --git a/tests/test_qc.py b/tests/test_qc.py
index 9b7b914..55859fb 100644
--- a/tests/test_qc.py
+++ b/tests/test_qc.py
@@ -1,13 +1,12 @@
 # -*- coding: utf-8 -*-
-from bio_hansel.qc import is_maybe_intermediate_subtype
 
-from bio_hansel.utils import init_subtyping_params
-from pandas import DataFrame
+import pandas as pd
 
+from bio_hansel.qc import is_maybe_intermediate_subtype
 from bio_hansel.qc.const import QC
 from bio_hansel.subtype import Subtype
-from bio_hansel.subtyper import subtype_reads_ac, subtype_contigs_ac
-import pandas as pd
+from bio_hansel.subtyper import subtype_reads, subtype_contigs
+from bio_hansel.utils import init_subtyping_params
 
 genome_name = 'test'
 
@@ -15,9 +14,9 @@
 def test_low_coverage():
     scheme = 'heidelberg'
     fastq = 'tests/data/SRR1696752/SRR1696752.fastq'
-    st, df = subtype_reads_ac(reads=fastq, genome_name=genome_name, scheme=scheme)
+    st, df = subtype_reads(reads=fastq, genome_name=genome_name, scheme=scheme)
     assert isinstance(st, Subtype)
-    assert isinstance(df, DataFrame)
+    assert isinstance(df, pd.DataFrame)
     assert st.is_fastq_input()
     assert st.scheme == scheme
     assert 'Low coverage for all tiles (7.439 < 20 expected)' in st.qc_message
@@ -53,7 +52,7 @@ def test_intermediate_subtype():
     p = init_subtyping_params(args=None, scheme=scheme)
     st.qc_status, st.qc_message = is_maybe_intermediate_subtype(st, df, p)
     assert isinstance(st, Subtype)
-    assert isinstance(df, DataFrame)
+    assert isinstance(df, pd.DataFrame)
     assert st.scheme == scheme
     assert "Total subtype matches observed (n=3) vs expected (n=6)" in st.qc_message
     assert st.qc_status == QC.WARNING
@@ -62,9 +61,9 @@ def test_intermediate_subtype():
 def test_missing_tiles():
     scheme = 'heidelberg'
     fastq = 'tests/data/SRR1696752/SRR1696752.fastq'
-    st, df = subtype_reads_ac(reads=fastq, genome_name=genome_name, scheme=scheme)
+    st, df = subtype_reads(reads=fastq, genome_name=genome_name, scheme=scheme)
     assert isinstance(st, Subtype)
-    assert isinstance(df, DataFrame)
+    assert isinstance(df, pd.DataFrame)
     assert st.is_fastq_input()
     assert st.scheme == scheme
     assert 'Low coverage depth (10.9 < 20.0 expected)' in st.qc_message
@@ -74,9 +73,9 @@ def test_missing_tiles():
 def test_mixed_tiles():
     scheme = 'heidelberg'
     fastqs = ['tests/data/SRR3392166/SRR3392166.fastq', 'tests/data/SRR3392166/SRR3392166.fastq']
-    st, df = subtype_reads_ac(reads=fastqs, genome_name=genome_name, scheme=scheme)
+    st, df = subtype_reads(reads=fastqs, genome_name=genome_name, scheme=scheme)
     assert isinstance(st, Subtype)
-    assert isinstance(df, DataFrame)
+    assert isinstance(df, pd.DataFrame)
     assert st.scheme == scheme
     assert 'Mixed subtypes found: "1; 2; 2.1"' in st.qc_message
     assert st.qc_status == QC.FAIL
@@ -85,9 +84,9 @@ def test_mixed_tiles():
 def test_mixed_subtype_positive_negative_tiles_same_target():
     scheme = 'heidelberg'
     fasta = 'tests/data/fail-qc-mixed-subtype-pos-neg-tiles.fasta'
-    st, df = subtype_contigs_ac(fasta_path=fasta, genome_name=genome_name, scheme=scheme)
+    st, df = subtype_contigs(fasta_path=fasta, genome_name=genome_name, scheme=scheme)
     assert isinstance(st, Subtype)
-    assert isinstance(df, DataFrame)
+    assert isinstance(df, pd.DataFrame)
     assert st.scheme == scheme
     assert st.qc_status == QC.FAIL
     expected_qc_msg = ('FAIL: Mixed subtype; the positive and negative tiles were found for the same '
@@ -103,9 +102,9 @@ def test_mixed_subtype_positive_negative_tiles_same_target():
 def test_unconfident_subtype():
     scheme = 'enteritidis'
     fasta = 'tests/data/fail-qc-unconfident-subtype.fasta'
-    st, df = subtype_contigs_ac(fasta_path=fasta, genome_name=genome_name, scheme=scheme)
+    st, df = subtype_contigs(fasta_path=fasta, genome_name=genome_name, scheme=scheme)
     assert isinstance(st, Subtype)
-    assert isinstance(df, DataFrame)
+    assert isinstance(df, pd.DataFrame)
     assert st.scheme == scheme
     assert st.qc_status == QC.FAIL
     assert QC.UNCONFIDENT_RESULTS_ERROR_4 in st.qc_message
diff --git a/tests/test_subtyping_contigs.py b/tests/test_subtyping_contigs.py
index 90c8194..3e1225b 100644
--- a/tests/test_subtyping_contigs.py
+++ b/tests/test_subtyping_contigs.py
@@ -1,11 +1,10 @@
 import pytest
-from pandas import DataFrame, Series
-import numpy as np
-from bio_hansel.subtype import Subtype
-from bio_hansel.subtyper import subtype_contigs_ac
+from pandas import DataFrame
+
 from bio_hansel.const import SCHEME_FASTAS
 from bio_hansel.qc.const import QC
-
+from bio_hansel.subtype import Subtype
+from bio_hansel.subtyper import subtype_contigs
 from . import check_subtype_attrs, check_df_fasta_cols
 
 genome_name = 'test'
@@ -76,12 +75,12 @@ def subtype_enteritidis_fail():
 
 
 def test_heidelberg_fasta_ac(subtype_heidelberg_pass):
-    st, df = subtype_contigs_ac(fasta_path=fasta_heidelberg_pass,
-                                genome_name=genome_name,
-                                scheme=scheme_heidelberg)
-    stgz, dfgz = subtype_contigs_ac(fasta_path=fasta_gz_heidelberg_pass,
-                                    genome_name=genome_name,
-                                    scheme=scheme_heidelberg)
+    st, df = subtype_contigs(fasta_path=fasta_heidelberg_pass,
+                             genome_name=genome_name,
+                             scheme=scheme_heidelberg)
+    stgz, dfgz = subtype_contigs(fasta_path=fasta_gz_heidelberg_pass,
+                                 genome_name=genome_name,
+                                 scheme=scheme_heidelberg)
     assert isinstance(st, Subtype)
     assert isinstance(df, DataFrame)
     assert isinstance(stgz, Subtype)
@@ -92,12 +91,12 @@ def test_heidelberg_fasta_ac(subtype_heidelberg_pass):
 
 
 def test_enteritidis_scheme_vs_qc_failing_contigs_unconfident_ac(subtype_enteritidis_fail_unconfident):
-    st, df = subtype_contigs_ac(fasta_path=fasta_enteritidis_unconfident,
-                                genome_name=genome_name,
-                                scheme=scheme_enteritidis)
-    stgz, dfgz = subtype_contigs_ac(fasta_path=fasta_gz_enteritidis_unconfident,
-                                    genome_name=genome_name,
-                                    scheme=scheme_enteritidis)
+    st, df = subtype_contigs(fasta_path=fasta_enteritidis_unconfident,
+                             genome_name=genome_name,
+                             scheme=scheme_enteritidis)
+    stgz, dfgz = subtype_contigs(fasta_path=fasta_gz_enteritidis_unconfident,
+                                 genome_name=genome_name,
+                                 scheme=scheme_enteritidis)
     assert isinstance(st, Subtype)
     assert isinstance(df, DataFrame)
     assert isinstance(stgz, Subtype)
@@ -110,12 +109,12 @@ def test_enteritidis_scheme_vs_qc_failing_contigs_unconfident_ac(subtype_enterit
 
 
 def test_ac_vs_bad_contigs(subtype_enteritidis_fail):
-    st, df = subtype_contigs_ac(fasta_path=fasta_enteritidis_fail,
-                                genome_name=genome_name,
-                                scheme=scheme_enteritidis)
-    stgz, dfgz = subtype_contigs_ac(fasta_path=fasta_gz_enteritidis_fail,
-                                    scheme=scheme_enteritidis,
-                                    genome_name=genome_name)
+    st, df = subtype_contigs(fasta_path=fasta_enteritidis_fail,
+                             genome_name=genome_name,
+                             scheme=scheme_enteritidis)
+    stgz, dfgz = subtype_contigs(fasta_path=fasta_gz_enteritidis_fail,
+                                 scheme=scheme_enteritidis,
+                                 genome_name=genome_name)
     assert isinstance(st, Subtype)
     assert isinstance(df, DataFrame)
     assert isinstance(stgz, Subtype)
diff --git a/tests/test_subtyping_processing.py b/tests/test_subtyping_processing.py
new file mode 100644
index 0000000..72fd527
--- /dev/null
+++ b/tests/test_subtyping_processing.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+import pandas as pd
+
+from bio_hansel.qc import QC
+from bio_hansel.subtype import Subtype
+from bio_hansel.subtyper import absent_downstream_subtypes, sorted_subtype_ints, count_periods, empty_results
+from bio_hansel.utils import find_inconsistent_subtypes
+
+
+def test_count_periods():
+    assert count_periods('1.1.1') == 2
+    assert count_periods('1.1.10000') == 2
+    assert count_periods('1.10000') == 1
+    assert count_periods('10000') == 0
+
+
+def test_absent_downstream_subtypes():
+    assert absent_downstream_subtypes('1', pd.Series(['1.1', '1.2', '1.3', '1']), ['1.1', '1.2', '1', '1.3']) is None
+    assert absent_downstream_subtypes('1', pd.Series(['1.1', '1.2', '1']), ['1.1', '1.2', '1', '1.3']) == ['1.3']
+    assert absent_downstream_subtypes('1', pd.Series(['1']), ['1.1', '1.2', '1', '1.3']) == ['1.1', '1.2', '1.3']
+
+
+def test_sorted_subtype_ints():
+    assert sorted_subtype_ints(pd.Series([])) == []
+    assert sorted_subtype_ints(pd.Series(['1', '1.1', '1.1.1', '1.1.1.99'])) == [[1], [1, 1], [1, 1, 1], [1, 1, 1, 99]]
+    assert sorted_subtype_ints(pd.Series(['1', '1.1', '1.1.1', '1.1.1.99', '1.1', '1.1.1'])) == [[1], [1, 1], [1, 1, 1],
+                                                                                                 [1, 1, 1, 99]]
+
+
+def test_empty_results():
+    st = Subtype(sample='test',
+                 file_path='tests/data/Retro1000data/10-1358.fastq',
+                 scheme='enteritidis',
+                 scheme_version='0.8.0',
+                 subtype=None,
+                 non_present_subtypes=None,
+                 all_subtypes=None,
+                 qc_status=QC.FAIL,
+                 qc_message=QC.NO_TARGETS_FOUND)
+    df_empty = empty_results(st)
+    df_expected_empty = pd.DataFrame({0: dict(sample='test',
+                                              file_path='tests/data/Retro1000data/10-1358.fastq',
+                                              subtype=None,
+                                              refposition=None,
+                                              is_pos_tile=None,
+                                              scheme='enteritidis',
+                                              scheme_version='0.8.0',
+                                              qc_status=QC.FAIL,
+                                              qc_message=QC.NO_TARGETS_FOUND)}).transpose()
+    assert ((df_empty == df_expected_empty) | (df_empty.isnull() == df_expected_empty.isnull())).values.all(), \
+        f'Empty result DataFrame should equal df_expected_empty: {df_expected_empty}'
+
+
+def test_find_inconsistent_subtypes():
+    consistent_subtypes = sorted_subtype_ints(pd.Series('''
+1
+1.1
+1.1.1
+1.1.1.1
+    '''.strip().split('\n')))
+
+    assert find_inconsistent_subtypes(consistent_subtypes) == [], \
+        'Expecting all subtypes to be consistent with each other'
+
+    inconsistent_subtypes = sorted_subtype_ints(pd.Series('''
+1
+1.1
+1.1.1
+1.1.1.1
+1.1.1.2
+1.1.1.3
+    '''.strip().split('\n')))
+
+    exp_incon_subtypes = '''
+1.1.1.1
+1.1.1.2
+1.1.1.3
+    '''.strip().split('\n')
+
+    assert find_inconsistent_subtypes(inconsistent_subtypes) == exp_incon_subtypes, \
+        f'Expecting subtypes {exp_incon_subtypes} to be inconsistent with each other'
+
+    subtypes_list = '''
+1
+1.1
+1.1.1
+1.1.1.1
+1.1.1.2
+1.1.1.3
+1.1.2
+2
+    '''.strip().split('\n')
+
+    inconsistent_subtypes = sorted_subtype_ints(pd.Series(subtypes_list))
+    assert set(find_inconsistent_subtypes(inconsistent_subtypes)) == set(subtypes_list), \
+        f'All subtypes should be inconsistent with each other in {subtypes_list}'
diff --git a/tests/test_subtyping_reads.py b/tests/test_subtyping_reads.py
index 5e69bd4..7c3e510 100644
--- a/tests/test_subtyping_reads.py
+++ b/tests/test_subtyping_reads.py
@@ -1,11 +1,10 @@
 import pytest
 from pandas import DataFrame
 
-from bio_hansel.subtype import Subtype
-from bio_hansel.subtyper import subtype_reads_ac
 from bio_hansel.const import SCHEME_FASTAS
 from bio_hansel.qc.const import QC
-
+from bio_hansel.subtype import Subtype
+from bio_hansel.subtyper import subtype_reads
 from . import check_df_fastq_cols, check_subtype_attrs
 
 genome_name = 'test'
@@ -53,8 +52,8 @@ def subtype_enteritidis_fail():
 
 
 def test_heidelberg_scheme_vs_qc_passing_reads_with_ac(subtype_heidelberg_pass):
-    st, df = subtype_reads_ac(reads=fastq_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg)
-    stgz, dfgz = subtype_reads_ac(reads=fastq_gz_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg)
+    st, df = subtype_reads(reads=fastq_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg)
+    stgz, dfgz = subtype_reads(reads=fastq_gz_heidelberg_pass, genome_name=genome_name, scheme=scheme_heidelberg)
     assert isinstance(st, Subtype)
     assert isinstance(df, DataFrame)
     assert isinstance(stgz, Subtype)

From acf4335341097e0583a4be7c271fa860cc7312de Mon Sep 17 00:00:00 2001
From: Peter Kruczkiewicz <peter.kruczkiewicz@gmail.com>
Date: Fri, 13 Jul 2018 10:05:36 -0500
Subject: [PATCH 3/5] Update main.py

organize imports
---
 bio_hansel/main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bio_hansel/main.py b/bio_hansel/main.py
index 01ca854..dd7062a 100644
--- a/bio_hansel/main.py
+++ b/bio_hansel/main.py
@@ -4,12 +4,12 @@
 import argparse
 import logging
 import sys
+import re
+import os
 from typing import Optional, List, Any, Tuple
 
 import attr
-import os
 import pandas as pd
-import re
 
 from . import program_desc, __version__
 from .const import SUBTYPE_SUMMARY_COLS, REGEX_FASTQ, REGEX_FASTA, JSON_EXT_TMPL

From 0a8fe89c63c55b00d0c4867f9999586d7980a51c Mon Sep 17 00:00:00 2001
From: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
Date: Fri, 13 Jul 2018 10:45:54 -0500
Subject: [PATCH 4/5] Update checks.py to use import pandas as pd for
 consistency

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
---
 bio_hansel/qc/checks.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/bio_hansel/qc/checks.py b/bio_hansel/qc/checks.py
index 22fb5c2..5313bf5 100644
--- a/bio_hansel/qc/checks.py
+++ b/bio_hansel/qc/checks.py
@@ -2,7 +2,7 @@
 
 from typing import Tuple, Optional
 
-from pandas import DataFrame
+import pandas as pd
 
 from ..qc.const import QC
 from ..qc.utils import get_conflicting_tiles, get_num_pos_neg_tiles, get_mixed_subtype_tile_counts
@@ -10,7 +10,7 @@
 from ..subtyping_params import SubtypingParams
 
 
-def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
+def is_overall_coverage_low(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
     if not st.are_subtypes_consistent \
             or st.subtype is None \
             or not st.is_fastq_input():
@@ -21,7 +21,7 @@ def is_overall_coverage_low(st: Subtype, df: DataFrame, p: SubtypingParams) -> T
     return None, None
 
 
-def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
+def is_missing_tiles(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
     """Are there more missing tiles than tolerated?
 
     Note:
@@ -75,9 +75,13 @@ def is_missing_tiles(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[Op
         return QC.FAIL, error_messages
 
 
-def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str,
-                            df: DataFrame, exp: int, obs: int, p: SubtypingParams) -> Tuple[
-    Optional[str], Optional[str]]:
+def check_for_missing_tiles(is_fastq: bool,
+                            subtype_result: str,
+                            scheme: str,
+                            df: pd.DataFrame,
+                            exp: int,
+                            obs: int,
+                            p: SubtypingParams) -> Tuple[Optional[str], Optional[str]]:
     """Check if there are too many missing tiles
 
     Also check if the mean tile coverage depth is above the low coverage threshold.
@@ -102,7 +106,7 @@ def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str,
     if p_missing > p.max_perc_missing_tiles:
         status = QC.FAIL
         if is_fastq:
-            tiles_with_hits = df[df['is_kmer_freq_okay']]  # type: DataFrame
+            tiles_with_hits = df[df['is_kmer_freq_okay']]  # type: pd.DataFrame
             depth = tiles_with_hits['freq'].mean()
             if depth < p.low_coverage_depth_freq:
                 coverage_msg = f'Low coverage depth ({depth:.1f} < {float(p.low_coverage_depth_freq):.1f} expected); ' \
@@ -119,7 +123,7 @@ def check_for_missing_tiles(is_fastq: bool, subtype_result: str, scheme: str,
     return status, messages
 
 
-def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
+def is_mixed_subtype(st: Subtype, df: pd.DataFrame, *args) -> Tuple[Optional[str], Optional[str]]:
     """Is the subtype result mixed?
 
     Note:
@@ -146,7 +150,7 @@ def is_mixed_subtype(st: Subtype, df: DataFrame, *args) -> Tuple[Optional[str],
                     f'the same target site{s} {positions} for subtype "{st.subtype}".'
 
 
-def is_missing_too_many_target_sites(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
+def is_missing_too_many_target_sites(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
     Optional[str], Optional[str]]:
     """Are there too many missing target sites for an expected subtype?
 
@@ -200,7 +204,7 @@ def is_missing_downstream_targets(st: Subtype, *args) -> Tuple[Optional[str], Op
     return None, None
 
 
-def is_maybe_intermediate_subtype(st: Subtype, df: DataFrame, p: SubtypingParams) -> Tuple[
+def is_maybe_intermediate_subtype(st: Subtype, df: pd.DataFrame, p: SubtypingParams) -> Tuple[
     Optional[str], Optional[str]]:
     """Is the result a possible intermediate subtype?
 

From 5eea6dc28c2a165bfbeaa36cd2e5cd0e62d3a4c6 Mon Sep 17 00:00:00 2001
From: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
Date: Fri, 13 Jul 2018 10:53:32 -0500
Subject: [PATCH 5/5] Update subtyper.py; fix docstring, renaming a few vars,
 refactoring

Signed-off-by: pkruczkiewicz <peter.kruczkiewicz@canada.ca>
---
 bio_hansel/subtyper.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/bio_hansel/subtyper.py b/bio_hansel/subtyper.py
index db44d42..7ebb4f4 100644
--- a/bio_hansel/subtyper.py
+++ b/bio_hansel/subtyper.py
@@ -27,7 +27,7 @@ def subtype_reads_samples(reads: List[Tuple[List[str], str]],
     """Subtype input genomes using a scheme.
 
     Args:
-        input_genomes: input genomes; tuple of FASTA file path and genome name
+        reads: input genomes; tuple of list of FASTQ file paths and genome name
         scheme: bio_hansel scheme FASTA path
         scheme_name: optional scheme name
         subtyping_params: scheme specific subtyping parameters
@@ -123,8 +123,8 @@ def subtype_contigs(fasta_path: str,
                  scheme_version=scheme_version,
                  scheme_subtype_counts=scheme_subtype_counts)
 
-    A = init_automaton(scheme_fasta)
-    df = find_in_fasta(A, fasta_path)
+    automaton = init_automaton(scheme_fasta)
+    df = find_in_fasta(automaton, fasta_path)
 
     if df is None or df.shape[0] == 0:
         logging.warning('No subtyping tile matches for input "%s" for scheme "%s"', fasta_path, scheme)
@@ -262,11 +262,11 @@ def subtype_reads(reads: Union[str, List[str]],
                  scheme_version=scheme_version,
                  scheme_subtype_counts=scheme_subtype_counts)
 
-    A = init_automaton(scheme_fasta)
+    automaton = init_automaton(scheme_fasta)
     if isinstance(reads, str):
-        df = find_in_fastqs(A, reads)
+        df = find_in_fastqs(automaton, reads)
     elif isinstance(reads, list):
-        df = find_in_fastqs(A, *reads)
+        df = find_in_fastqs(automaton, *reads)
     else:
         raise ValueError('Unexpected type "{}" for "reads": {}'.format(type(reads), reads))
 
@@ -314,7 +314,7 @@ def process_subtyping_results(st: Subtype, df: pd.DataFrame, scheme_subtype_coun
     st = set_inconsistent_subtypes(st, find_inconsistent_subtypes(sorted_subtype_ints(dfpos.subtype)))
     st = set_subtyping_stats(st, df, dfpos, dfpos_highest_res, subtype_list, scheme_subtype_counts)
     st.non_present_subtypes = absent_downstream_subtypes(st.subtype, df.subtype, list(scheme_subtype_counts.keys()))
-    return (st, df)
+    return st, df
 
 
 def set_subtype_results(st: Subtype, df_positive: pd.DataFrame, subtype_list: List[str]) -> Subtype:
@@ -427,7 +427,8 @@ def absent_downstream_subtypes(subtype: str, subtypes: pd.Series, scheme_subtype
         scheme_subtypes: Possible subtyping scheme subtypes
 
     Returns:
-         List of downstream subtypes that are not present in the results or `None` if all downstream subtypes are present
+         List of downstream subtypes that are not present in the results or `None` if all immediately downstream
+         subtypes are present.
     """
     escaped_subtype = re.escape(subtype)
     re_subtype = re.compile(r'^{}\.\d+$'.format(escaped_subtype))