
fix lowercase actgn input handling. #1435

Merged
merged 28 commits on Oct 3, 2016
Commits
9e27de6
force verbose_loader to uppercase DNA
ctb Sep 3, 2016
13284fb
change broken_paired_reader to uppercase sequences
ctb Sep 3, 2016
4146358
update the hashes for backwards-compatibility checking
ctb Sep 3, 2016
a94218d
replaced collections.namedtuple with screed.Record in test_functions
ctb Sep 3, 2016
5fe8f65
make test sequences upper case in tests to match new uppercasing beha…
ctb Sep 3, 2016
38b2b8e
Merge branch "master" into "fix/cleandna", fix FakeFastaRead
standage Sep 23, 2016
079cea4
remove legacy code for ignoring short reads
ctb Oct 1, 2016
e196cf7
Merge branch 'fix/cleandna' of github.com:dib-lab/khmer into fix/clea…
ctb Oct 1, 2016
decd944
Merge branch 'update/filter_abund' into fix/cleandna
ctb Oct 1, 2016
b486264
undo uppercasing artifacts
ctb Oct 1, 2016
3e60bcf
revert md5sum changes
ctb Oct 1, 2016
768d92f
have ReadBundle rely on sequence loading to do the cleaning
ctb Oct 1, 2016
286574a
Merge branch 'master' of github.com:dib-lab/khmer into fix/cleandna
ctb Oct 1, 2016
7b40857
fix normalize-by-median.py treatment of input sequences.
ctb Oct 2, 2016
95b3d0f
fix normalize-by-median to use cleaned_seq attribute on reads
ctb Oct 2, 2016
6f52164
refactor trim-low-abund to use more efficient median function
ctb Oct 2, 2016
cd72c8f
fix read bundling issue
ctb Oct 2, 2016
72167c9
added utils.clean_input_reads; associated refactorings
ctb Oct 2, 2016
974ced3
delete redundant sequence uppercasing in thread_utils
ctb Oct 2, 2016
4bc1135
check cleaned_seq behavior in broken_paired_iter directly
ctb Oct 2, 2016
2e5e84b
fixed pep8
ctb Oct 2, 2016
c162bdd
refactored to use cleaned_seq attribute
ctb Oct 2, 2016
e7258b4
updated ChangeLog
ctb Oct 2, 2016
ba24ff7
fix pep8
ctb Oct 2, 2016
e6524e4
fix copyright year
ctb Oct 2, 2016
0ac2942
update dev docs
ctb Oct 3, 2016
9cff349
update ChangeLog
ctb Oct 3, 2016
df2eaa4
update teh wrods
ctb Oct 3, 2016
16 changes: 16 additions & 0 deletions ChangeLog
@@ -1,3 +1,19 @@
2016-10-02 Titus Brown <titus@idyll.org>

* khmer/{utils.py,thread_utils.py,trimming.py}: introduce new function,
clean_input_reads, that does consistent read cleaning; update ReadBundle
and trimming to make use of new cleaned_seq attribute.
* scripts/{normalize-by-median.py,trim-low-abund.py}: refactor to use
cleaned_seq attribute properly; this fixed a bug in normalize-by-median.py
that led to ignoring lowercase 'actgn'.
* doc/dev/development.rst: update developer documentation for new
cleaned_seq attribute and ReadBundle behavior.
* tests/test_script_output.py: update md5 signatures for output to reflect
corrected behavior.
* tests/test_functions.py: add test for broken_paired_iter uppercase
behavior.
* tests/test_read_handling.py: update tests for ReadBundle.

2016-10-01 Titus Brown <titus@idyll.org>

* khmer/trimming.py,scripts/{filter-abund-single.py,filter-abund.py,
35 changes: 29 additions & 6 deletions doc/dev/development.rst
@@ -196,9 +196,32 @@ On the C++ side, there is an abundance of ``consume`` functions for loading
Fasta/Fastq sequences. On the Python side, read handling is sometimes delegated
to the C++ library, and sometimes handled in Python using screed.

In an attempt to normalize read handling in Python, the ``ReadBundle`` class has
been taken out of ``trim-low-abund.py`` and placed in ``khmer/utils.py`` to be
accessible to all other modules and scripts. Further development should focus on
moving all existing Python read cleaning/QC/handling to ``ReadBundle``, as well
as replacing *ad hoc* read handling code throughout the codebase with
``ReadBundle``.
In an attempt to normalize read handling in Python, the functions in
``khmer/utils.py`` should be used whenever possible. Here,
``broken_paired_reader`` in ``khmer/utils.py`` should be used to do all
paired-end sequence handling, and sequence loading should
go through ``khmer.utils.clean_input_reads(iter)``; this is a
generator that wraps the iterator produced by ``screed.open``, and it
adds a ``cleaned_seq`` attribute to screed ``Record`` objects. This
attribute should be used for any k-mer or graph operations, while
the normal ``sequence`` attribute is what should be written out.
``write_record`` and ``write_record_pair`` should be used to output
records. All of these functions are aware of FASTA and FASTQ records,
too.

For applying operations to collections of reads, the ``ReadBundle`` class is
available. This is used to wrap a collection of reads for examination and
processing in situations where (for example) something should be done to
either both reads in a pair, or neither.

Some basic rules of sequence handling in khmer are:

* consume and produce "broken paired" format, such that pairs of sequences
always stay together; see ``khmer.utils.broken_paired_reader``.

* when looking at the coverage of reads (for trimming or digital normalization)
always consider pairs; see ``khmer.utils.ReadBundle(...)``.

* only apply graph or k-mer operations to sequences consisting only of ATCG;
typically this will be ``record.cleaned_seq``. See
``khmer.utils.clean_input_reads(...)``.
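The loading pipeline described in these rules can be sketched in a few lines of plain Python. The ``Record`` class below is a hypothetical minimal stand-in for a screed ``Record`` (khmer and screed are not imported), but the ``clean_input_reads`` logic mirrors the behavior documented here: uppercase the sequence and replace ambiguous ``N`` bases with ``A``, storing the result as ``cleaned_seq`` while leaving ``sequence`` untouched for output.

```python
class Record:
    """Hypothetical minimal stand-in for a screed Record."""
    def __init__(self, name, sequence):
        self.name = name
        self.sequence = sequence


def clean_input_reads(records):
    """Yield records with a 'cleaned_seq' attribute suitable for
    k-mer/graph operations: uppercased, with 'N' replaced by 'A'."""
    for record in records:
        record.cleaned_seq = record.sequence.upper().replace('N', 'A')
        yield record


reads = [Record('seq1/1', 'acgtn'), Record('seq1/2', 'AcGtN')]
for r in clean_input_reads(reads):
    # 'sequence' is what gets written out; 'cleaned_seq' feeds the graph
    print(r.sequence, '->', r.cleaned_seq)
# acgtn -> ACGTA
# AcGtN -> ACGTA
```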
6 changes: 3 additions & 3 deletions khmer/thread_utils.py
@@ -1,6 +1,6 @@
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) 2011-2015, Michigan State University.
# Copyright (C) 2015, The Regents of the University of California.
# Copyright (C) 2015-2016, The Regents of the University of California.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
import threading
import sys
import screed
from khmer.utils import write_record, check_is_pair
from khmer.utils import (write_record, check_is_pair, clean_input_reads)
from khmer.khmer_logger import log_info
# stdlib queue module was renamed on Python 3
try:
@@ -54,7 +54,7 @@

def verbose_loader(filename):
"""Screed iterator that additionally prints progress info to stderr."""
screed_iter = screed.open(filename)
screed_iter = clean_input_reads(screed.open(filename))
for num, record in enumerate(screed_iter):
if num % 100000 == 0:
log_info('... filtering {num}', num=num)
5 changes: 2 additions & 3 deletions khmer/trimming.py
@@ -40,11 +40,10 @@ def trim_record(countgraph, record, cutoff, variable_coverage=False,
normalize_to=None):
name = record.name
seq = record.sequence
seqN = seq.replace('N', 'A')
seqN = record.cleaned_seq

if variable_coverage: # only trim when sequence has high enough C
med, _, _ = countgraph.get_median_count(seqN)
if med < normalize_to:
if not countgraph.median_at_least(seqN, normalize_to):
return record, False # return unmodified

_, trim_at = countgraph.trim_on_abundance(seqN, cutoff)
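The switch above from `med, _, _ = countgraph.get_median_count(seqN)` plus a comparison to a single `median_at_least(seqN, cutoff)` call gives the same answer while letting the implementation stop counting early. A toy sketch of the equivalence — `MockCountgraph` is a hypothetical stand-in for khmer's countgraph, with k-mer abundances taken from a plain dict rather than a Count-Min sketch:

```python
class MockCountgraph:
    """Hypothetical stand-in for khmer's countgraph; k-mer abundances
    come from a plain dict instead of a Count-Min sketch."""
    def __init__(self, ksize, counts):
        self.ksize = ksize
        self.counts = counts  # kmer string -> abundance

    def _kmer_counts(self, seq):
        k = self.ksize
        return [self.counts.get(seq[i:i + k], 0)
                for i in range(len(seq) - k + 1)]

    def get_median_count(self, seq):
        counts = sorted(self._kmer_counts(seq))
        # mimic the (median, average, stddev) return shape used above
        return counts[len(counts) // 2], None, None

    def median_at_least(self, seq, cutoff):
        # same result as get_median_count(seq)[0] >= cutoff; a real
        # implementation can bail out as soon as the answer is known
        return self.get_median_count(seq)[0] >= cutoff


graph = MockCountgraph(3, {'ACG': 5, 'CGT': 5, 'GTA': 1})
med, _, _ = graph.get_median_count('ACGTA')   # k-mer counts [5, 5, 1]
assert med == 5
assert graph.median_at_least('ACGTA', 5)
assert not graph.median_at_least('ACGTA', 6)
```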
21 changes: 13 additions & 8 deletions khmer/utils.py
@@ -183,8 +183,7 @@ def broken_paired_reader(screed_iter, min_length=None,
raise ValueError("force_single and require_paired cannot both be set!")

# handle the majority of the stream.
for record in screed_iter:

for record in clean_input_reads(screed_iter):
if prev_record:
if check_is_pair(prev_record, record) and not force_single:
if min_length and (len(prev_record.sequence) < min_length or
@@ -249,24 +248,30 @@ def write_record_pair(read1, read2, fileobj):
write_record(read2, fileobj)


def clean_input_reads(screed_iter):
for record in screed_iter:
record.cleaned_seq = record.sequence.upper().replace('N', 'A')
yield record

Member:

I guess I'm fine with moving the read cleaning code to a function rather than a class. (Having the cleaned seq as part of the original record is better organization anyway, IMO.) But then that leaves the question of what the ReadBundle class is really for. Just aggregation? If so, we need to update the docs from my last PR to make sure we're clear about what code is doing what.

Member Author:

Agreed!

Yes, the ReadBundle class is about aggregation (pairs/singletons of reads). I'll go update the docs.


class ReadBundle(object):
def __init__(self, *raw_records):
self.reads = [i for i in raw_records if i]
self.cleaned_seqs = [r.sequence.replace('N', 'A') for r in self.reads]

def coverages(self, graph):
return [graph.get_median_count(r)[0] for r in self.cleaned_seqs]
return [graph.get_median_count(r.cleaned_seq)[0] for r in self.reads]

def both(self):
return zip(self.reads, self.cleaned_seqs)
def coverages_at_least(self, graph, coverage):
return all(graph.median_at_least(r.cleaned_seq, coverage)
for r in self.reads)

@property
def num_reads(self):
return len(self.cleaned_seqs)
return len(self.reads)

@property
def total_length(self):
return sum([len(r) for r in self.cleaned_seqs])
return sum([len(r.sequence) for r in self.reads])


# vim: set filetype=python tabstop=4 softtabstop=4 shiftwidth=4 expandtab:
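A self-contained sketch of the refactored `ReadBundle` behavior shown above. `FakeRead` and `FakeGraph` are hypothetical stand-ins: in khmer, records get `cleaned_seq` from `clean_input_reads` and the graph is the real countgraph.

```python
class ReadBundle(object):
    """Sketch of the refactored ReadBundle: wraps the non-empty reads of
    a pair and relies on each record already carrying cleaned_seq."""
    def __init__(self, *raw_records):
        self.reads = [r for r in raw_records if r]

    def coverages_at_least(self, graph, coverage):
        # True only if *every* read meets the coverage cutoff, so a
        # pair is always kept or consumed together
        return all(graph.median_at_least(r.cleaned_seq, coverage)
                   for r in self.reads)

    @property
    def num_reads(self):
        return len(self.reads)

    @property
    def total_length(self):
        return sum(len(r.sequence) for r in self.reads)


class FakeRead:
    def __init__(self, sequence, cleaned_seq):
        self.sequence = sequence
        self.cleaned_seq = cleaned_seq


class FakeGraph:
    """Hypothetical graph whose per-sequence medians are fixed up front."""
    def __init__(self, medians):
        self.medians = medians

    def median_at_least(self, seq, cutoff):
        return self.medians[seq] >= cutoff


pair = ReadBundle(FakeRead('acgtn', 'ACGTA'), FakeRead('ttttt', 'TTTTT'), None)
graph = FakeGraph({'ACGTA': 20, 'TTTTT': 3})
assert pair.num_reads == 2                      # the trailing None is dropped
assert pair.total_length == 10
assert not pair.coverages_at_least(graph, 10)   # one read is below the cutoff
```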
23 changes: 6 additions & 17 deletions scripts/normalize-by-median.py
@@ -61,7 +61,7 @@
from khmer.kfile import (check_space, check_space_for_graph,
check_valid_file_exists, add_output_compression_type,
get_file_writer, is_block, describe_file_handle)
from khmer.utils import write_record, broken_paired_reader
from khmer.utils import (write_record, broken_paired_reader, ReadBundle)
from khmer.khmer_logger import (configure_logging, log_info, log_error)


@@ -168,24 +168,13 @@ def __call__(self, is_paired, read0, read1):
* if any read's median k-mer count is below desired coverage, keep all;
* consume and yield kept reads.
"""
batch = ReadBundle(read0, read1)
desired_coverage = self.desired_coverage

passed_filter = False

batch = []
batch.append(read0)
if read1 is not None:
batch.append(read1)

for record in batch:
seq = record.sequence.replace('N', 'A')
if not self.countgraph.median_at_least(seq, desired_coverage):
passed_filter = True

if passed_filter:
for record in batch:
seq = record.sequence.replace('N', 'A')
self.countgraph.consume(seq)
# if any in batch have coverage below desired coverage, consume & yield
if not batch.coverages_at_least(self.countgraph, desired_coverage):
for record in batch.reads:
self.countgraph.consume(record.cleaned_seq)
Member:

Cleaner and more concise. I like it!

Member Author:

:) yes, a little complicated on the side of double and triple negatives when you dig into it, but nice and concise now that it's done!

yield record
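The rewritten `__call__` decision above — keep and consume a pair only while at least one read is still below the desired coverage — can be sketched with a toy countgraph. All names below are hypothetical stand-ins, not khmer's API; the toy graph's "median coverage" of a sequence is simply the number of times it has been consumed.

```python
class ToyGraph:
    """Toy countgraph: a sequence's 'median coverage' is just the
    number of times it has been consumed."""
    def __init__(self):
        self.seen = {}

    def consume(self, seq):
        self.seen[seq] = self.seen.get(seq, 0) + 1

    def median_at_least(self, seq, cutoff):
        return self.seen.get(seq, 0) >= cutoff


class Read:
    def __init__(self, sequence):
        self.sequence = sequence
        self.cleaned_seq = sequence.upper().replace('N', 'A')


def keep_pair(countgraph, reads, desired_coverage):
    """Digital-normalization decision for one read pair (sketch)."""
    # if every read already meets coverage, the pair is redundant: drop it
    if all(countgraph.median_at_least(r.cleaned_seq, desired_coverage)
           for r in reads):
        return False
    # otherwise consume the cleaned sequences and keep the pair
    for r in reads:
        countgraph.consume(r.cleaned_seq)
    return True


graph = ToyGraph()
pair = [Read('acgtn'), Read('acgtn')]          # both clean to 'ACGTA'
assert keep_pair(graph, pair, desired_coverage=3)      # novel: kept
assert keep_pair(graph, pair, desired_coverage=3)      # still below: kept
assert not keep_pair(graph, pair, desired_coverage=3)  # now redundant
```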


22 changes: 13 additions & 9 deletions scripts/trim-low-abund.py
@@ -219,16 +219,16 @@ def pass1(self, reader, saver):

# trim?
if min_coverage >= TRIM_AT_COVERAGE:
for read, cleaned_seq in bundle.both():
for read in bundle.reads:
record, did_trim = trim_record(graph, read, CUTOFF)
if did_trim:
self.trimmed_reads += 1
if record:
yield record
# no, too low coverage to trim; consume & set aside for 2nd pass.
else:
for read, cleaned_seq in bundle.both():
graph.consume(cleaned_seq)
for read in bundle.reads:
graph.consume(read.cleaned_seq)
write_record(read, saver)
self.n_saved += 1

@@ -254,14 +254,18 @@ def pass2(self, reader):
self.n_reads += bundle.num_reads
self.n_bp += bundle.total_length

for read, coverage in zip(bundle.reads, bundle.coverages(graph)):
if coverage >= TRIM_AT_COVERAGE or self.do_trim_low_abund:
record, did_trim = trim_record(graph, read, CUTOFF)
if self.do_trim_low_abund or \
bundle.coverages_at_least(graph, TRIM_AT_COVERAGE):

for read in bundle.reads:
trimmed_record, did_trim = trim_record(graph, read, CUTOFF)

if did_trim:
self.trimmed_reads += 1
if record:
yield record
else:
if trimmed_record:
yield trimmed_record
else:
for read in bundle.reads:
self.n_skipped += 1
self.bp_skipped += 1
yield read
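The pass-2 branch above trims a pair only when low-abundance trimming was explicitly requested or the whole pair has reached trimming coverage; otherwise the reads pass through untouched and are counted as skipped. A minimal sketch with hypothetical helper names — `trim_fn` stands in for khmer's `trim_record`, and `FlatGraph` is a toy graph where every sequence has median coverage 5:

```python
def pass2_filter(graph, reads, trim_fn, trim_at_coverage, cutoff,
                 do_trim_low_abund):
    """Sketch of the pass-2 decision: trim the pair, or pass it through."""
    at_coverage = all(graph.median_at_least(r.cleaned_seq, trim_at_coverage)
                      for r in reads)
    if do_trim_low_abund or at_coverage:
        for read in reads:
            trimmed_record, did_trim = trim_fn(graph, read, cutoff)
            if trimmed_record:
                yield trimmed_record
    else:
        # too low coverage, and trimming of low-abundance reads was not
        # requested: emit the reads unmodified
        for read in reads:
            yield read


class FlatGraph:
    """Hypothetical graph where every sequence has median coverage 5."""
    def median_at_least(self, seq, cutoff):
        return 5 >= cutoff


class Read:
    def __init__(self, sequence):
        self.sequence = sequence
        self.cleaned_seq = sequence


def no_op_trim(graph, read, cutoff):
    return read, False  # pretend nothing needed trimming


reads = [Read('ACGT'), Read('TTTT')]
# coverage (5) >= trim_at_coverage (4): pair is trimmed (a no-op here)
assert len(list(pass2_filter(FlatGraph(), reads, no_op_trim, 4, 2, False))) == 2
# coverage (5) < trim_at_coverage (20), trimming not forced: passed through
assert list(pass2_filter(FlatGraph(), reads, no_op_trim, 20, 2, False)) == reads
```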
21 changes: 21 additions & 0 deletions tests/test_functions.py
@@ -495,3 +495,24 @@ def test_BrokenPairedReader_OnPairs_4():
assert x == expected, x
assert m == 1
assert n == 0, n


def test_BrokenPairedReader_lowercase():
stream = [screed.Record(name='seq1/1', sequence='acgtn'),
screed.Record(name='seq1/2', sequence='AcGtN'),
screed.Record(name='seq1/2', sequence='aCgTn')]

results = []
for num, is_pair, read1, read2 in broken_paired_reader(stream):
results.append((read1, read2))

a, b = results[0]
assert a.sequence == 'acgtn'
assert a.cleaned_seq == 'ACGTA'
assert b.sequence == 'AcGtN'
assert b.cleaned_seq == 'ACGTA'

c, d = results[1]
assert c.sequence == 'aCgTn'
assert c.cleaned_seq == 'ACGTA'
assert d is None
19 changes: 9 additions & 10 deletions tests/test_read_handling.py
@@ -53,6 +53,7 @@
import khmer
import khmer.kfile
import screed
from khmer.utils import clean_input_reads


def test_interleave_read_stdout():
@@ -781,10 +782,10 @@ def test_extract_paired_reads_5_stdin_error():

def test_read_bundler():
infile = utils.get_test_data('unclean-reads.fastq')
records = [r for r in screed.open(infile)]
records = [r for r in clean_input_reads(screed.open(infile))]
bundle = khmer.utils.ReadBundle(*records)

raw_reads = (
raw_seqs = (
'GGTTGACGGGGNNNAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCG'
'GGGCGGAGGCCGCAGACGCGAGTGGTGGAGG',
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGANNNCCG'
@@ -800,25 +801,23 @@

assert bundle.num_reads == 2
assert bundle.total_length == 200
assert bundle.cleaned_seqs[0] == cleaned_seqs[0]
assert bundle.cleaned_seqs[1] == cleaned_seqs[1]

for (rd, cln), raw, tstcln in zip(bundle.both(), raw_reads, cleaned_seqs):
assert rd.sequence == raw
assert cln == tstcln
for read, raw_seq, clean_seq in zip(bundle.reads, raw_seqs, cleaned_seqs):
assert read.sequence == raw_seq
assert read.cleaned_seq == clean_seq


def test_read_bundler_single_read():
infile = utils.get_test_data('single-read.fq')
records = [r for r in screed.open(infile)]
records = [r for r in clean_input_reads(screed.open(infile))]
bundle = khmer.utils.ReadBundle(*records)
assert bundle.num_reads == 1
assert bundle.reads[0].sequence == bundle.cleaned_seqs[0]
assert bundle.reads[0].sequence == bundle.reads[0].cleaned_seq


def test_read_bundler_empty_file():
infile = utils.get_test_data('empty-file')
records = [r for r in screed.open(infile)]
records = [r for r in clean_input_reads(screed.open(infile))]
bundle = khmer.utils.ReadBundle(*records)
assert bundle.num_reads == 0
