Skip to content

Commit

Permalink
Merge branch 'master' of github.com:ged-lab/khmer into fix/spacechecks
Browse files Browse the repository at this point in the history
Conflicts:
	ChangeLog
  • Loading branch information
SensibleSalmon committed Jul 20, 2015
2 parents 9fd52c9 + 5695b9f commit af68d9d
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 66 deletions.
10 changes: 9 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
2015-07-18 Jacob Fenton <bocajnotnef@gmail.com>
2015-07-20 Jacob Fenton <bocajnotnef@gmail.com>

* khmer/{kfile,khmer_args}.py: refactored information passing, made it so
space checks happen in the right directory
Expand All @@ -8,6 +8,14 @@
sample-reads-randomly,trim-low-abund}.py,tests/test_script_arguments.py:
changed to use new arg structure

2015-07-20 Titus Brown <titus@idyll.org>

* khmer/__init__.py: cleaned up FP rate reporting.
* scripts/normalize-by-median.py: corrected epilog; refactored reporting
to be a bit cleaner; use CSV for reporting file;
added --report-frequency arg.
* tests/test_normalize_by_median.py: updated/added tests for reporting.

2015-07-17 Jacob Fenton <bocajnotnef@gmail.com>
* oxli/{functions,build_graph}.py,scripts/{load-graph,normalize-by-median,
abundance-dist}.py,tests/test_{normalize_by_median,subset_graph,hashbits,
Expand Down
4 changes: 2 additions & 2 deletions khmer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2):
print("** Do not use these results!!", file=sys.stderr)
print("**", file=sys.stderr)
print("** (estimated false positive rate of %.3f;" % fp_all,
file=sys.stderr)
print("max allowable %.3f" % max_false_pos, file=sys.stderr)
file=sys.stderr, end=' ')
print("max recommended %.3f)" % max_false_pos, file=sys.stderr)
print("**", file=sys.stderr)

if not force:
Expand Down
152 changes: 93 additions & 59 deletions scripts/normalize-by-median.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,43 +38,87 @@
DEFAULT_DESIRED_COVERAGE = 20


def with_diagnostics(ifilename, norm, reader, fp_file):
class WithDiagnostics(object):
"""
Generator/context manager to do boilerplate output of statistics.
uses a Normalizer object.
"""
# per read diagnostic output
for _, record in enumerate(norm(reader)):

if norm.total % 100000 == 0:
print('... kept {kept} of {total} or {perc:2}% so far'
.format(kept=norm.total - norm.discarded,
total=norm.total,
perc=int(100. - norm.discarded /
float(norm.total) * 100.)),
file=sys.stderr)
def __init__(self, norm, report_fp=None, report_frequency=100000):
self.norm = norm
self.report_fp = report_fp
if report_fp:
report_fp.write('total,kept,f_kept\n')

print('... in file ' + ifilename, file=sys.stderr)
self.total = 0
self.kept = 0

self.report_frequency = report_frequency
self.next_report_at = self.report_frequency
self.last_report_at = self.report_frequency

def __call__(self, reader, ifilename):
norm = self.norm
report_fp = self.report_fp

reads_start = self.total
total = self.total
kept = self.kept

try:
for _, is_paired, read0, read1 in reader:
if is_paired:
total += 2
else:
total += 1

# do diginorm
for record in norm(is_paired, read0, read1):
kept += 1
yield record

yield record
# report!
if total >= self.next_report_at:
self.next_report_at += self.report_frequency
self.last_report_at = total

perc_kept = kept / float(total)

print('... kept {kept} of {tot} or {perc_kept:.1%} so far'
.format(kept=kept, tot=total, perc_kept=perc_kept),
file=sys.stderr)
print('... in file ' + ifilename, file=sys.stderr)

if report_fp:
print("{total},{kept},{f_kept:.4}"
.format(total=total, f_kept=perc_kept,
kept=kept),
file=report_fp)
report_fp.flush()
finally:
self.total = total
self.kept = kept

# per file diagnostic output
if total == reads_start:
print('SKIPPED empty file ' + ifilename, file=sys.stderr)
else:
perc_kept = kept / float(total)

# per file diagnostic output
if norm.total == 0:
print('SKIPPED empty file ' + ifilename, file=sys.stderr)
else:
print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'
.format(inp=ifilename, kept=norm.total - norm.discarded,
total=norm.total, perc=int(100. - norm.discarded /
float(norm.total) * 100.)),
file=sys.stderr)
print('DONE with {inp}; kept {kept} of {total} or {perc_kept:.1%}'
.format(inp=ifilename, kept=kept, total=total,
perc_kept=perc_kept),
file=sys.stderr)

if fp_file:
print("{total} {kept} {discarded}"
.format(total=norm.total, kept=norm.total - norm.discarded,
discarded=1. - (norm.discarded / float(norm.total))),
file=fp_file)
fp_file.flush()
# make sure there's at least one report per file, at the end of each
# file.
if report_fp and total != self.last_report_at:
perc_kept = kept / float(total)

print("{total},{kept},{f_kept:.4}"
.format(total=total, f_kept=perc_kept, kept=kept),
file=report_fp)
report_fp.flush()


class Normalizer(object):
Expand All @@ -85,10 +129,7 @@ def __init__(self, desired_coverage, htable):
self.htable = htable
self.desired_coverage = desired_coverage

self.total = 0
self.discarded = 0

def __call__(self, reader):
def __call__(self, is_paired, read0, read1):
"""
Actually does digital normalization - the core algorithm.
Expand All @@ -100,31 +141,23 @@ def __call__(self, reader):
"""
desired_coverage = self.desired_coverage

for _, is_paired, read0, read1 in reader:
passed_filter = False
passed_filter = False

self.total += 1
batch = []
batch.append(read0)
if read1 is not None:
batch.append(read1)

if is_paired:
self.total += 1

batch = []
batch.append(read0)
if read1 is not None:
batch.append(read1)
for record in batch:
seq = record.sequence.replace('N', 'A')
if not self.htable.median_at_least(seq, desired_coverage):
passed_filter = True

if passed_filter:
for record in batch:
seq = record.sequence.replace('N', 'A')
if not self.htable.median_at_least(seq, desired_coverage):
passed_filter = True

if passed_filter:
for record in batch:
seq = record.sequence.replace('N', 'A')
self.htable.consume(seq)
yield record
else:
self.discarded += len(batch)
self.htable.consume(seq)
yield record


@contextmanager
Expand Down Expand Up @@ -166,10 +199,7 @@ def get_parser():
With :option:`-s`/:option:`--savetable`, the k-mer counting table
will be saved to the specified file after all sequences have been
processed. With :option:`-d`, the k-mer counting table will be
saved every d files for multifile runs; if :option:`-s` is set,
the specified name will be used, and if not, the name `backup.ct`
will be used. :option:`-l`/:option:`--loadtable` will load the
processed. :option:`-l`/:option:`--loadtable` will load the
specified k-mer counting table before processing the specified
files. Note that these tables are in the same format as those
produced by :program:`load-into-counting.py` and consumed by
Expand Down Expand Up @@ -220,9 +250,12 @@ def get_parser():
'reads are loaded.')
parser.add_argument('-R', '--report',
metavar='filename', type=argparse.FileType('w'))
parser.add_argument('--report-frequency',
metavar='report_frequency', type=int,
default=100000)
parser.add_argument('-f', '--force', dest='force',
help='continue on next file if read errors are \
encountered', action='store_true')
help='continue past file reading errors',
action='store_true')
parser.add_argument('-o', '--out', metavar="filename",
dest='single_output_file',
type=argparse.FileType('w'),
Expand Down Expand Up @@ -291,6 +324,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements

# create an object to handle diginorm of all files
norm = Normalizer(args.cutoff, htable)
with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

# make a list of all filenames and if they're paired or not;
# if we don't know if they're paired, default to allowing but not
Expand Down Expand Up @@ -331,7 +365,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
require_paired=require_paired)

# actually do diginorm
for record in with_diagnostics(filename, norm, reader, report_fp):
for record in with_diagnostics(reader, filename):
if record is not None:
write_record(record, outfp)

Expand Down
63 changes: 59 additions & 4 deletions tests/test_normalize_by_median.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,62 @@ def test_normalize_by_median_known_good():
assert False


@attr('huge')
def test_normalize_by_median_report_fp():
# this tests basic reporting of diginorm stats => report.out, including
# a test of aggregate stats for two input files.

infile = utils.get_temp_filename('test.fa')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
infile2 = utils.get_temp_filename('test2.fa')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile2)

in_dir = os.path.dirname(infile)
outfile = utils.get_temp_filename('report.out')

script = 'normalize-by-median.py'
args = ['-C', '1', '-k', '17', '-R', outfile, infile, infile2]
(status, out, err) = utils.runscript(script, args, in_dir)

assert os.path.exists(outfile)
report = open(outfile, 'r')
line = report.readline().strip()
assert line == 'total,kept,f_kept', line
line = report.readline().strip()
assert line == '1001,1,0.000999', line
line = report.readline().strip()
assert line == '2002,1,0.0004995', line


def test_normalize_by_median_report_fp_hifreq():
# this tests high-frequency reporting of diginorm stats for a single
# file => report.out.

infile = utils.get_temp_filename('test.fa')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

in_dir = os.path.dirname(infile)
outfile = utils.get_temp_filename('report.out')

script = 'normalize-by-median.py'
args = ['-C', '1', '-k', '17', '-R', outfile, infile,
'--report-frequency', '100']
(status, out, err) = utils.runscript(script, args, in_dir)

assert os.path.exists(outfile)
report = open(outfile, 'r')
line = report.readline().strip()
assert line == 'total,kept,f_kept', line
line = report.readline().strip()
assert line == '100,1,0.01', line
line = report.readline().strip()
assert line == '200,1,0.005', line


@attr('huge')
def test_normalize_by_median_report_fp_huge():
# this tests reporting of diginorm stats => report.out for a large
# file, with the default reporting interval of once every 100k.

infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
outfile = utils.get_temp_filename('report.out')
Expand All @@ -224,8 +278,9 @@ def test_normalize_by_median_report_fp():

assert "fp rate estimated to be 0.623" in err, err
report = open(outfile, 'r')
line = report.readline() # skip header
line = report.readline()
assert "100000 25261 0.25261" in line, line
assert "100000,25261,0.2526" in line, line


def test_normalize_by_median_unpaired_and_paired():
Expand Down Expand Up @@ -262,12 +317,12 @@ def test_normalize_by_median_count_kmers_PE():
args = ['-C', CUTOFF, '-k', '17', '--force-single', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
assert 'kept 1 of 2 or 50%' in err, err
assert 'kept 1 of 2 or 50.0%' in err, err

args = ['-C', CUTOFF, '-k', '17', '-p', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 99' in err, err
assert 'kept 2 of 2 or 100%' in err, err
assert 'kept 2 of 2 or 100.0%' in err, err


def test_normalize_by_median_double_file_name():
Expand Down

0 comments on commit af68d9d

Please sign in to comment.