dib-lab · ctb · Feb 25, 2015 · Feb 24, 2015 · Feb 24, 2015 · Feb 24, 2015
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,11 @@
+2015-02-25  Tamer A. Mansour  <drtamermansour@gmail.com>
+
+   * scripts/normalize-by-median.py: change the default behavior to
+   overwrite the sequences output file. Also add a new argument --append to
+   append new reads to the output file.
+   * tests/test_scripts.py: add tests for the --append option and the
+   default overwrite behavior in normalize-by-median.py
+
 2015-02-25  Hussien Alameldin  <hussien@msu.edu>
 
    * khmer/khmer_args.py: add 'hll' citation entry "Irber and Brown,

diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py
@@ -192,6 +192,9 @@ def get_parser():
                         dest='single_output_filename',
                         default='', help='only output a single'
                         ' file with the specified filename')
+    parser.add_argument('--append', default=False, action='store_true',
+                        help='append reads to the outputfile. '
+                        'Only with -o specified')
     parser.add_argument('input_filenames', metavar='input_sequence_filename',
                         help='Input FAST[AQ] sequence filename.', nargs='+')
     parser.add_argument('--report-total-kmers', '-t', action='store_true',
@@ -233,11 +236,15 @@ def main():  # pylint: disable=too-many-branches,too-many-statements
     discarded = 0
     input_filename = None
 
-    for index, input_filename in enumerate(args.input_filenames):
-        if args.single_output_filename != '':
-            output_name = args.single_output_filename
+    if args.single_output_filename:
+        output_name = args.single_output_filename
+        if args.append:
             outfp = open(args.single_output_filename, 'a')
         else:
+            outfp = open(args.single_output_filename, 'w')
+
+    for index, input_filename in enumerate(args.input_filenames):
+        if not args.single_output_filename:
             output_name = os.path.basename(input_filename) + '.keep'
             outfp = open(output_name, 'w')
 

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
@@ -515,6 +515,42 @@ def test_normalize_by_median():
     assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
 
 
+def test_normalize_by_median_append():
+    outfile = utils.get_temp_filename('test.fa.keep')
+    shutil.copyfile(utils.get_test_data('test-abund-read.fa'), outfile)
+    in_dir = os.path.dirname(outfile)
+
+    CUTOFF = '1'
+    infile = utils.get_temp_filename('test.fa', in_dir)
+    shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)
+    script = scriptpath('normalize-by-median.py')
+
+    args = ['-C', CUTOFF, '-k', '17', '-t', '-o', outfile, '--append', infile]
+    (status, out, err) = utils.runscript(script, args, in_dir)
+    assert os.path.exists(outfile), outfile
+    seqs = [r.sequence for r in screed.open(outfile)]
+    assert len(seqs) == 2, seqs
+    assert 'GACAGCgtgCCGCA' in seqs[1], seqs
+
+
+def test_normalize_by_median_overwrite():
+    outfile = utils.get_temp_filename('test.fa.keep')
+    shutil.copyfile(utils.get_test_data('test-abund-read.fa'), outfile)
+    in_dir = os.path.dirname(outfile)
+
+    CUTOFF = '1'
+    infile = utils.get_temp_filename('test.fa', in_dir)
+    shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)
+    script = scriptpath('normalize-by-median.py')
+
+    args = ['-C', CUTOFF, '-k', '17', '-t', '-o', outfile, infile]
+    (status, out, err) = utils.runscript(script, args, in_dir)
+    assert os.path.exists(outfile), outfile
+    seqs = [r.sequence for r in screed.open(outfile)]
+    assert len(seqs) == 1, seqs
+    assert 'GACAGCgtgCCGCA' in seqs[0], seqs
+
+
 def test_normalize_by_median_version():
     script = scriptpath('normalize-by-median.py')
     args = ['--version']