From 47e8890bdf25c92f2282a047846fb8f973621eb4 Mon Sep 17 00:00:00 2001 From: Jessica Mizzi Date: Fri, 5 Dec 2014 13:53:35 -0500 Subject: [PATCH] Added force option to all scripts to script IO sanity checks and updated tests to match --- ChangeLog | 5 ++--- khmer/file.py | 15 +++++++++------ sandbox/sweep-reads.py | 7 ++++--- scripts/abundance-dist-single.py | 6 ++++-- scripts/abundance-dist.py | 4 +++- scripts/annotate-partitions.py | 8 +++++--- scripts/count-median.py | 6 ++++-- scripts/count-overlap.py | 7 ++++--- scripts/do-partition.py | 4 ++-- scripts/extract-paired-reads.py | 6 ++++-- scripts/extract-partitions.py | 6 ++++-- scripts/filter-abund-single.py | 8 +++++--- scripts/filter-abund.py | 6 ++++-- scripts/filter-stoptags.py | 7 ++++--- scripts/interleave-reads.py | 4 ++-- scripts/load-graph.py | 6 +++--- scripts/load-into-counting.py | 6 +++--- scripts/make-initial-stoptags.py | 6 ++++-- scripts/merge-partitions.py | 6 ++++-- scripts/normalize-by-median.py | 6 ++++-- scripts/partition-graph.py | 6 ++++-- scripts/sample-reads-randomly.py | 4 ++-- scripts/split-paired-reads.py | 6 ++++-- setup.cfg | 2 +- tests/test_script_arguments.py | 28 ++++++++++++++++++++++++++-- tests/test_scripts.py | 4 ++-- 26 files changed, 117 insertions(+), 62 deletions(-) diff --git a/ChangeLog b/ChangeLog index ffb19a05d9..707065d942 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,5 @@ -2014-10-06 Michael R. Crusoe - - * Doxyfile.in: add links to the stdc++ docs +2014-12-05 Jessica Mizzi + * khmer/file.py,sandbox/sweep-reads.py,scripts/{abundance-dist-single,abundance-dist,annotate-partitions,count-median,count-overlap,do-partition,extract-paired-reads,extract-partitions,filter-abund-single,filter-abund,filter-stoptags,interleave-reads,load-graph,load-into-counting,make-initial-stoptags,merge-partitions,normalize-by-median,partition-graph,sample-reads-randomly,split-paired-reads}.py,setup.cfg,tests/{test_script_arguments,test_scripts}.py: Added force option to all scripts to script IO sanity checks and updated tests to match. 2014-10-01 Ben Taylor diff --git a/khmer/file.py b/khmer/file.py index 46984cecc7..07796819a9 100644 --- a/khmer/file.py +++ b/khmer/file.py @@ -13,7 +13,7 @@ import sys -def check_file_status(file_path): +def check_file_status(file_path, force): """ Check status of file - return if file exists; warn and exit if empty, or does not exist @@ -21,15 +21,17 @@ def check_file_status(file_path): if not os.path.exists(file_path): print >>sys.stderr, "ERROR: Input file %s does not exist; exiting" % \ file_path - sys.exit(1) + if not force: + sys.exit(1) else: if os.stat(file_path).st_size == 0: print >>sys.stderr, "ERROR: Input file %s is empty; exiting." % \ file_path - sys.exit(1) + if not force: + sys.exit(1) -def check_space(in_files, _testhook_free_space=None): +def check_space(in_files, force, _testhook_free_space=None): """ Estimate size of input files passed, then calculate disk space available. Exit if insufficient disk space, @@ -65,10 +67,11 @@ def check_space(in_files, _testhook_free_space=None): % (float(total_size) / 1e9,) print >>sys.stderr, " Free space: %.1f GB" \ % (float(free_space) / 1e9,) - sys.exit(1) + if not force: + sys.exit(1) -def check_space_for_hashtable(hash_size, _testhook_free_space=None): +def check_space_for_hashtable(hash_size, force, _testhook_free_space=None): """ Check we have enough size to write a hash table """ diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py index fbdc068875..81770fb5af 100755 --- a/sandbox/sweep-reads.py +++ b/sandbox/sweep-reads.py @@ -191,7 +191,8 @@ def get_parser(): parser.add_argument(dest='input_fastp', help='Reference fasta or fastp') parser.add_argument('input_files', nargs='+', help='Reads to be swept and sorted') - + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -224,13 +225,13 @@ def main(): buf_size = args.buffer_size max_reads = args.max_reads - check_file_status(args.input_fastp) + check_file_status(args.input_fastp, args.force) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability - check_space(all_input_files) + check_space(all_input_files, args.force) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index 3e7c19f90d..8de569445e 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -60,6 +60,8 @@ def get_parser(): "filename.") parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -68,8 +70,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches args = get_parser().parse_args() report_on_config(args) - check_file_status(args.input_sequence_filename) - check_space([args.input_sequence_filename]) + check_file_status(args.input_sequence_filename, args.force) + check_space([args.input_sequence_filename], args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py index de41aaeeb4..4fd1e8aec5 100755 --- a/scripts/abundance-dist.py +++ b/scripts/abundance-dist.py @@ -44,6 +44,8 @@ def get_parser(): help='Overwrite output file if it exists') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -53,7 +55,7 @@ def main(): infiles = [args.input_counting_table_filename, args.input_sequence_filename] for infile in infiles: - check_file_status(infile) + check_file_status(infile, args.force) print('hashtable from', args.input_counting_table_filename) counting_hash = khmer.load_counting_hash( diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py index e10ae92908..ab2de29844 100755 --- a/scripts/annotate-partitions.py +++ b/scripts/annotate-partitions.py @@ -54,6 +54,8 @@ def get_parser(): 'annotate.') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -67,11 +69,11 @@ def main(): partitionmap_file = args.graphbase + '.pmap.merged' - check_file_status(partitionmap_file) + check_file_status(partitionmap_file, args.force) for _ in filenames: - check_file_status(_) + check_file_status(_, args.force) - check_space(filenames) + check_space(filenames, args.force) print 'loading partition map from:', partitionmap_file htable.load_partitionmap(partitionmap_file) diff --git a/scripts/count-median.py b/scripts/count-median.py index 3709fbb0c0..275d2ecc77 100755 --- a/scripts/count-median.py +++ b/scripts/count-median.py @@ -50,6 +50,8 @@ def get_parser(): help='output summary filename') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -63,9 +65,9 @@ def main(): infiles = [htfile, input_filename] for infile in infiles: - check_file_status(infile) + check_file_status(infile, args.force) - check_space(infiles) + check_space(infiles, args.force) print 'loading k-mer counting table from', htfile htable = khmer.load_counting_hash(htfile) diff --git a/scripts/count-overlap.py b/scripts/count-overlap.py index f708b7fd9d..cd598d67d3 100755 --- a/scripts/count-overlap.py +++ b/scripts/count-overlap.py @@ -44,7 +44,8 @@ def get_parser(): help="input sequence filename") parser.add_argument('report_filename', metavar='output_report_filename', help='output report filename') - + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -54,9 +55,9 @@ def main(): report_on_config(args, hashtype='hashbits') for infile in [args.ptfile, args.fafile]: - check_file_status(infile) + check_file_status(infile, args.force) - check_space([args.ptfile, args.fafile]) + check_space([args.ptfile, args.fafile], args.force) print 'loading k-mer presence table from', args.ptfile ht1 = khmer.load_hashbits(args.ptfile) diff --git a/scripts/do-partition.py b/scripts/do-partition.py index c77668b9d5..62de9672a9 100755 --- a/scripts/do-partition.py +++ b/scripts/do-partition.py @@ -111,9 +111,9 @@ def main(): # pylint: disable=too-many-locals,too-many-statements report_on_config(args, hashtype='hashbits') for infile in args.input_filenames: - check_file_status(infile) + check_file_status(infile, args.force) - check_space(args.input_filenames) + check_space(args.input_filenames, args.force) print 'Saving k-mer presence table to %s' % args.graphbase print 'Loading kmers from sequences in %s' % repr(args.input_filenames) diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py index 79b8e6959b..c9320e1288 100755 --- a/scripts/extract-paired-reads.py +++ b/scripts/extract-paired-reads.py @@ -74,6 +74,8 @@ def get_parser(): parser.add_argument('infile') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -81,9 +83,9 @@ def main(): info('extract-paired-reads.py') args = get_parser().parse_args() - check_file_status(args.infile) + check_file_status(args.infile, args.force) infiles = [args.infile] - check_space(infiles) + check_space(infiles, args.force) outfile = os.path.basename(args.infile) if len(sys.argv) > 2: diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py index 23ce60f383..ab51a282c1 100755 --- a/scripts/extract-partitions.py +++ b/scripts/extract-partitions.py @@ -81,6 +81,8 @@ def get_parser(): help='Output unassigned sequences, too') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -94,9 +96,9 @@ def main(): # pylint: disable=too-many-locals,too-many-branches n_unassigned = 0 for infile in args.part_filenames: - check_file_status(infile) + check_file_status(infile, args.force) - check_space(args.part_filenames) + check_space(args.part_filenames, args.force) print '---' print 'reading partitioned files:', repr(args.part_filenames) diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py index 2d82954a22..ed647366a1 100755 --- a/scripts/filter-abund-single.py +++ b/scripts/filter-abund-single.py @@ -56,16 +56,18 @@ def get_parser(): help="FAST[AQ] sequence file to trim") parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser def main(): info('filter-abund-single.py', ['counting']) args = get_parser().parse_args() - check_file_status(args.datafile) - check_space([args.datafile]) + check_file_status(args.datafile, args.force) + check_space([args.datafile],args.force) if args.savetable: - check_space_for_hashtable(args.n_tables * args.min_tablesize) + check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) report_on_config(args) config = khmer.get_config() diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py index 32f7fd65da..84c2d2d07e 100755 --- a/scripts/filter-abund.py +++ b/scripts/filter-abund.py @@ -66,6 +66,8 @@ def get_parser(): 'file for each input file.') parser.add_argument('--version', action='version', version='khmer {v}'.format(v=__version__)) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -77,9 +79,9 @@ def main(): infiles = args.input_filename for _ in infiles: - check_file_status(_) + check_file_status(_, args.force) - check_space(infiles) + check_space(infiles, args.force) print 'loading hashtable' htable = khmer.load_counting_hash(counting_ht) diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py index 3d7ffd1f60..ca9cede93f 100755 --- a/scripts/filter-stoptags.py +++ b/scripts/filter-stoptags.py @@ -44,9 +44,10 @@ def get_parser(): nargs='+') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser - def main(): info('filter-stoptags.py', ['graph']) args = get_parser().parse_args() @@ -54,9 +55,9 @@ def main(): infiles = args.input_filenames for _ in infiles: - check_file_status(_) + check_file_status(_, args.force) - check_space(infiles) + check_space(infiles, args.force) print 'loading stop tags, with K', args.ksize htable = khmer.new_hashbits(args.ksize, 1, 1) diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py index 9f79c71a67..ca33e8c201 100755 --- a/scripts/interleave-reads.py +++ b/scripts/interleave-reads.py @@ -73,9 +73,9 @@ def main(): args = get_parser().parse_args() for _ in args.infiles: - check_file_status(_) + check_file_status(_, args.force) - check_space(args.infiles) + check_space(args.infiles, args.force) s1_file = args.infiles[0] if len(args.infiles) == 2: diff --git a/scripts/load-graph.py b/scripts/load-graph.py index be2fcb0709..01b78a3d87 100755 --- a/scripts/load-graph.py +++ b/scripts/load-graph.py @@ -54,10 +54,10 @@ def main(): filenames = args.input_filenames for _ in args.input_filenames: - check_file_status(_) + check_file_status(_, args.force) - check_space(args.input_filenames) - check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.) + check_space(args.input_filenames, args.force) + check_space_for_hashtable((float(args.n_tables * args.min_tablesize) / 8.), args.force) print 'Saving k-mer presence table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py index c407e3eb96..f954f8db71 100755 --- a/scripts/load-into-counting.py +++ b/scripts/load-into-counting.py @@ -71,10 +71,10 @@ def main(): filenames = args.input_sequence_filename for name in args.input_sequence_filename: - check_file_status(name) + check_file_status(name, args.force) - check_space(args.input_sequence_filename) - check_space_for_hashtable(args.n_tables * args.min_tablesize) + check_space(args.input_sequence_filename, args.force) + check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) print 'Saving k-mer counting table to %s' % base print 'Loading kmers from sequences in %s' % repr(filenames) diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py index 186773dd69..551be8f7e5 100755 --- a/scripts/make-initial-stoptags.py +++ b/scripts/make-initial-stoptags.py @@ -62,6 +62,8 @@ def get_parser(): help="Use stoptags in this file during partitioning") parser.add_argument('graphbase', help='basename for input and output ' 'filenames') + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -77,9 +79,9 @@ def main(): if args.stoptags: infiles.append(args.stoptags) for _ in infiles: - check_file_status(_) + check_file_status(_, args.force) - check_space(infiles) + check_space(infiles, args.force) print 'loading htable %s.pt' % graphbase htable = khmer.load_hashbits(graphbase + '.pt') diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py index 986e9f76a9..e06fa41119 100755 --- a/scripts/merge-partitions.py +++ b/scripts/merge-partitions.py @@ -43,6 +43,8 @@ def get_parser(): 'files') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -60,9 +62,9 @@ def main(): htable = khmer.new_hashbits(ksize, 1, 1) for _ in pmap_files: - check_file_status(_) + check_file_status(_, args.force) - check_space(pmap_files) + check_space(pmap_files, args.force) for pmap_file in pmap_files: print 'merging', pmap_file diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 8d23d76209..648ef7e428 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -207,6 +207,8 @@ def get_parser(): parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers" " post-normalization to stderr") + parser.add_argument('--force', default=False, action='store_true', + help='Overwrite output file if it exists') add_loadhash_args(parser) return parser @@ -220,9 +222,9 @@ def main(): # pylint: disable=too-many-branches,too-many-statements report_fp = args.report check_valid_file_exists(args.input_filenames) - check_space(args.input_filenames) + check_space(args.input_filenames, args.force) if args.savetable: - check_space_for_hashtable(args.n_tables * args.min_tablesize) + check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) # list to save error files along with throwing exceptions if args.force: diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py index 8ba728cc48..657f33edd4 100755 --- a/scripts/partition-graph.py +++ b/scripts/partition-graph.py @@ -89,6 +89,8 @@ def get_parser(): 'traversals') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') add_threading_args(parser) return parser @@ -100,9 +102,9 @@ def main(): filenames = [basename + '.pt', basename + '.tagset'] for _ in filenames: - check_file_status(_) + check_file_status(_, args.force) - check_space(filenames) + check_space(filenames, args.force) print '--' print 'SUBSET SIZE', args.subset_size diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index a36e359c0d..d2e6f98594 100755 --- a/scripts/sample-reads-randomly.py +++ b/scripts/sample-reads-randomly.py @@ -84,9 +84,9 @@ def main(): args = get_parser().parse_args() for _ in args.filenames: - check_file_status(_) + check_file_status(_, args.force) - check_space(args.filenames) + check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py index 09521e5cbe..fbb4ead6ec 100755 --- a/scripts/split-paired-reads.py +++ b/scripts/split-paired-reads.py @@ -42,6 +42,8 @@ def get_parser(): parser.add_argument('infile') parser.add_argument('--version', action='version', version='%(prog)s ' + khmer.__version__) + parser.add_argument('-f', '--force', default=False, action='store_true', + help='Overwrite output file if it exists') return parser @@ -51,9 +53,9 @@ def main(): infile = args.infile - check_file_status(infile) + check_file_status(infile, args.force) filenames = [infile] - check_space(filenames) + check_space(filenames, args.force) out1 = os.path.basename(infile) + '.1' out2 = os.path.basename(infile) + '.2' diff --git a/setup.cfg b/setup.cfg index 69aecb27d8..88eedf7225 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [nosetests] verbosity = 2 -stop = TRUE +#stop = TRUE attr = !known_failing,!jenkins #processes = -1 # breaks xunit output #attr = !known_failing,!highmem diff --git a/tests/test_script_arguments.py b/tests/test_script_arguments.py index ba850f7e71..ac91f0a4dd 100644 --- a/tests/test_script_arguments.py +++ b/tests/test_script_arguments.py @@ -20,7 +20,7 @@ def test_check_space(): save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() try: - khmer.file.check_space([fakelump_fa], _testhook_free_space=0) + khmer.file.check_space([fakelump_fa], force=False, _testhook_free_space=0) assert 0, "this should fail" except SystemExit as e: print str(e) @@ -31,9 +31,33 @@ def test_check_space(): def test_check_tablespace(): save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() try: - khmer.file.check_space_for_hashtable(1e9, _testhook_free_space=0) + khmer.file.check_space_for_hashtable(1e9, force=False, _testhook_free_space=0) assert 0, "this should fail" except SystemExit as e: print str(e) finally: sys.stderr = save_stderr + + +def test_check_space_force(): + fakelump_fa = utils.get_test_data('fakelump.fa') + + save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() + try: + khmer.file.check_space([fakelump_fa], force=True, _testhook_free_space=0) + assert True, "this should pass" + except SystemExit as e: + print str(e) + finally: + sys.stderr = save_stderr + + +def test_check_tablespace_force(): + save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO() + try: + khmer.file.check_space_for_hashtable(1e9, force=True, _testhook_free_space=0) + assert True, "this should pass" + except SystemExit as e: + print str(e) + finally: + sys.stderr = save_stderr diff --git a/tests/test_scripts.py b/tests/test_scripts.py index d9bb93bfd1..17291c09e0 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -30,7 +30,7 @@ def teardown(): def test_check_space(): # @CTB this probably belongs in a new test file, along with other # tests of the file.py module. - khmer.file.check_space(['', utils.get_test_data('test-abund-read-2.fa')]) + khmer.file.check_space(['', utils.get_test_data('test-abund-read-2.fa')], False) def test_load_into_counting(): @@ -1483,7 +1483,7 @@ def test_sample_reads_randomly_S(): badargs = list(args) badargs.extend(['-o', 'test', 'test.fq', 'test.fq']) (status, out, err) = utils.runscript(script, badargs, in_dir, fail_ok=True) - assert status == -1, (status, out, err) + assert status == 1, (status, out, err) args.append('test.fq')