diff --git a/ChangeLog b/ChangeLog index 892ab0f976..7ce9d3d41c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,42 @@ +2015-08-24 Michael R. Crusoe + + * khmer/khmer_args.py: Replaced sanitize_epilog() with sanitize_help() that + reflows the text of ArgParse descriptions and epilog while preserving the + formatting. Enhanced removal of Sphinx directives by replacing double + backticks with the double quote character. + * scripts/*.py: Renamed sanitize_epilog to sanitize_help; leading newlines + from triple-quoted epilogs removed; formatting made consistent; + sanitize_help and ComboFormatter added where it was missing; a couple of + script-specific epilog reformattings (for use of `:doc:` and a + hyperlink). + * scripts/{count-median,filter-abund-single}.py: Fixed printing of output + file name to do so instead of printing information about the file handle. + * scripts/count-median.py: Added missing command so that example given + actually works. + * scripts/filter-abund-single.py: Removed redundant printing of output file + names. + * scripts/normalize-by-median.py: Removed unused option "-d" from an example + command (left over from the "--dump-frequency" era). + * scripts/{partition-graph.py,do-partition.py}: Fixed erasure of the queue + module name in the worker functions, which is necessary for basic + functionality. + * scripts/{do-partition,abundance-dist,abundance-dist-single, + extract-long-sequences}.py: Added an example command to the epilog. + * tests/khmer_tst_utils.py: Added 'name' attribute to make the fake + sys.stdout more like a real stdout object. 
+ * oxli/__init__.py: removed redundant and unused help text + * scripts/{abundance-dist,annotate-partitions,count-median, + extract-long-sequences,extract-paired-reads,extract-partitions, + fastq-to-fasta,filter-abund,filter-stopgaps,interleave-reads, + load-into-graph,merge-partitions,normalize-by-median,partition-graph, + readstats,sample-reads-randomly,split-paired-reads}.py: made "--version" + and the citation header consistent across the scripts. + * tests/test_scripts.py: added tests for the "--version" and citation + header behavior. + * tests/test_normalize_by_median.py: updated test for 'quiet' mode as + citation header still prints to STDERR. + * setup.py,tests/test_scripts.py: turned off the "oxli" script for v2.0. + 2015-08-17 Michael R. Crusoe * Makefile: remove BASH shell designation that appears to be incompatible diff --git a/doc/run-corn-50m.sh b/doc/run-corn-50m.sh index 642e2a28eb..ee5f3b743e 100644 --- a/doc/run-corn-50m.sh +++ b/doc/run-corn-50m.sh @@ -12,8 +12,10 @@ # https://s3.amazonaws.com/public.ged.msu.edu/khmer/iowa-corn-50m.fa.gz # +set -e +set -x + KHMER_PATH=$1 -export PYTHONPATH=$KHMER_PATH/python SCRIPTPATH=$KHMER_PATH/scripts diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index b280bccc2c..778313039f 100644 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -13,6 +13,7 @@ import os import argparse import math +import textwrap from argparse import _VersionAction from collections import namedtuple @@ -392,10 +393,24 @@ def add_threading_args(parser): help='Number of simultaneous threads to execute') -def sanitize_epilog(parser): - parser.epilog = parser.epilog.replace( - '//', '/').replace(':option:', '').replace( - ':program:', '').replace('::', ':') +def sanitize_help(parser): + """Remove Sphinx directives & reflow text to width of 79 characters.""" + wrapper = textwrap.TextWrapper(width=79) + parser.description = wrapper.fill(parser.description) + if not parser.epilog: + return parser + cleanlog = 
parser.epilog.replace(':option:', '').replace( + ':program:', '').replace('::', ':').replace('``', '"') + newlog = prev_section = "" + for section in cleanlog.split('\n\n'): + if section.startswith(' '): + newlog += section + '\n' + else: + if prev_section.startswith(' '): + newlog += '\n' + newlog += wrapper.fill(section) + '\n\n' + prev_section = section + parser.epilog = newlog return parser _algorithms = { diff --git a/oxli/__init__.py b/oxli/__init__.py index 4a4816b6ed..e0b5d923c5 100755 --- a/oxli/__init__.py +++ b/oxli/__init__.py @@ -13,7 +13,7 @@ import argparse import sys import textwrap -from khmer import khmer_args +from khmer.khmer_args import build_nodegraph_args from oxli import build_graph @@ -30,15 +30,12 @@ def get_parser(): # build-graph (formerly load-graph.py) parsers here parser_build_graph = \ - subparsers.add_parser('build-graph', - help="Load sequences into the compressible graph" - "format plus optional tagset", - description="Load sequences into the " - "compressible graph format plus optional tagset") - - khmer_args.build_nodegraph_args("Load sequences into the compressible" - "graph format plus optional tagset.", - None, parser=parser_build_graph) + subparsers.add_parser( + name='build-graph', + help="Load sequences into the compressible graph format " + "plus optional tagset") + + parser_build_graph = build_nodegraph_args(parser=parser_build_graph) build_graph.build_parser(parser_build_graph) parser_build_graph.set_defaults(func=build_graph.main) diff --git a/oxli/build_graph.py b/oxli/build_graph.py index 63af8fe40d..65099a28f0 100644 --- a/oxli/build_graph.py +++ b/oxli/build_graph.py @@ -43,8 +43,6 @@ def build_parser(parser): def main(args): - info('build-graph.py', ['graph', 'SeqAn']) - report_on_config(args, graphtype='nodegraph') base = args.output_filename filenames = args.input_filenames diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py index 05f886b29e..08a4054757 100755 --- a/sandbox/collect-reads.py +++ 
b/sandbox/collect-reads.py @@ -22,7 +22,7 @@ import khmer from khmer import khmer_args from khmer.khmer_args import (build_counting_args, report_on_config, info, - calculate_graphsize, sanitize_epilog) + calculate_graphsize, sanitize_help) from khmer.kfile import check_input_files, check_space from khmer.kfile import check_space_for_graph import argparse @@ -68,7 +68,7 @@ def get_parser(): def main(): info('collect-reads.py', ['counting']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() report_on_config(args) base = args.output_countgraph_filename diff --git a/sandbox/correct-reads.py b/sandbox/correct-reads.py index 7a14f5bff8..22170518ed 100755 --- a/sandbox/correct-reads.py +++ b/sandbox/correct-reads.py @@ -26,7 +26,7 @@ import argparse from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args, - report_on_config, sanitize_epilog) + report_on_config, sanitize_help) from khmer.utils import write_record, write_record_pair, broken_paired_reader from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists) @@ -114,7 +114,7 @@ def get_parser(): def main(): info('correct-reads.py', ['streaming']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() ### diff --git a/sandbox/estimate_optimal_hash.py b/sandbox/estimate_optimal_hash.py index 0c6f8b21a8..5ec9b26877 100755 --- a/sandbox/estimate_optimal_hash.py +++ b/sandbox/estimate_optimal_hash.py @@ -29,7 +29,7 @@ from __future__ import print_function import argparse import khmer, oxli -from khmer.khmer_args import info, optimal_size, sanitize_epilog +from khmer.khmer_args import info, optimal_size, sanitize_help import textwrap import sys @@ -70,7 +70,7 @@ def get_parser(): def main(): info('estimate_optimal_hash.py', ['counting']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() N = args.N if args.M: M = args.M diff --git 
a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py index 1e28f01117..186be14f15 100755 --- a/sandbox/saturate-by-median.py +++ b/sandbox/saturate-by-median.py @@ -22,7 +22,7 @@ from khmer.khmer_args import (build_counting_args, add_loadgraph_args, report_on_config, info, create_countgraph, - sanitize_epilog) + sanitize_help) import argparse from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists) @@ -178,7 +178,7 @@ def get_parser(): def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) - parser = sanitize_epilog(get_parser()) + parser = sanitize_help(get_parser()) args = parser.parse_args() report_on_config(args) diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py index 55a13a9fff..1a62ac361a 100755 --- a/sandbox/sweep-files.py +++ b/sandbox/sweep-files.py @@ -37,7 +37,7 @@ import time import khmer from khmer.khmer_args import (build_nodegraph_args, report_on_config, info, - sanitize_epilog) + sanitize_help) DEFAULT_OUT_PREF = 'reads' DEFAULT_RANGE = -1 @@ -101,7 +101,7 @@ def clear(self): def main(): #info('sweep-files.py', ['sweep']) - parser = sanitize_epilog(get_parser()) + parser = sanitize_help(get_parser()) args = parser.parse_args() if args.max_tablesize < MIN_HSIZE: diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py index b5b9e801fd..41f895e9be 100755 --- a/sandbox/sweep-reads.py +++ b/sandbox/sweep-reads.py @@ -39,7 +39,7 @@ import time import khmer from khmer.khmer_args import (build_nodegraph_args, report_on_config, info, - sanitize_epilog) + sanitize_help) from khmer.kfile import (check_input_files, check_valid_file_exists, check_space) @@ -206,7 +206,7 @@ def get_parser(): def main(): info('sweep-reads-buffered.py', ['sweep']) - parser = sanitize_epilog(get_parser()) + parser = sanitize_help(get_parser()) args = parser.parse_args() if args.max_tablesize < MAX_HSIZE: diff --git a/scripts/abundance-dist-single.py 
b/scripts/abundance-dist-single.py index 0ce40e8fe1..e54b2a0a1e 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -25,19 +25,24 @@ from khmer import khmer_args from khmer.khmer_args import (build_counting_args, add_threading_args, report_on_config, info, calculate_graphsize, - sanitize_epilog) + sanitize_help) from khmer.kfile import (check_input_files, check_space_for_graph) def get_parser(): - epilog = ''' - Note that with :option:`-b` this script is constant memory; in exchange, - k-mer counts will stop at 255. The memory usage of this script with - :option:`-b` will be about 1.15x the product of the :option:`-x` and - :option:`-N` numbers. + epilog = '''\ + Note that with :option:`-b`/:option:`--no-bigcount` this script is constant + memory; in exchange, k-mer counts will stop at 255. The memory usage of + this script with :option:`-b` will be about 1.15x the product of the + :option:`-x` and :option:`-N` numbers. To count k-mers in multiple files use :program:`load_into_counting.py` and :program:`abundance_dist.py`. 
+ + Example:: + + abundance-dist-single.py -x 1e7 -N 2 -k 17 \\ + tests/test-data/test-abund-read-2.fa test-dist ''' parser = build_counting_args( descr="Calculate the abundance distribution of k-mers from a " @@ -69,7 +74,7 @@ def get_parser(): def main(): # pylint: disable=too-many-locals,too-many-branches info('abundance-dist-single.py', ['counting', 'SeqAn']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() report_on_config(args) check_input_files(args.input_sequence_filename, args.force) diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py index ad40d4534c..12db9a17f7 100755 --- a/scripts/abundance-dist.py +++ b/scripts/abundance-dist.py @@ -19,16 +19,26 @@ import csv import khmer import argparse +import textwrap import os +from khmer import __version__ from khmer.kfile import check_input_files -from khmer.khmer_args import info +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) def get_parser(): + epilog = """\ + Example:: + + load-into-countgraph.py -x 1e7 -N 2 -k 17 counts \\ + tests/test-data/test-abund-read-2.fa + abundance-dist.py counts tests/test-data/test-abund-read-2.fa test-dist + """ parser = argparse.ArgumentParser( description="Calculate abundance distribution of the k-mers in " "the sequence file using a pre-made k-mer countgraph.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=ComboFormatter, epilog=textwrap.dedent(epilog)) parser.add_argument('input_count_graph_filename', help='The name of the' ' input k-mer countgraph file.') @@ -46,8 +56,8 @@ def get_parser(): parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) 
parser.add_argument('-f', '--force', default=False, action='store_true', help='Continue even if specified input files ' 'do not exist or are empty.') @@ -56,7 +66,7 @@ def get_parser(): def main(): info('abundance-dist.py', ['counting']) - args = get_parser().parse_args() + args = sanitize_help(get_parser()).parse_args() infiles = [args.input_count_graph_filename, args.input_sequence_filename] diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py index 32048a1490..3e58dd9bed 100755 --- a/scripts/annotate-partitions.py +++ b/scripts/annotate-partitions.py @@ -20,20 +20,21 @@ import os import argparse import textwrap -import khmer import sys +from khmer import __version__, Nodegraph from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) DEFAULT_K = 32 def get_parser(): - epilog = """ - Load in a partitionmap (generally produced by partition-graph.py or - merge-partitions.py) and annotate the sequences in the given files with - their partition IDs. Use :program:`extract-partitions.py` to extract - sequences into separate group files. + epilog = """\ + Load in a partitionmap (generally produced by :program:`partition-graph.py` + or :program:`merge-partitions.py`) and annotate the sequences in the given + files with their partition IDs. Use :program:`extract-partitions.py` to + extract sequences into separate group files. 
Example (results will be in ``random-20-a.fa.part``):: @@ -44,8 +45,7 @@ def get_parser(): """ parser = argparse.ArgumentParser( description="Annotate sequences with partition IDs.", - epilog=textwrap.dedent(epilog), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K, help="k-mer size (default: %d)" % DEFAULT_K) @@ -54,8 +54,8 @@ def get_parser(): parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+', help='input FAST[AQ] sequences to ' 'annotate.') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser @@ -63,11 +63,11 @@ def get_parser(): def main(): info('annotate-partitions.py', ['graph']) - args = get_parser().parse_args() + args = sanitize_help(get_parser()).parse_args() ksize = args.ksize filenames = args.input_filenames - nodegraph = khmer.Nodegraph(ksize, 1, 1) + nodegraph = Nodegraph(ksize, 1, 1) partitionmap_file = args.graphbase + '.pmap.merged' diff --git a/scripts/count-median.py b/scripts/count-median.py index a5b8e73eaf..219137493b 100755 --- a/scripts/count-median.py +++ b/scripts/count-median.py @@ -28,13 +28,14 @@ import csv import textwrap -import khmer +from khmer import __version__, load_countgraph from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info, sanitize_epilog +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) def get_parser(): - epilog = """ + epilog = """\ Count the median/avg k-mer abundance for each sequence in the input file, based on the k-mer counts in the given k-mer countgraph. 
Can be used to estimate expression levels (mRNAseq) or coverage (genomic/metagenomic). @@ -44,13 +45,14 @@ def get_parser(): Example:: - count-median.py counts.ct tests/test-data/test-reads.fq.gz medians.txt + load-into-countgraph.py counts tests/test-data/test-reads.fq.gz + count-median.py counts tests/test-data/test-reads.fq.gz medians.txt NOTE: All 'N's in the input sequences are converted to 'A's. """ parser = argparse.ArgumentParser( description='Count k-mers summary stats for sequences', - epilog=textwrap.dedent(epilog)) + epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('countgraph', metavar='input_count_graph_filename', help='input k-mer countgraph filename') @@ -59,8 +61,8 @@ def get_parser(): parser.add_argument('output', metavar='output_summary_filename', help='output summary filename', type=argparse.FileType('w')) - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser @@ -68,12 +70,11 @@ def get_parser(): def main(): info('count-median.py', ['diginorm']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() htfile = args.countgraph input_filename = args.input output = args.output - output_filename = str(output) infiles = [htfile, input_filename] for infile in infiles: @@ -82,9 +83,9 @@ def main(): check_space(infiles, args.force) print('loading k-mer countgraph from', htfile, file=sys.stderr) - countgraph = khmer.load_countgraph(htfile) + countgraph = load_countgraph(htfile) ksize = countgraph.ksize() - print('writing to', output_filename, file=sys.stderr) + print('writing to', output.name, file=sys.stderr) output = csv.writer(output) # write headers: diff --git a/scripts/do-partition.py 
b/scripts/do-partition.py index 4a52d42c0f..b448441b12 100755 --- a/scripts/do-partition.py +++ b/scripts/do-partition.py @@ -24,7 +24,7 @@ import textwrap from khmer import khmer_args from khmer.khmer_args import (build_nodegraph_args, report_on_config, info, - add_threading_args, sanitize_epilog) + add_threading_args, sanitize_help) import glob from khmer.kfile import check_input_files, check_space import re @@ -41,10 +41,10 @@ DEFAULT_K = 32 -def worker(queue, basename, stop_big_traversals): +def worker(tasks, basename, stop_big_traversals): while True: try: - (nodegraph, index, start, stop) = queue.get(False) + (nodegraph, index, start, stop) = tasks.get(False) except queue.Empty: print('exiting', file=sys.stderr) return @@ -68,7 +68,7 @@ def worker(queue, basename, stop_big_traversals): def get_parser(): - epilog = """ + epilog = """\ Load in a set of sequences, partition them, merge the partitions, and annotate the original sequences files with the partition information. @@ -78,6 +78,10 @@ def get_parser(): one script. This is convenient but should probably not be used for large data sets, because :program:`do-partition.py` doesn't provide save/resume functionality. 
+ + Example:: + + do-partition.py -k 20 example tests/test-data/random-20-a.fa """ parser = build_nodegraph_args( descr='Load, partition, and annotate FAST[AQ] sequences', @@ -103,7 +107,7 @@ def get_parser(): # pylint: disable=too-many-branches def main(): # pylint: disable=too-many-locals,too-many-statements info('do-partition.py', ['graph']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() report_on_config(args, graphtype='nodegraph') diff --git a/scripts/extract-long-sequences.py b/scripts/extract-long-sequences.py index 46680d8330..4f06a426c4 100755 --- a/scripts/extract-long-sequences.py +++ b/scripts/extract-long-sequences.py @@ -21,16 +21,25 @@ from __future__ import print_function import argparse import screed +import textwrap import sys +from khmer import __version__ from khmer.utils import write_record from khmer.kfile import add_output_compression_type, get_file_writer +from khmer.khmer_args import (ComboFormatter, sanitize_help, info, + _VersionStdErrAction) def get_parser(): + epilog = """\ + Example:: + + extract-long-sequences.py --length 10 tests/test-data/paired-mixed.fa + """ parser = argparse.ArgumentParser( description='Extract FASTQ or FASTA sequences longer than' ' specified length (default: 200 bp).', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=ComboFormatter, epilog=textwrap.dedent(epilog)) parser.add_argument('input_filenames', help='Input FAST[AQ]' ' sequence filename.', nargs='+') @@ -40,12 +49,15 @@ def get_parser(): parser.add_argument('-l', '--length', help='The minimum length of' ' the sequence file.', type=int, default=200) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) add_output_compression_type(parser) return parser def main(): - args = get_parser().parse_args() + info('extract-long-sequences.py') + args = sanitize_help(get_parser()).parse_args() outfp = get_file_writer(args.output, 
args.gzip, args.bzip) for filename in args.input_filenames: for record in screed.open(filename): diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py index c7241f0905..29bbbe32fc 100755 --- a/scripts/extract-paired-reads.py +++ b/scripts/extract-paired-reads.py @@ -22,9 +22,11 @@ import os.path import textwrap import argparse -import khmer + +from khmer import __version__ from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info, sanitize_epilog +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) from khmer.kfile import add_output_compression_type from khmer.kfile import get_file_writer @@ -32,7 +34,7 @@ def get_parser(): - epilog = """ + epilog = """\ Many read-handling programs (assemblers, mappers, etc.) require that you give them either perfectly interleaved files, or files containing only single reads. This script takes files that were @@ -61,15 +63,14 @@ def get_parser(): """ parser = argparse.ArgumentParser( description='Take a mixture of reads and split into pairs and ' - 'orphans.', epilog=textwrap.dedent(epilog)) + 'orphans.', epilog=textwrap.dedent(epilog), + formatter_class=ComboFormatter) parser.add_argument('infile', nargs='?', default='/dev/stdin') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) - + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-d', '--output-dir', default='', help='Output ' 'split reads to specified directory. 
Creates ' 'directory if necessary') - parser.add_argument('--output-paired', '-p', metavar="filename", type=argparse.FileType('wb'), default=None, help='Output paired reads to this ' @@ -85,7 +86,7 @@ def get_parser(): def main(): info('extract-paired-reads.py') - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() infile = args.infile check_input_files(infile, args.force) diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py index 43007d8f0c..97f51a7b2b 100755 --- a/scripts/extract-partitions.py +++ b/scripts/extract-partitions.py @@ -24,11 +24,12 @@ import screed import argparse import textwrap -import khmer +from khmer import __version__ from khmer.kfile import (check_input_files, check_space, add_output_compression_type, get_file_writer) -from khmer.khmer_args import info, sanitize_epilog +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) from khmer.utils import write_record DEFAULT_MAX_SIZE = int(1e6) @@ -42,7 +43,7 @@ def read_partition_file(filename): def get_parser(): - epilog = """ + epilog = """\ Example (results will be in ``example.group0000.fa``):: load-into-graph.py -k 20 example tests/test-data/random-20-a.fa @@ -51,15 +52,15 @@ def get_parser(): annotate-partitions.py -k 20 example tests/test-data/random-20-a.fa extract-partitions.py example random-20-a.fa.part - (extract-partitions.py will produce a partition size distribution - in .dist. The columns are: (1) number of reads, (2) count - of partitions with n reads, (3) cumulative sum of partitions, - (4) cumulative sum of reads.) + (:program:`extract-partitions.py` will produce a partition size + distribution in .dist. The columns are: (1) number of reads, + (2) count of partitions with n reads, (3) cumulative sum of partitions, + (4) cumulative sum of reads.) 
""" parser = argparse.ArgumentParser( description="Separate sequences that are annotated with partitions " "into grouped files.", epilog=textwrap.dedent(epilog), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=ComboFormatter) parser.add_argument('prefix', metavar='output_filename_prefix') parser.add_argument('part_filenames', metavar='input_partition_filename', nargs='+') @@ -75,8 +76,8 @@ def get_parser(): parser.add_argument('--output-unassigned', '-U', default=False, action='store_true', help='Output unassigned sequences, too') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_output_compression_type(parser) @@ -86,7 +87,7 @@ def get_parser(): # pylint: disable=too-many-statements def main(): # pylint: disable=too-many-locals,too-many-branches info('extract-partitions.py', ['graph']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() distfilename = args.prefix + '.dist' diff --git a/scripts/fastq-to-fasta.py b/scripts/fastq-to-fasta.py index 5cfb785aea..6fdb548731 100755 --- a/scripts/fastq-to-fasta.py +++ b/scripts/fastq-to-fasta.py @@ -18,15 +18,18 @@ import sys import argparse import screed +from khmer import __version__ from khmer.kfile import (add_output_compression_type, get_file_writer, is_block, describe_file_handle) from khmer.utils import write_record +from khmer.khmer_args import (sanitize_help, ComboFormatter, info, + _VersionStdErrAction) def get_parser(): parser = argparse.ArgumentParser( description='Converts FASTQ format (.fq) files to FASTA format (.fa).', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=ComboFormatter) parser.add_argument('input_sequence', help='The name 
of the input' ' FASTQ sequence file.') @@ -38,12 +41,15 @@ def get_parser(): parser.add_argument('-n', '--n_keep', default=False, action='store_true', help='Option to keep reads containing \'N\'s in ' 'input_sequence file. Default is to drop reads') + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) add_output_compression_type(parser) return parser def main(): - args = get_parser().parse_args() + info('fastq-to-fasta.py') + args = sanitize_help(get_parser()).parse_args() print(('fastq from ', args.input_sequence), file=sys.stderr) outfp = get_file_writer(args.output, args.gzip, args.bzip) diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py index bdb946ee94..707d7e7c0e 100755 --- a/scripts/filter-abund-single.py +++ b/scripts/filter-abund-single.py @@ -27,7 +27,7 @@ from khmer import khmer_args from khmer.khmer_args import (build_counting_args, report_on_config, add_threading_args, info, calculate_graphsize, - sanitize_epilog) + sanitize_help) from khmer.kfile import (check_input_files, check_space, check_space_for_graph, add_output_compression_type, @@ -37,8 +37,9 @@ def get_parser(): - epilog = """ - Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt. + epilog = """\ + Trimmed sequences will be placed in + ``${input_sequence_filename}.abundfilt``. This script is constant memory. 
@@ -69,7 +70,7 @@ def get_parser(): def main(): info('filter-abund-single.py', ['counting', 'SeqAn']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) @@ -131,14 +132,12 @@ def process_fn(record): tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) - print('output in', outfile, file=sys.stderr) + print('output in', outfile.name, file=sys.stderr) if args.savegraph: print('Saving k-mer countgraph filename', args.savegraph, file=sys.stderr) - print('...saving to', args.savegraph, file=sys.stderr) graph.save(args.savegraph) - print('wrote to: ', outfile, file=sys.stderr) if __name__ == '__main__': main() diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py index d8f7c67fb9..bd2819ed15 100755 --- a/scripts/filter-abund.py +++ b/scripts/filter-abund.py @@ -24,7 +24,7 @@ import sys from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader from khmer.khmer_args import (ComboFormatter, add_threading_args, info, - sanitize_epilog) + sanitize_help, _VersionStdErrAction) from khmer.kfile import (check_input_files, check_space, add_output_compression_type, get_file_writer) from khmer import __version__ @@ -34,10 +34,11 @@ def get_parser(): - epilog = """ - Trimmed sequences will be placed in `${input_sequence_filename}.abundfilt` - for each input sequence file. If the input sequences are from RNAseq or - metagenome sequencing then :option:`--variable-coverage` should be used. + epilog = """\ + Trimmed sequences will be placed in + ``${input_sequence_filename}.abundfilt`` for each input sequence file. If + the input sequences are from RNAseq or metagenome sequencing then + :option:`--variable-coverage` should be used. 
Example:: @@ -70,7 +71,7 @@ def get_parser(): help='Output the trimmed sequences into a single file ' 'with the given filename instead of creating a new ' 'file for each input file.') - parser.add_argument('--version', action='version', + parser.add_argument('--version', action=_VersionStdErrAction, version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') @@ -80,7 +81,7 @@ def get_parser(): def main(): info('filter-abund.py', ['counting']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() check_input_files(args.input_graph, args.force) infiles = args.input_filename diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py index 65999b9af5..5448286137 100755 --- a/scripts/filter-stoptags.py +++ b/scripts/filter-stoptags.py @@ -19,35 +19,35 @@ from __future__ import print_function import os -import khmer import argparse import textwrap import sys +from khmer import __version__, Nodegraph from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info, sanitize_epilog +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) # @CTB K should be loaded from file... DEFAULT_K = 32 def get_parser(): - epilog = """ + epilog = """\ Load stoptags in from the given `.stoptags` file and use them to trim or remove the sequences in ``. Trimmed sequences will be placed in `.stopfilt`. 
""" parser = argparse.ArgumentParser( description="Trim sequences at stoptags.", - epilog=textwrap.dedent(epilog), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('--ksize', '-k', default=DEFAULT_K, type=int, help='k-mer size') parser.add_argument('stoptags_file', metavar='input_stoptags_filename') parser.add_argument('input_filenames', metavar='input_sequence_filename', nargs='+') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser @@ -55,7 +55,7 @@ def get_parser(): def main(): info('filter-stoptags.py', ['graph']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() stoptags = args.stoptags_file infiles = args.input_filenames @@ -65,7 +65,7 @@ def main(): check_space(infiles, args.force) print('loading stop tags, with K', args.ksize, file=sys.stderr) - nodegraph = khmer.Nodegraph(args.ksize, 1, 1) + nodegraph = Nodegraph(args.ksize, 1, 1) nodegraph.load_stop_tags(stoptags) def process_fn(record): diff --git a/scripts/find-knots.py b/scripts/find-knots.py index 0d58dc260f..bea14d69d0 100755 --- a/scripts/find-knots.py +++ b/scripts/find-knots.py @@ -24,7 +24,8 @@ from khmer.kfile import check_input_files, check_space from khmer import khmer_args from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args, - report_on_config, sanitize_epilog) + report_on_config, sanitize_help, + ComboFormatter) # counting hash parameters. 
DEFAULT_COUNTING_HT_SIZE = 3e6 # number of bytes @@ -48,14 +49,15 @@ def get_parser(): - epilog = """ + epilog = """\ Load an k-mer nodegraph/tagset pair created by :program:`load-into-graph.py`, and a set of pmap files created by :program:`partition-graph.py`. Go through each pmap file, select the largest partition in each, and do the same kind of traversal as in :program:`make-initial-stoptags.py` from each of the waypoints in that - partition; this should identify all of the HCKs in that partition. These - HCKs are output to .stoptags after each pmap file. + partition; this should identify all of the Highly Connected Kmers in that + partition. These HCKs are output to ``.stoptags`` after each + pmap file. Parameter choice is reasonably important. See the pipeline in :doc:`partitioning-big-data` for an example run. @@ -78,7 +80,13 @@ def get_parser(): def main(): info('find-knots.py', ['graph']) - args = get_parser().parse_args() + parser = get_parser() + parser.epilog = parser.epilog.replace( + ":doc:`partitioning-big-data`", + "http://khmer.readthedocs.org/en/stable/user/" + "partitioning-big-data.html" + ) + args = sanitize_help(parser).parse_args() graphbase = args.graphbase diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py index 873dab5d80..69dabf7ef0 100755 --- a/scripts/interleave-reads.py +++ b/scripts/interleave-reads.py @@ -23,9 +23,10 @@ import os import textwrap import argparse -import khmer +from khmer import __version__ from khmer.kfile import check_input_files, check_space, is_block -from khmer.khmer_args import info, sanitize_epilog +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) from khmer.kfile import (add_output_compression_type, get_file_writer, describe_file_handle) from khmer.utils import (write_record_pair, check_is_left, check_is_right, @@ -38,7 +39,7 @@ def get_parser(): - epilog = """ + epilog = """\ The output is an interleaved set of reads, with each read in paired with a 
read in . By default, the output goes to stdout unless :option:`-o`/:option:`--output` is specified. @@ -49,20 +50,19 @@ def get_parser(): Example:: - interleave-reads.py tests/test-data/paired.fq.1 \\ - tests/test-data/paired.fq.2 -o paired.fq""" + interleave-reads.py tests/test-data/paired.fq.1 \\ + tests/test-data/paired.fq.2 -o paired.fq""" parser = argparse.ArgumentParser( description='Produce interleaved files from R1/R2 paired files', - epilog=textwrap.dedent(epilog), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('left') parser.add_argument('right') parser.add_argument('-o', '--output', metavar="filename", type=argparse.FileType('wb'), default=sys.stdout) - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_output_compression_type(parser) @@ -71,7 +71,7 @@ def get_parser(): def main(): info('interleave-reads.py') - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() check_input_files(args.left, args.force) check_input_files(args.right, args.force) diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py index c86cddbdc9..023e270d02 100755 --- a/scripts/load-into-counting.py +++ b/scripts/load-into-counting.py @@ -23,30 +23,30 @@ from khmer import khmer_args from khmer.khmer_args import (build_counting_args, report_on_config, info, add_threading_args, calculate_graphsize, - sanitize_epilog) + sanitize_help) from khmer.kfile import check_file_writable from khmer.kfile import check_input_files from khmer.kfile import check_space_for_graph def get_parser(): - epilog = """ - Note: with :option:`-b` the output will be the exact size of 
the - k-mer countgraph and this script will use a constant amount of memory. - In exchange k-mer counts will stop at 255. The memory usage of this script - with :option:`-b` will be about 1.15x the product of the :option:`-x` and - :option:`-N` numbers. + epilog = """\ + Note: with :option:`-b`/:option:`--no-bigcount` the output will be the + exact size of the k-mer countgraph and this script will use a constant + amount of memory. In exchange k-mer counts will stop at 255. The memory + usage of this script with :option:`-b` will be about 1.15x the product of + the :option:`-x` and :option:`-N` numbers. Example:: - load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa + load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: - load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa + load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa """ parser = build_counting_args("Build a k-mer countgraph from the given" @@ -75,7 +75,7 @@ def main(): info('load-into-counting.py', ['counting', 'SeqAn']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() report_on_config(args) base = args.output_countgraph_filename diff --git a/scripts/load-into-graph.py b/scripts/load-into-graph.py index 10cbe5bc91..e65769b08f 100755 --- a/scripts/load-into-graph.py +++ b/scripts/load-into-graph.py @@ -16,7 +16,7 @@ import sys -from khmer.khmer_args import build_nodegraph_args +from khmer.khmer_args import build_nodegraph_args, info from oxli import build_graph @@ -29,6 +29,7 @@ def get_parser(): if __name__ == '__main__': + info('load-into-graph.py', ['graph', 'SeqAn']) build_graph.main(get_parser().parse_args()) # vim: set ft=python ts=4 sts=4 sw=4 et tw=79: diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py index 212521304b..2bf88b7065 100755 --- 
a/scripts/make-initial-stoptags.py +++ b/scripts/make-initial-stoptags.py @@ -17,7 +17,7 @@ import textwrap import khmer from khmer import khmer_args -from khmer.khmer_args import (build_counting_args, info, sanitize_epilog) +from khmer.khmer_args import (build_counting_args, info, sanitize_help) from khmer.kfile import check_input_files DEFAULT_SUBSET_SIZE = int(1e4) @@ -41,7 +41,7 @@ def get_parser(): - epilog = """ + epilog = """\ Loads a k-mer nodegraph/tagset pair created by :program:`load-into-graph.py`, and does a small set of traversals from graph waypoints; on these traversals, @@ -73,7 +73,7 @@ def get_parser(): def main(): info('make-initial-stoptags.py', ['graph']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() graphbase = args.graphbase diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py index 8a33271710..777e54e2c6 100755 --- a/scripts/merge-partitions.py +++ b/scripts/merge-partitions.py @@ -22,21 +22,23 @@ import textwrap import khmer import sys +from khmer import __version__ from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info, sanitize_epilog +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) DEFAULT_K = 32 def get_parser(): - epilog = """ - Take the `${graphbase}.subset.#.pmap` files and merge them all into a - single ${graphbase}.pmap.merged file for :program:`annotate-partitions.py` - to use. + epilog = """\ + Take the ``${graphbase}.subset.#.pmap`` files and merge them all into a + single ``${graphbase}.pmap.merged`` file for + :program:`annotate-partitions.py` to use. 
""" parser = argparse.ArgumentParser( description="Merge partition map '.pmap' files.", - epilog=textwrap.dedent(epilog)) + epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K, help="k-mer size (default: %d)" % DEFAULT_K) parser.add_argument('--keep-subsets', dest='remove_subsets', @@ -44,8 +46,8 @@ def get_parser(): help='Keep individual subsets (default: False)') parser.add_argument('graphbase', help='basename for input and output ' 'files') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser @@ -53,7 +55,7 @@ def get_parser(): def main(): info('merge-partitions.py', ['graph']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() output_file = args.graphbase + '.pmap.merged' pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 35904b655c..3c9846126b 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -28,7 +28,7 @@ from contextlib import contextmanager from khmer.khmer_args import (build_counting_args, add_loadgraph_args, report_on_config, info, calculate_graphsize, - sanitize_epilog) + sanitize_help) import argparse from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists, add_output_compression_type, @@ -182,7 +182,7 @@ def catch_io_errors(ifile, out, single_out, force, corrupt_files): def get_parser(): - epilog = """ + epilog = """\ Discard sequences based on whether or not their median k-mer abundance lies above a specified cutoff. Kept sequences will be placed in .keep. 
@@ -232,9 +232,9 @@ def get_parser(): Example:: - normalize-by-median.py -k 17 -d 2 -s test.ct \\ + normalize-by-median.py -k 17 -s test.ct \\ tests/test-data/test-abund-read-2.fa \\ - tests/test-data/test-fastq-reads""" + tests/test-data/test-fastq-reads.fq""" parser = build_counting_args( descr="Do digital normalization (remove mostly redundant sequences)", epilog=textwrap.dedent(epilog)) @@ -252,7 +252,7 @@ def get_parser(): help='include a file of unpaired reads to which ' '-p/--paired does not apply.') parser.add_argument('-s', '--savegraph', metavar="filename", default='', - help='save the k-mer countgraph to disk after all' + help='save the k-mer countgraph to disk after all ' 'reads are loaded.') parser.add_argument('-R', '--report', metavar='report_filename', type=argparse.FileType('w')) @@ -277,12 +277,10 @@ def get_parser(): def main(): # pylint: disable=too-many-branches,too-many-statements - - parser = sanitize_epilog(get_parser()) - parser = get_parser() + info('normalize-by-median.py', ['diginorm']) + parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) - info('normalize-by-median.py', ['diginorm']) report_on_config(args) report_fp = args.report diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py index af3b2ee2f1..7c89e69105 100755 --- a/scripts/partition-graph.py +++ b/scripts/partition-graph.py @@ -21,9 +21,12 @@ import gc import os.path import argparse -import khmer +import textwrap import sys -from khmer.khmer_args import (add_threading_args, info, sanitize_epilog) + +from khmer import __version__, load_nodegraph +from khmer.khmer_args import (add_threading_args, info, sanitize_help, + ComboFormatter, _VersionStdErrAction) from khmer.kfile import check_input_files # stdlib queue module was renamed on Python 3 @@ -36,10 +39,10 @@ DEFAULT_N_THREADS = 4 -def worker(queue, basename, stop_big_traversals): +def worker(tasks, basename, stop_big_traversals): while True: try: - (nodegraph, index, 
start, stop) = queue.get(False) + (nodegraph, index, start, stop) = tasks.get(False) except queue.Empty: print('exiting', file=sys.stderr) return @@ -63,14 +66,14 @@ def worker(queue, basename, stop_big_traversals): def get_parser(): - epilog = """ - The resulting partition maps are saved as `${basename}.subset.#.pmap` + epilog = """\ + The resulting partition maps are saved as ``${basename}.subset.#.pmap`` files. """ parser = argparse.ArgumentParser( description="Partition a sequence graph based upon waypoint " - "connectivity", epilog=epilog, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + "connectivity", epilog=textwrap.dedent(epilog), + formatter_class=ComboFormatter) parser.add_argument('basename', help="basename of the input k-mer" "nodegraph + tagset files") @@ -82,8 +85,8 @@ def get_parser(): parser.add_argument('--no-big-traverse', action='store_true', default=False, help='Truncate graph joins at big ' 'traversals') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_threading_args(parser) @@ -92,7 +95,7 @@ def get_parser(): def main(): info('partition-graph.py', ['graph']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() basename = args.basename filenames = [basename, basename + '.tagset'] @@ -107,7 +110,7 @@ def main(): print('--', file=sys.stderr) print('loading nodegraph %s' % basename, file=sys.stderr) - nodegraph = khmer.load_nodegraph(basename) + nodegraph = load_nodegraph(basename) nodegraph.load_tagset(basename + '.tagset') # do we want to load stop tags, and do they exist? 
diff --git a/scripts/readstats.py b/scripts/readstats.py index 5eabe94817..9a003dabd8 100755 --- a/scripts/readstats.py +++ b/scripts/readstats.py @@ -20,12 +20,14 @@ import argparse import textwrap -from khmer.khmer_args import sanitize_epilog +from khmer import __version__ +from khmer.khmer_args import (sanitize_help, ComboFormatter, info, + _VersionStdErrAction) def get_parser(): descr = "Display summary statistics for one or more FASTA/FASTQ files." - epilog = (""" + epilog = """\ Report number of bases, number of sequences, and average sequence length for one or more FASTA/FASTQ files; and report aggregate statistics at end. @@ -35,10 +37,11 @@ def get_parser(): Example:: readstats.py tests/test-data/test-abund-read-2.fa - """) + """ - parser = argparse.ArgumentParser(description=descr, - epilog=textwrap.dedent(epilog)) + parser = argparse.ArgumentParser( + description=descr, formatter_class=ComboFormatter, + epilog=textwrap.dedent(epilog),) parser.add_argument('filenames', nargs='+') parser.add_argument('-o', '--output', dest='outfp', metavar="filename", help="output file for statistics; defaults to stdout.", @@ -46,6 +49,8 @@ def get_parser(): parser.add_argument('--csv', default=False, action='store_true', help='Use the CSV format for the statistics, ' 'including column headers.') + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) return parser @@ -143,8 +148,8 @@ def analyze_file(filename): def main(): """Main function - run when executed as a script.""" - parser = sanitize_epilog(get_parser()) - args = parser.parse_args() + info('readstats.py') + args = sanitize_help(get_parser()).parse_args() total_bp = 0 total_seqs = 0 diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index 4c998800c6..87451097bc 100755 --- a/scripts/sample-reads-randomly.py +++ b/scripts/sample-reads-randomly.py @@ -26,10 +26,11 @@ import textwrap import sys -import khmer +from khmer import 
__version__ from khmer.kfile import (check_input_files, add_output_compression_type, get_file_writer) -from khmer.khmer_args import info, sanitize_epilog +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) from khmer.utils import write_record, broken_paired_reader DEFAULT_NUM_READS = int(1e5) @@ -38,8 +39,7 @@ def get_parser(): - epilog = (""" - + epilog = """\ Take a list of files containing sequences, and subsample 100,000 sequences (:option:`-N`/:option:`--num_reads`) uniformly, using reservoir sampling. Stop after first 100m sequences @@ -47,17 +47,16 @@ def get_parser(): but take :option:`-S`/:option:`--samples` samples if specified. The output is placed in :option:`-o`/:option:`--output` - (for a single sample) or in `.subset.0` to `.subset.S-1` + (for a single sample) or in ``.subset.0`` to ``.subset.S-1`` (for more than one sample). This script uses the `reservoir sampling `__ algorithm. - """) # noqa + """ parser = argparse.ArgumentParser( description="Uniformly subsample sequences from a collection of files", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - epilog=textwrap.dedent(epilog)) + formatter_class=ComboFormatter, epilog=textwrap.dedent(epilog)) parser.add_argument('filenames', nargs='+') parser.add_argument('-N', '--num_reads', type=int, dest='num_reads', @@ -73,8 +72,8 @@ def get_parser(): parser.add_argument('-o', '--output', dest='output_file', type=argparse.FileType('wb'), metavar="filename", default=None) - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exits') add_output_compression_type(parser) @@ -83,7 +82,13 @@ def get_parser(): def main(): info('sample-reads-randomly.py') - args = sanitize_epilog(get_parser()).parse_args() + 
parser = get_parser() + parser.epilog = parser.epilog.replace( + "`reservoir sampling\n" + "`__ algorithm.", + "reservoir sampling algorithm. " + "http://en.wikipedia.org/wiki/Reservoir_sampling") + args = sanitize_help(parser).parse_args() for _ in args.filenames: check_input_files(_, args.force) diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py index 0c2e34b200..830685e47d 100755 --- a/scripts/split-paired-reads.py +++ b/scripts/split-paired-reads.py @@ -22,8 +22,9 @@ import os import textwrap import argparse -import khmer -from khmer.khmer_args import info, sanitize_epilog +from khmer import __version__ +from khmer.khmer_args import (info, sanitize_help, ComboFormatter, + _VersionStdErrAction) from khmer.utils import (write_record, broken_paired_reader, UnpairedReadsError) from khmer.kfile import (check_input_files, check_space, @@ -32,7 +33,7 @@ def get_parser(): - epilog = """ + epilog = """\ Some programs want paired-end read input in the One True Format, which is interleaved; other programs want input in the Insanely Bad Format, with left- and right- reads separated. This reformats the former to the latter. 
@@ -55,7 +56,7 @@ def get_parser(): Example:: - split-paired-reads.py -o ~/reads-go-here tests/test-data/paired.fq + split-paired-reads.py -0 reads-output-file tests/test-data/paired.fq Example:: @@ -64,7 +65,7 @@ def get_parser(): parser = argparse.ArgumentParser( description='Split interleaved reads into two files, left and right.', epilog=textwrap.dedent(epilog), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=ComboFormatter) parser.add_argument('infile', nargs='?', default='/dev/stdin') @@ -82,8 +83,8 @@ def get_parser(): parser.add_argument('-2', '--output-second', metavar='output_second', default=None, help='Output "right" reads to this ' 'file', type=argparse.FileType('wb')) - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) + parser.add_argument('--version', action=_VersionStdErrAction, + version='khmer {v}'.format(v=__version__)) parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_output_compression_type(parser) @@ -92,7 +93,7 @@ def get_parser(): def main(): info('split-paired-reads.py') - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() infile = args.infile diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 1680d11302..1d2f89b3cc 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -29,7 +29,7 @@ from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args, report_on_config, calculate_graphsize, - sanitize_epilog) + sanitize_help) from khmer.utils import write_record, write_record_pair, broken_paired_reader from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists, add_output_compression_type, @@ -50,21 +50,22 @@ def trim_record(read, trim_at): def get_parser(): - epilog = """ - The output is one file for each input file, .abundtrim, placed - in the current directory. 
This output contains the input sequences + epilog = """\ + The output is one file for each input file, ``.abundtrim``, + placed in the current directory. This output contains the input sequences trimmed at low-abundance k-mers. - The ``-V/--variable-coverage`` parameter will, if specified, - prevent elimination of low-abundance reads by only trimming + The :option:`-V`/:option:`--variable-coverage` parameter will, if + specified, prevent elimination of low-abundance reads by only trimming low-abundance k-mers from high-abundance reads; use this for non-genomic data sets that may have variable coverage. Note that the output reads will not necessarily be in the same order as the reads in the input files; if this is an important consideration, - use ``load-into-counting.py`` and ``filter-abund.py``. However, read - pairs will be kept together, in "broken-paired" format; you can use - ``extract-paired-reads.py`` to extract read pairs and orphans. + use :program:`load-into-countgraph.py` and :program:`filter-abund.py`. + However, read pairs will be kept together, in "broken-paired" format; you + can use :program:`extract-paired-reads.py` to extract read pairs and + orphans. Example:: @@ -113,7 +114,7 @@ def get_parser(): def main(): info('trim-low-abund.py', ['streaming']) - parser = sanitize_epilog(get_parser()) + parser = sanitize_help(get_parser()) args = parser.parse_args() ### diff --git a/scripts/unique-kmers.py b/scripts/unique-kmers.py index 6e2eafeb5f..b5779e1893 100755 --- a/scripts/unique-kmers.py +++ b/scripts/unique-kmers.py @@ -24,7 +24,7 @@ import khmer from khmer.khmer_args import (DEFAULT_K, info, ComboFormatter, - _VersionStdErrAction, sanitize_epilog) + _VersionStdErrAction, sanitize_help) from khmer.utils import write_record from khmer.khmer_args import graphsize_args_report from khmer import __version__ @@ -33,9 +33,9 @@ def get_parser(): descr = "Estimate number of unique k-mers, with precision <= ERROR_RATE." 
- epilog = (""" - A HyperLogLog counter is used to do cardinality estimation. Since this counter - is based on a tradeoff between precision and memory consumption, + epilog = """\ + A HyperLogLog counter is used to do cardinality estimation. Since this + counter is based on a tradeoff between precision and memory consumption, the :option:`-e`/:option:`--error-rate` can be used to control how much memory will be used. In practice the memory footprint is small even at low error rates (< 0.01). @@ -69,7 +69,7 @@ def get_parser(): Example:: unique-kmers.py -R unique_count -k 30 \\ - tests/test-data/test-abund-read-paired.fa""") # noqa + tests/test-data/test-abund-read-paired.fa""" # noqa parser = argparse.ArgumentParser( description=descr, epilog=textwrap.dedent(epilog), formatter_class=ComboFormatter) @@ -109,7 +109,7 @@ def get_parser(): def main(): info('unique-kmers.py', ['SeqAn', 'hll']) - args = sanitize_epilog(get_parser()).parse_args() + args = sanitize_help(get_parser()).parse_args() total_hll = khmer.HLLCounter(args.error_rate, args.ksize) diff --git a/setup.py b/setup.py index 8d2d2bfb4a..918e9c7fd3 100755 --- a/setup.py +++ b/setup.py @@ -179,18 +179,18 @@ def check_for_openmp(): # http://docs.python.org/2/distutils/setupscript.html # additiona-meta-data note #3 "url": 'https://khmer.readthedocs.org/', - "packages": ['khmer', 'khmer.tests', 'oxli'], + "packages": ['khmer', 'khmer.tests'], "package_dir": {'khmer.tests': 'tests'}, "install_requires": ['screed >= 0.9', 'bz2file'], "extras_require": {':python_version=="2.6"': ['argparse>=1.2.1'], 'docs': ['sphinx', 'sphinxcontrib-autoprogram'], 'tests': ['nose >= 1.0']}, "scripts": SCRIPTS, - "entry_points": { - 'console_scripts': [ - "oxli = oxli:main" - ] - }, + # "entry_points": { # Not ready for distribution yet. 
+ # 'console_scripts': [ + # "oxli = oxli:main" + # ] + # }, "ext_modules": [EXTENSION_MOD, ], # "platforms": '', # empty as is conveyed by the classifiers below # "license": '', # empty as is conveyed by the classifier below diff --git a/tests/khmer_tst_utils.py b/tests/khmer_tst_utils.py index ddded56547..b17febe214 100644 --- a/tests/khmer_tst_utils.py +++ b/tests/khmer_tst_utils.py @@ -121,6 +121,7 @@ def runscript(scriptname, args, in_directory=None, oldout, olderr = sys.stdout, sys.stderr sys.stdout = StringIO() + sys.stdout.name = "StringIO" sys.stderr = StringIO() if in_directory: diff --git a/tests/test_normalize_by_median.py b/tests/test_normalize_by_median.py index 0e620c812b..c5d5e5d892 100644 --- a/tests/test_normalize_by_median.py +++ b/tests/test_normalize_by_median.py @@ -97,7 +97,7 @@ def test_normalize_by_median_quiet(): (status, out, err) = utils.runscript(script, args, in_dir) assert len(out) == 0, out - assert len(err) == 0, err + assert len(err) < 460, len(err) outfile = infile + '.keep' assert os.path.exists(outfile), outfile diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 748a7dcbf7..4cd28818ed 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -23,6 +23,7 @@ import bz2 import gzip import io +import re from . import khmer_tst_utils as utils import khmer @@ -798,6 +799,7 @@ def test_load_graph(): assert x == (1, 0), x +@attr('known_failing') def test_oxli_build_graph(): script = 'oxli' args = ['build-graph', '-x', '1e7', '-N', '2', '-k', '20'] @@ -828,6 +830,7 @@ def test_oxli_build_graph(): assert x == (1, 0), x +@attr('known_failing') def test_oxli_build_graph_unique_kmers_arg(): script = 'oxli' args = ['build-graph', '-x', '1e7', '-N', '2', '-k', '20', '-U', '3960'] @@ -860,6 +863,7 @@ def test_oxli_build_graph_unique_kmers_arg(): assert x == (1, 0), x +@attr('known_failing') def test_oxli_nocommand(): script = 'oxli' @@ -890,6 +894,7 @@ def test_load_graph_no_tags(): # loading the ht file... 
+@attr('known_failing') def test_oxli_build_graph_no_tags(): script = 'oxli' args = ['build-graph', '-x', '1e7', '-N', '2', '-k', '20', '-n'] @@ -927,6 +932,7 @@ def test_load_graph_fail(): assert "** ERROR: the graph structure is too small" in err +@attr('known_failing') def test_oxli_build_graph_fail(): script = 'oxli' args = ['build-graph', '-x', '1e3', '-N', '2', '-k', '20'] # use small HT @@ -963,6 +969,7 @@ def test_load_graph_write_fp(): assert 'false positive rate estimated to be 0.002' in data +@attr('known_failing') def test_oxli_build_graph_write_fp(): script = 'oxli' # use small HT @@ -997,6 +1004,7 @@ def test_load_graph_multithread(): (status, out, err) = utils.runscript(script, args) +@attr('known_failing') def test_oxli_build_graph_multithread(): script = 'oxli' @@ -3678,3 +3686,17 @@ def test_unique_kmers_multiple_inputs(): assert ('Estimated number of unique 20-mers in {0}: 232'.format(infiles[1]) in err) assert 'Total estimated number of unique 20-mers: 4170' in err + + +def check_version(scriptname): + version = re.compile("^khmer .*$", re.MULTILINE) + status, out, err = utils.runscript(scriptname, ["--version"]) + assert status == 0, status + assert "publication" in err, err + assert version.search(err) is not None, err + + +def test_version(): + for entry in os.listdir(utils.scriptpath()): + if entry.endswith(".py"): + yield check_version, entry