diff --git a/ChangeLog b/ChangeLog index cdbb35017a..408023df19 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,38 @@ +2015-06-19 Titus Brown + + * khmer/__init__.py: split CountingHash into _CountingHash (CPython) and + CountingHash to mimic Hashbits behavior; pass IOError through + extract_countinghash_info and extract_hashbits_info so that + file-does-not-exist errors are correctly reported; fixed FP rate reporting; + changed to using get_n_primes_near_x to build hashtable sizes; removed + get_n_primes_above_x, new_hashbits, and new_counting_hash functions. + * khmer/_khmer.cc: changed tp_flags for KCountingHash so that it could + be a base class. + * khmer/khmer_args.py: removed environment variable override for hash size + defaults; added -M/--max_memory_usage, and functions create_nodegraph() + and create_countgraph(). Also renamed --min-tablesize to --max-tablesize. + * khmer/kfile.py: fixed check_space_for_hashtable to depend on args obj. + * oxli/build_graph.py, scripts/{annotate-partitions.py,count-overlap.py, + do-partition.py,filter-stoptags.py, + merge-partitions.py}, sandbox/{assembly-diff.py,assembly-diff-2.py, + bloom-count-intersection.py,bloom-count.py,build-sparse-graph.py, + collect-reads.py,saturate-by-median.py, graph-size.py,print-stoptags.py, + print-tagset.py,stoptags-by-position.py, subset-report.py, + sweep-out-reads-with-contigs.py,sweep-reads2.py,sweep-reads3.py}: changed + hashtype over to 'nodegraph' and 'countgraph' in call to report_on_config; + replaced counting hash/hashbits creation with new khmer_args create* + functions, and/or new_counting_hash/new_hashbits with CountingHash/Hashbits. + * doc/scripts.rst: updated hashtable size help text. + * doc/whats-new-2.0.rst: updated with description of -M/--max-memory-usage. + * tests/test*.py: switched from new_counting_hash to CountingHash, and + new_hashbits to Hashbits; adjusts tests for new behavior of hashtable + size calculation. 
+ * tests/test_hashbits_obj.py: merged into test_hashbits.py and removed file. + * tests/test_script_arguments.py: updated for new check_space_for_hashtable + behavior; added tests for create_countgraph and create_nodegraph. + * tests/test_counting_single.py: fixed countgraph size & palindrome testing + behavior in test_complete_no_collision. + 2015-06-19 Titus Brown * Makefile: temporarily disable 'huge' tests on Linux. diff --git a/doc/user/choosing-table-sizes.rst b/doc/user/choosing-table-sizes.rst index bbc112d1ce..caba889af4 100644 --- a/doc/user/choosing-table-sizes.rst +++ b/doc/user/choosing-table-sizes.rst @@ -1,53 +1,55 @@ .. vim: set filetype=rst -============================== -Choosing table sizes for khmer -============================== +========================== +Setting khmer memory usage +========================== If you look at the documentation for the scripts (:doc:`scripts`) you'll -see two mysterious parameters -- :option:`-N` and :option:`-x`, or, more -verbosely, :option:`-n_tables` and :option:`--tablesize`. What are these, and -how do you specify them? +see a :option:`-M` parameter that sets the maximum memory usage for +any script that uses k-mer counting tables or k-mer graphs. What is this? + +khmer uses a special data structure that lets it store counting tables +and k-mer graphs in very low memory; the trick is that you must fix +the amount of memory khmer can use before running it. (See `Pell et +al., 2012 `__ and `Zhang +et al., 2014 `__ for the +details.) This is what the :option:`-M` parameter does. + +If you set it too low, khmer will warn you to set it higher at the end. +See below for some good choices for various kinds of data. + +**Note for khmer 1.x users:** as of khmer 2.0, the :option:`-M` +parameter sets the :option:`-N`/:option:`--n_tables` and +:option:`-x`/:option:`--max-tablesize` parameters automatically. +You can still set these parameters directly if you wish. 
The really short version ======================== There is no way (except for experience, rules of thumb, and intuition) to -know what these parameters should be up front. So, make the product of -these two parameters be the size of your available memory:: +know what this parameter should be up front. So, use the maximum +available memory:: - -N 4 -x 4e9 + -M 16e9 -for a machine with 16 GB of free memory, for example. Also see -the rules of thumb, below. +for a machine with 16 GB of free memory, for example. The short version ================= -These parameters specify the maximum memory usage of the primary data +This parameter specifies the maximum memory usage of the primary data structure in khmer, which is basically N big hash tables of size x. The **product** of the number of hash tables and the size of the hash -tables specifies the total amount of memory used. +tables specifies the total amount of memory used, which is what the +:option:`-M` parameter sets. -This table is used to track k-mers. If it is too small, khmer -will fail in various ways (and should complain), but there is no harm +These tables are used to track k-mers. If they are too small, khmer +will fail in various ways (and will complain), but there is no harm in making it too large. So, **the absolute safest thing to do is to specify as much memory as is available**. Most scripts will inform you of the total memory usage, and (at the end) will complain if it's too small. -For normalize-by-median, khmer uses one byte per hash entry, so: if -you had 16 GB of available RAM, you should specify something like ``-N -4 -x 4e9``, which multiplies out to about 16 GB. - -For the graph partitioning stuff, khmer uses only 1 bit per k-mer, so -you can multiple your available memory by 8: for 16 GB of RAM, you could -use :: - - -N 4 -x 32e9 - -which multiplies out to 128 Gbits of RAM, or 16 Gbytes. 
- Life is a bit more complicated than this, however, because some scripts -- load-into-counting and load-graph -- keep ancillary information that will consume memory beyond this table data structure. So if you run out of @@ -124,26 +126,24 @@ an error-code. Rules of thumb -------------- -Just use -N 4, always, and vary the -x parameter. - For digital normalization, we recommend: - - ``-x 2e9`` for any amount of sequencing for a single microbial genome, + - ``-M 8e9`` for any amount of sequencing for a single microbial genome, MDA-amplified or single colony. - - ``-x 4e9`` for up to a billion mRNAseq reads from any organism. Past that, + - ``-M 16e9`` for up to a billion mRNAseq reads from any organism. Past that, increase it. - - ``-x 8e9`` for most eukaryotic genome samples. + - ``-M 32e9`` for most eukaryotic genome samples. - - ``-x 8e9`` will also handle most "simple" metagenomic samples (HMP on down) + - ``-M 32e9`` will also handle most "simple" metagenomic samples (HMP on down) - For metagenomic samples that are more complex, such as soil or marine, - start as high as possible. For example, we are using ``-x 64e9`` for + start as high as possible. For example, we are using ``-M 256e9`` for ~300 Gbp of soil reads. For partitioning of complex metagenome samples, we recommend starting as high as you can -- something like half your system memory. So if -you have 256 GB of RAM, use ``-N 4 -x 256e9`` which will use 4 x 256 / -8 = 128 GB of RAM for the basic graph storage, leaving other memory -for the ancillary data structures. +you have 256 GB of RAM, use ``-M 128e9`` which will use 128 GB of RAM +for the basic graph storage, leaving other memory for the ancillary +data structures. diff --git a/doc/user/scripts.rst b/doc/user/scripts.rst index 157af7b654..3bc0cbb8fd 100644 --- a/doc/user/scripts.rst +++ b/doc/user/scripts.rst @@ -10,14 +10,10 @@ distribution. Below is our documentation for these scripts. 
Note that all scripts can be given :option:`-h` which will print out a list of arguments taken by that script. -Many scripts take :option:`-x` and :option:`-N` parameters, which drive khmer's -memory usage. These parameters depend on details of your data set; for more information -on how to choose them, see :doc:`choosing-table-sizes`. - -You can also override the default values of :option:`--ksize`/:option:`-k`, -:option:`--n_tables`/:option:`-N`, and :option:`--min-tablesize`/:option:`-x` with -the environment variables `KHMER_KSIZE`, `KHMER_N_TABLES`, and -`KHMER_MIN_TABLESIZE` respectively. +Scripts that use k-mer counting tables or k-mer graphs take an +:option:`-M` parameter, which sets the maximum memory usage in bytes. +This should generally be set as high as possible; see +:doc:`choosing-table-sizes` for more information. 1. :ref:`scripts-counting` 2. :ref:`scripts-partitioning` diff --git a/doc/whats-new-2.0.rst b/doc/whats-new-2.0.rst index dd1fffb38d..2a7f0c7a07 100644 --- a/doc/whats-new-2.0.rst +++ b/doc/whats-new-2.0.rst @@ -3,10 +3,27 @@ What's New In khmer 2.0? ######################## -All binary khmer formats (presence tables, counting tables, tag sets, stop tags, -and partition subsets) have changed. Files are -now pre-pended with the string ``OXLI`` to indicate that they are from this -project. +Incompatible changes +==================== + +New parameter for tablesize/number of table parameters. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is now a :option:`-M`/:option:`--max-memory-usage` parameter +that sets the number of tables (:option:`-N`/:option:`--n_tables`) +and tablesize (:option:`-x`/:option:`--max-tablesize`) parameters +automatically to match the desired memory usage. + +(:option:`--min-tablesize` was also renamed to +:option:`--max-tablesize` to reflect this more desirable behavior.) + +Binary file formats have changed! 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +All binary khmer formats (presence tables, counting tables, tag sets, +stop tags, and partition subsets) have changed. Files are now +pre-pended with the string ``OXLI`` to indicate that they are from +this project. Files of the above types made in previous versions of khmer are not compatible with v2.0; the reverse is also true. diff --git a/khmer/__init__.py b/khmer/__init__.py index fe7011b5f8..032ca7b688 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -8,7 +8,7 @@ from __future__ import print_function -from khmer._khmer import CountingHash +from khmer._khmer import CountingHash as _CountingHash from khmer._khmer import LabelHash as _LabelHash from khmer._khmer import Hashbits as _Hashbits from khmer._khmer import HLLCounter as _HLLCounter @@ -45,36 +45,6 @@ del get_versions -def new_hashbits(k, starting_size, n_tables=2): - """Return a new hashbits object. Deprecated. - - This factory method is deprecated in favor of creating a Hashbits object - directly via 'new Hashbits(...)'. - - Keyword argument: - k -- kmer size to use - starting_size -- lower bound on hashsize to use - n_tables -- number of hash tables to use (default = 2) - """ - primes = get_n_primes_above_x(n_tables, starting_size) - - return _Hashbits(k, primes) - - -def new_counting_hash(k, starting_size, n_tables=2): - """Return a new countinghash object. - - Keyword arguments: - k -- kmer size to use - starting_size -- lower bound on hashsize to use - n_tables -- number of hash tables to use (default = 2) - n_threads -- number of simultaneous threads to execute (default = 1) - """ - primes = get_n_primes_above_x(n_tables, starting_size) - - return CountingHash(k, primes) - - def load_hashbits(filename): """Load a hashbits object from the given filename and return it. 
@@ -93,7 +63,7 @@ def load_counting_hash(filename): Keyword argument: filename -- the name of the counting_hash file """ - hashtable = CountingHash(1, [1]) + hashtable = _CountingHash(1, [1]) hashtable.load(filename) return hashtable @@ -192,13 +162,19 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2): if fp_all > max_false_pos: print("**", file=sys.stderr) - print( - "** ERROR: the graph structure is too small for ", file=sys.stderr) - print( - "this data set. Increase k-mer presence table ", file=sys.stderr) - print("size/num of tables.", file=sys.stderr) + print("** ERROR: the graph structure is too small for ", + file=sys.stderr) + print("** this data set. Increase data structure size", + file=sys.stderr) + print("** with --max_memory_usage/-M.", file=sys.stderr) + print("**", file=sys.stderr) print("** Do not use these results!!", file=sys.stderr) print("**", file=sys.stderr) + print("** (estimated false positive rate of %.3f;" % fp_all, + file=sys.stderr) + print("max allowable %.3f" % max_false_pos, file=sys.stderr) + print("**", file=sys.stderr) + if not force: sys.exit(1) @@ -229,6 +205,9 @@ def get_n_primes_near_x(number, target): number -- the number of primes to find target -- the number to step backwards from """ + if target == 1 and number == 1: + return [1] + primes = [] i = target - 1 if i % 2 == 0: @@ -237,27 +216,11 @@ def get_n_primes_near_x(number, target): if is_prime(i): primes.append(i) i -= 2 - return primes + if len(primes) != number: + raise Exception("unable to find %d prime numbers < %d" % (number, + target)) -def get_n_primes_above_x(number, target): - """Forward-find primes smaller than target. - - Step forwards until a number of primes (other than 2) have been - found that are smaller than the target and return them. 
- - Keyword arguments: - number -- the number of primes to find - target -- the number to step forwards from - """ - primes = [] - i = target + 1 - if i % 2 == 0: - i += 1 - while len(primes) != number and i > 0: - if is_prime(i): - primes.append(i) - i += 2 return primes @@ -267,6 +230,15 @@ def get_n_primes_above_x(number, target): # Additional functionality can be added to these classes as appropriate. +class CountingHash(_CountingHash): + + def __new__(cls, k, starting_size, n_tables): + primes = get_n_primes_near_x(n_tables, starting_size) + c = _CountingHash.__new__(cls, k, primes) + c.primes = primes + return c + + class LabelHash(_LabelHash): def __new__(cls, k, starting_size, n_tables): @@ -279,8 +251,8 @@ def __new__(cls, k, starting_size, n_tables): class CountingLabelHash(_LabelHash): def __new__(cls, k, starting_size, n_tables): - primes = get_n_primes_above_x(n_tables, starting_size) - hb = CountingHash(k, primes) + primes = get_n_primes_near_x(n_tables, starting_size) + hb = _CountingHash(k, primes) c = _LabelHash.__new__(cls, hb) c.graph = hb return c @@ -289,7 +261,7 @@ def __new__(cls, k, starting_size, n_tables): class Hashbits(_Hashbits): def __new__(cls, k, starting_size, n_tables): - primes = get_n_primes_above_x(n_tables, starting_size) + primes = get_n_primes_near_x(n_tables, starting_size) c = _Hashbits.__new__(cls, k, primes) c.primes = primes return c diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 00bd155612..d58e832d3e 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -3103,7 +3103,7 @@ static PyTypeObject khmer_KCountingHash_Type CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object") = { PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ - "_khmer.KCountingHash", /*tp_name*/ + "_khmer.CountingHash", /*tp_name*/ sizeof(khmer_KCountingHash_Object), /*tp_basicsize*/ 0, /*tp_itemsize*/ (destructor)khmer_counting_dealloc, /*tp_dealloc*/ @@ -3121,7 +3121,7 @@ 
CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object") 0, /*tp_getattro*/ 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ "counting hash object", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ diff --git a/khmer/kfile.py b/khmer/kfile.py index 8fec14e7f8..9a01f590d9 100644 --- a/khmer/kfile.py +++ b/khmer/kfile.py @@ -13,6 +13,7 @@ import sys import errno from stat import S_ISBLK, S_ISFIFO +from khmer import khmer_args def check_input_files(file_path, force): @@ -111,8 +112,11 @@ def check_space(in_files, force, _testhook_free_space=None): sys.exit(1) -def check_space_for_hashtable(hash_size, force, _testhook_free_space=None): +def check_space_for_hashtable(args, hashtype, force, + _testhook_free_space=None): """Check we have enough size to write a hash table.""" + hash_size = khmer_args._calculate_tablesize(args, hashtype) + cwd = os.getcwd() dir_path = os.path.dirname(os.path.realpath(cwd)) target = os.statvfs(dir_path) diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index 436abd01f4..c1c634a453 100644 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -12,13 +12,17 @@ import os import argparse from argparse import _VersionAction + +import screed +import khmer from khmer import extract_countinghash_info, extract_hashbits_info from khmer import __version__ -import screed +from khmer.utils import print_error + DEFAULT_K = 32 DEFAULT_N_TABLES = 4 -DEFAULT_MIN_TABLESIZE = 1e6 +DEFAULT_MAX_TABLESIZE = 1e6 DEFAULT_N_THREADS = 1 @@ -45,24 +49,26 @@ def build_hash_args(descr=None, epilog=None, parser=None): parser = argparse.ArgumentParser(description=descr, epilog=epilog, formatter_class=ComboFormatter) - env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K) - env_n_tables = os.environ.get('KHMER_N_TABLES', DEFAULT_N_TABLES) - env_tablesize = os.environ.get('KHMER_MIN_TABLESIZE', - DEFAULT_MIN_TABLESIZE) - parser.add_argument('--version', 
action=_VersionStdErrAction, version='khmer {v}'.format(v=__version__)) parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') - parser.add_argument('--ksize', '-k', type=int, default=env_ksize, + parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K, help='k-mer size to use') + parser.add_argument('--n_tables', '-N', type=int, - default=env_n_tables, + default=DEFAULT_N_TABLES, help='number of k-mer counting tables to use') - parser.add_argument('--min-tablesize', '-x', type=float, - default=env_tablesize, - help='lower bound on tablesize to use') + + group = parser.add_mutually_exclusive_group() + group.add_argument('--max-tablesize', '-x', type=float, + default=DEFAULT_MAX_TABLESIZE, + help='upper bound on tablesize to use; overrides ' + + '--max-memory-usage/-M.') + group.add_argument('-M', '--max-memory-usage', type=float, + help='maximum amount of memory to use for data ' + + 'structure.') return parser @@ -70,7 +76,7 @@ def build_hash_args(descr=None, epilog=None, parser=None): def build_counting_args(descr=None, epilog=None): """Build an ArgumentParser with args for counting_hash based scripts.""" parser = build_hash_args(descr=descr, epilog=epilog) - parser.hashtype = 'counting' + parser.hashtype = 'countgraph' return parser @@ -79,7 +85,7 @@ def build_hashbits_args(descr=None, epilog=None, parser=None): """Build an ArgumentParser with args for hashbits based scripts.""" parser = build_hash_args(descr=descr, epilog=epilog, parser=parser) - parser.hashtype = 'hashbits' + parser.hashtype = 'nodegraph' return parser @@ -91,31 +97,24 @@ def add_loadhash_args(parser): class LoadAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): - env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K) - env_n_tables = os.environ.get('KHMER_N_TABLES', DEFAULT_N_TABLES) - env_tablesize = os.environ.get('KHMER_MIN_TABLESIZE', - DEFAULT_MIN_TABLESIZE) - - from khmer.utils import print_error - 
setattr(namespace, self.dest, values) - if getattr(namespace, 'ksize') != env_ksize or \ - getattr(namespace, 'n_tables') != env_n_tables or \ - getattr(namespace, 'min_tablesize') != env_tablesize: + if getattr(namespace, 'ksize') != DEFAULT_K or \ + getattr(namespace, 'n_tables') != DEFAULT_N_TABLES or \ + getattr(namespace, 'max_tablesize') != DEFAULT_MAX_TABLESIZE: if values: print_error(''' ** WARNING: You are loading a saved k-mer table from -{hashfile}, but have set k-mer table parameters. -Your values for ksize, n_tables, and tablesize -will be ignored.'''.format(hashfile=values)) +** {hashfile}, but have set k-mer table parameters. +** Your values for ksize, n_tables, and tablesize +** will be ignored.'''.format(hashfile=values)) if hasattr(parser, 'hashtype'): info = None - if parser.hashtype == 'hashbits': + if parser.hashtype == 'nodegraph': info = extract_hashbits_info( getattr(namespace, self.dest)) - elif parser.hashtype == 'counting': + elif parser.hashtype == 'countgraph': info = extract_countinghash_info( getattr(namespace, self.dest)) if info: @@ -124,52 +123,96 @@ def __call__(self, parser, namespace, values, option_string=None): n = info[2] setattr(namespace, 'ksize', K) setattr(namespace, 'n_tables', n) - setattr(namespace, 'min_tablesize', x) + setattr(namespace, 'max_tablesize', x) parser.add_argument('-l', '--loadtable', metavar="filename", default=None, help='load a precomputed k-mer table from disk', action=LoadAction) -def report_on_config(args, hashtype='counting'): +def _calculate_tablesize(args, hashtype, multiplier=1.0): + if hashtype not in ('countgraph', 'nodegraph'): + raise Exception("unknown graph type: %s" % (hashtype,)) + + if args.max_memory_usage: + if hashtype == 'countgraph': + tablesize = args.max_memory_usage / args.n_tables / \ + float(multiplier) + elif hashtype == 'nodegraph': + tablesize = 8. 
* args.max_memory_usage / args.n_tables / \ + float(multiplier) + else: + tablesize = args.max_tablesize + + return tablesize + + +def create_nodegraph(args, ksize=None, multiplier=1.0): + if ksize is None: + ksize = args.ksize + if ksize > 32: + print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n") + sys.exit(1) + + tablesize = _calculate_tablesize(args, 'nodegraph', multiplier=multiplier) + return khmer.Hashbits(ksize, tablesize, args.n_tables) + + +def create_countgraph(args, ksize=None, multiplier=1.0): + if ksize is None: + ksize = args.ksize + if ksize > 32: + print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n") + sys.exit(1) + + tablesize = _calculate_tablesize(args, 'countgraph', multiplier=multiplier) + return khmer.CountingHash(ksize, tablesize, args.n_tables) + + +def report_on_config(args, hashtype='countgraph'): """Print out configuration. Summarize the configuration produced by the command-line arguments made available by this module. """ from khmer.utils import print_error + if hashtype not in ('countgraph', 'nodegraph'): + raise Exception("unknown graph type: %s" % (hashtype,)) if args.quiet: return + tablesize = _calculate_tablesize(args, hashtype) + print_error("\nPARAMETERS:") print_error(" - kmer size = {0} \t\t(-k)".format(args.ksize)) print_error(" - n tables = {0} \t\t(-N)".format(args.n_tables)) print_error( - " - min tablesize = {0:5.2g} \t(-x)".format(args.min_tablesize) + " - max tablesize = {0:5.2g} \t(-x)".format(tablesize) ) print_error("") - if hashtype == 'counting': + if hashtype == 'countgraph': print_error( "Estimated memory usage is {0:.2g} bytes " - "(n_tables x min_tablesize)".format( - args.n_tables * args.min_tablesize)) - elif hashtype == 'hashbits': + "(n_tables x max_tablesize)".format( + args.n_tables * args.max_tablesize)) + elif hashtype == 'nodegraph': print_error( "Estimated memory usage is {0:.2g} bytes " - "(n_tables x min_tablesize / 8)".format(args.n_tables * - args.min_tablesize / 8) 
+ "(n_tables x max_tablesize / 8)".format(args.n_tables * + args.max_tablesize / 8) ) print_error("-" * 8) - if DEFAULT_MIN_TABLESIZE == args.min_tablesize and \ - not hasattr(args, 'loadtable'): - print_error( - "** WARNING: tablesize is default! " - "You absodefly want to increase this!\n** " - "Please read the docs!\n" - ) + if DEFAULT_MAX_TABLESIZE == args.max_tablesize and \ + not getattr(args, 'loadtable', None): + print_error('''\ + +** WARNING: tablesize is default! +** You probably want to increase this with -M/--max-memory-usage! +** Please read the docs! +''') def add_threading_args(parser): diff --git a/oxli/build_graph.py b/oxli/build_graph.py index 012a54f65c..43b5a337db 100644 --- a/oxli/build_graph.py +++ b/oxli/build_graph.py @@ -19,6 +19,7 @@ import sys import khmer +from khmer import khmer_args from khmer.khmer_args import (report_on_config, info, add_threading_args) from khmer.kfile import check_input_files, check_space from khmer.kfile import check_space_for_hashtable @@ -43,7 +44,7 @@ def build_parser(parser): def main(args): info('build-graph.py', ['graph', 'SeqAn']) - report_on_config(args, hashtype='hashbits') + report_on_config(args, hashtype='nodegraph') base = args.output_filename filenames = args.input_filenames @@ -51,8 +52,7 @@ def main(args): check_input_files(fname, args.force) check_space(args.input_filenames, args.force) - check_space_for_hashtable( - (float(args.n_tables * args.min_tablesize) / 8.), args.force) + check_space_for_hashtable(args, 'nodegraph', args.force) print('Saving k-mer presence table to %s' % base, file=sys.stderr) print('Loading kmers from sequences in %s' % @@ -63,8 +63,8 @@ def main(args): print('We WILL build the tagset (for partitioning/traversal).', file=sys.stderr) - print('making k-mer presence table', file=sys.stderr) - htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) + print('making nodegraph', file=sys.stderr) + htable = khmer_args.create_nodegraph(args) 
functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) diff --git a/sandbox/assembly-diff-2.py b/sandbox/assembly-diff-2.py index 883675b54e..1d39969eef 100755 --- a/sandbox/assembly-diff-2.py +++ b/sandbox/assembly-diff-2.py @@ -24,7 +24,7 @@ def main(): filename2 = sys.argv[2] uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w') - kh = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) + kh = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT) for n, record in enumerate(screed.open(filename1)): if n % 10000 == 0: print('...', filename1, n) diff --git a/sandbox/assembly-diff.py b/sandbox/assembly-diff.py index 003d54af65..e3fdee0052 100755 --- a/sandbox/assembly-diff.py +++ b/sandbox/assembly-diff.py @@ -26,9 +26,9 @@ def main(): uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w') paths = sys.argv[3] - kh1 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) + kh1 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT) kh1.consume_fasta(filename1) - kh2 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) + kh2 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT) kh2.consume_fasta(filename2) for record in screed.open(paths): diff --git a/sandbox/bloom-count-intersection.py b/sandbox/bloom-count-intersection.py index c50d823e66..71405d46e8 100755 --- a/sandbox/bloom-count-intersection.py +++ b/sandbox/bloom-count-intersection.py @@ -20,7 +20,7 @@ def main(): HT_SIZE = int(sys.argv[3]) # size of hashtable N_HT = int(sys.argv[4]) # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) n_unique = 0 for n, record in enumerate(fasta_iter(open(filename))): @@ -36,7 +36,7 @@ def main(): print('# of occupied bin:', ht.n_occupied()) filename2 = sys.argv[5] - ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht2 = khmer.Hashbits(K, HT_SIZE, N_HT) n_unique = 0 n_overlap = 0 for n, record in enumerate(fasta_iter(open(filename2))): diff --git a/sandbox/bloom-count.py b/sandbox/bloom-count.py index 3fed152daf..fc833cc8eb 100755 --- 
a/sandbox/bloom-count.py +++ b/sandbox/bloom-count.py @@ -20,7 +20,7 @@ def main(): HT_SIZE = int(sys.argv[3]) # size of hashtable N_HT = int(sys.argv[4]) # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) n_unique = 0 for n, record in enumerate(fasta_iter(open(filename))): diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py index 28f995f638..6686d732e1 100755 --- a/sandbox/build-sparse-graph.py +++ b/sandbox/build-sparse-graph.py @@ -20,7 +20,7 @@ def main(): K = int(sys.argv[1]) x = float(sys.argv[2]) - ht = khmer.new_hashbits(K, x, 4) + ht = khmer.Hashbits(K, x, 4) sparse_graph = gt.Graph() hashes = sparse_graph.new_vertex_property("long long") diff --git a/sandbox/calc-error-profile.py b/sandbox/calc-error-profile.py index 00cd013fdd..2cceb306fd 100755 --- a/sandbox/calc-error-profile.py +++ b/sandbox/calc-error-profile.py @@ -67,7 +67,7 @@ def main(): # build a small counting hash w/default parameters. In general there # should be no need to change these parameters. 
- ht = khmer.new_counting_hash(K, HASHSIZE, N_HT) + ht = khmer.CountingHash(K, HASHSIZE, N_HT) # initialize list to contain counts of errors by position positions = [0] * MAX_SEQ_LEN diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py index 512359497e..f02c0ea8d3 100755 --- a/sandbox/collect-reads.py +++ b/sandbox/collect-reads.py @@ -20,6 +20,7 @@ import sys import textwrap import khmer +from khmer import khmer_args from khmer.khmer_args import build_counting_args, report_on_config, info from khmer.kfile import check_input_files, check_space from khmer.kfile import check_space_for_hashtable @@ -76,15 +77,15 @@ def main(): check_input_files(name, False) check_space(args.input_sequence_filename, False) - check_space_for_hashtable(args.n_tables * args.min_tablesize, False) + check_space_for_hashtable(args, 'countgraph', False) print('Saving k-mer counting table to %s' % base) print('Loading sequences from %s' % repr(filenames)) if args.output: print('Outputting sequences to', args.output) - print('making k-mer counting table') - htable = khmer.new_counting_hash(args.ksize, args.min_tablesize) + print('making countgraph', file=sys.stderr) + htable = khmer_args.create_countgraph(args) htable.set_use_bigcount(args.bigcount) total_coverage = 0. @@ -132,7 +133,8 @@ def main(): info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. 
- fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2) + fp_rate = khmer.calc_expected_collisions(htable, False, + max_false_pos=.2) print('fp rate estimated to be %1.3f' % fp_rate) print('fp rate estimated to be %1.3f' % fp_rate, file=info_fp) diff --git a/sandbox/collect-variants.py b/sandbox/collect-variants.py index bb81dc58e1..db368f0e78 100755 --- a/sandbox/collect-variants.py +++ b/sandbox/collect-variants.py @@ -64,7 +64,7 @@ def main(): ht = khmer.load_counting_hash(args.loadhash) else: print('making hashtable') - ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) + ht = khmer.CountingHash(K, HT_SIZE, N_HT) aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta) diff --git a/sandbox/correct-errors.py b/sandbox/correct-errors.py index b82e172ac2..71c6890ad7 100755 --- a/sandbox/correct-errors.py +++ b/sandbox/correct-errors.py @@ -92,7 +92,7 @@ def main(): NORMALIZE_LIMIT = args.normalize_to print('making hashtable') - ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) + ht = khmer.CountingHash(K, HT_SIZE, N_HT) aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta) diff --git a/sandbox/fasta-to-abundance-hist.py b/sandbox/fasta-to-abundance-hist.py index 132f748522..b4776cf68a 100755 --- a/sandbox/fasta-to-abundance-hist.py +++ b/sandbox/fasta-to-abundance-hist.py @@ -18,9 +18,7 @@ def main(): n_seq_kept = len(files) * [0] print('loading ht') - ht = khmer.new_counting_hash(1, 1, 1) - - ht.load(sys.argv[1]) + ht = khmer.load_counting_hash(sys.argv[1]) for i, infile in enumerate(files): print('outputting', infile + '.freq') diff --git a/sandbox/find-high-abund-kmers.py b/sandbox/find-high-abund-kmers.py index beb3ac9a4e..db43686997 100755 --- a/sandbox/find-high-abund-kmers.py +++ b/sandbox/find-high-abund-kmers.py @@ -13,7 +13,7 @@ import sys import screed import khmer -from khmer.khmer_args import build_counting_args, DEFAULT_MIN_TABLESIZE +from khmer.khmer_args import build_counting_args, DEFAULT_MAX_TABLESIZE 
DEFAULT_LOWER_CUTOFF = 2000 DEFAULT_UPPER_CUTOFF = 65535 @@ -34,7 +34,7 @@ def main(): args = parser.parse_args() if not args.quiet: - if args.min_hashsize == DEFAULT_MIN_HASHSIZE: + if args.min_hashsize == DEFAULT_MAX_TABLESIZE: print("** WARNING: hashsize is default! " \ "You absodefly want to increase this!\n** " \ "Please read the docs!", file=sys.stderr) @@ -65,7 +65,7 @@ def main(): ### print('making hashtable') - ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) + ht = khmer.CountingHash(K, HT_SIZE, N_HT) ht.set_use_bigcount(True) print('consuming input', input) diff --git a/sandbox/graph-size.py b/sandbox/graph-size.py index cc0d9fb1b3..41cdf07b88 100755 --- a/sandbox/graph-size.py +++ b/sandbox/graph-size.py @@ -42,7 +42,7 @@ def main(): print('--') print('creating ht') - ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) + ht = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT) print('eating fa', infile) total_reads, n_consumed = ht.consume_fasta(infile) outfp = open(outfile, 'w') diff --git a/sandbox/normalize-by-median-pct.py b/sandbox/normalize-by-median-pct.py index 1888047625..f7dad25c7f 100755 --- a/sandbox/normalize-by-median-pct.py +++ b/sandbox/normalize-by-median-pct.py @@ -21,7 +21,7 @@ import os import khmer -from khmer.khmer_args import build_counting_args, DEFAULT_MIN_TABLESIZE +from khmer.khmer_args import build_counting_args, DEFAULT_MAX_TABLESIZE import argparse DEFAULT_DESIRED_COVERAGE = 5 @@ -58,7 +58,7 @@ def main(): args = parser.parse_args() if not args.quiet: - if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash: + if args.min_hashsize == DEFAULT_MAX_TABLESIZE and not args.loadhash: print("** WARNING: hashsize is default! 
You absodefly want to increase this!\n** Please read the docs!", file=sys.stderr) print('\nPARAMETERS:', file=sys.stderr) @@ -88,7 +88,7 @@ def main(): ht = khmer.load_counting_hash(args.loadhash) else: print('making hashtable') - ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) + ht = khmer.CountingHash(K, HT_SIZE, N_HT) total = 0 discarded = 0 diff --git a/sandbox/print-stoptags.py b/sandbox/print-stoptags.py index f7633b0db5..1e59b44f53 100755 --- a/sandbox/print-stoptags.py +++ b/sandbox/print-stoptags.py @@ -13,7 +13,7 @@ def main(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer.Hashbits(32, 1, 1) ht.load_stop_tags(sys.argv[1]) ht.print_stop_tags(os.path.basename(sys.argv[1]) + '.txt') diff --git a/sandbox/print-tagset.py b/sandbox/print-tagset.py index 8a093071e4..c8612784a1 100755 --- a/sandbox/print-tagset.py +++ b/sandbox/print-tagset.py @@ -14,7 +14,7 @@ def main(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer.Hashbits(32, 1, 1) ht.load_tagset(sys.argv[1]) print('loaded!') ht.print_tagset(os.path.basename(sys.argv[1]) + '.txt') diff --git a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py index 422024db23..40aee14df8 100755 --- a/sandbox/saturate-by-median.py +++ b/sandbox/saturate-by-median.py @@ -21,7 +21,7 @@ import textwrap from khmer.khmer_args import (build_counting_args, add_loadhash_args, - report_on_config, info) + report_on_config, info, create_countgraph) import argparse from khmer.kfile import (check_space, check_space_for_hashtable, check_valid_file_exists) @@ -187,7 +187,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, False) if args.savetable: - check_space_for_hashtable(args.n_tables * args.min_tablesize, False) + check_space_for_hashtable(args, 'countgraph', False) # list to save error files along with throwing exceptions if args.force: @@ -197,9 +197,8 @@ def main(): # pylint: 
disable=too-many-branches,too-many-statements print('loading k-mer counting table from', args.loadtable) htable = khmer.load_counting_hash(args.loadtable) else: - print('making k-mer counting table') - htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, - args.n_tables) + print('making countgraph') + htable = create_countgraph(args) total = 0 discarded = 0 diff --git a/sandbox/stoptags-by-position.py b/sandbox/stoptags-by-position.py index 5431261e2f..1b92fa621b 100755 --- a/sandbox/stoptags-by-position.py +++ b/sandbox/stoptags-by-position.py @@ -14,7 +14,7 @@ def main(): - ht = khmer.new_hashbits(K, 1, 1) + ht = khmer.Hashbits(K, 1, 1) x = [0] * 255 y = [0] * 255 diff --git a/sandbox/subset-report.py b/sandbox/subset-report.py index bea112420c..fe230e6238 100755 --- a/sandbox/subset-report.py +++ b/sandbox/subset-report.py @@ -17,7 +17,7 @@ def main(): subset_filenames = sys.argv[1:] - ht = khmer.new_hashbits(K, 1, 1) + ht = khmer.Hashbits(K, 1, 1) for filename in subset_filenames: print('--') print('partition map:', filename) diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py index 426bebe52b..6b44ea6235 100755 --- a/sandbox/sweep-files.py +++ b/sandbox/sweep-files.py @@ -108,7 +108,7 @@ def main(): if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE - report_on_config(args, hashtype='hashbits') + report_on_config(args, hashtype='nodegraph') K = args.ksize HT_SIZE = args.min_tablesize diff --git a/sandbox/sweep-out-reads-with-contigs.py b/sandbox/sweep-out-reads-with-contigs.py index 450057f5bd..f053f6280f 100755 --- a/sandbox/sweep-out-reads-with-contigs.py +++ b/sandbox/sweep-out-reads-with-contigs.py @@ -22,7 +22,7 @@ def main(): outfile = sys.argv[3] # create a hashbits data structure - ht = khmer.new_hashbits(K, 1, 1) + ht = khmer.Hashbits(K, 1, 1) # tag every k-mer in the contigs ht._set_tag_density(0) diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py index 706fa726a5..fbf2ccb4a5 100755 --- a/sandbox/sweep-reads.py +++ 
b/sandbox/sweep-reads.py @@ -50,7 +50,7 @@ DEFAULT_OUT_PREF = 'reads' DEFAULT_RANGE = -1 -MIN_HSIZE = 4e7 +MAX_HSIZE = 4e7 MIN_KSIZE = 21 @@ -208,15 +208,15 @@ def main(): parser = get_parser() args = parser.parse_args() - if args.min_tablesize < MIN_HSIZE: - args.min_tablesize = MIN_HSIZE + if args.max_tablesize < MAX_HSIZE: + args.max_tablesize = MAX_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE - report_on_config(args, hashtype='hashbits') + report_on_config(args, hashtype='nodegraph') K = args.ksize - HT_SIZE = args.min_tablesize + HT_SIZE = args.max_tablesize N_HT = args.n_tables traversal_range = args.traversal_range diff --git a/sandbox/sweep-reads2.py b/sandbox/sweep-reads2.py index 6ad2af592d..38d6e6a1a9 100755 --- a/sandbox/sweep-reads2.py +++ b/sandbox/sweep-reads2.py @@ -20,7 +20,7 @@ import khmer import os.path import screed -from khmer.khmer_args import (build_hashbits_args, DEFAULT_MIN_TABLESIZE) +from khmer.khmer_args import (build_hashbits_args, DEFAULT_MAX_TABLESIZE) def main(): @@ -31,7 +31,7 @@ def main(): args = parser.parse_args() if not args.quiet: - if args.min_hashsize == DEFAULT_MIN_HASHSIZE: + if args.min_hashsize == DEFAULT_MAX_TABLESIZE: print("** WARNING: hashsize is default! 
" \ "You absodefly want to increase this!\n** " \ "Please read the docs!", file=sys.stderr) @@ -58,7 +58,7 @@ def main(): outfp = open(outfile, 'w') # create a hashbits data structure - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) # load contigs, connect into N partitions print('loading input reads from', inp) diff --git a/sandbox/sweep-reads3.py b/sandbox/sweep-reads3.py index 276c16a083..c0c5329c16 100755 --- a/sandbox/sweep-reads3.py +++ b/sandbox/sweep-reads3.py @@ -20,7 +20,7 @@ import os.path import screed import khmer -from khmer.khmer_args import (build_hashbits_args, DEFAULT_MIN_TABLESIZE) +from khmer.khmer_args import (build_hashbits_args, DEFAULT_MAX_TABLESIZE) def output_single(r): @@ -38,7 +38,7 @@ def main(): args = parser.parse_args() if not args.quiet: - if args.min_hashsize == DEFAULT_MIN_HASHSIZE: + if args.min_hashsize == DEFAULT_MAX_TABLESIZE: print("** WARNING: hashsize is default! " \ "You absodefly want to increase this!\n** " \ "Please read the docs!", file=sys.stderr) @@ -64,7 +64,7 @@ def main(): query_list = [] for n, inp_name in enumerate(inputlist): # create a hashbits data structure - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) outfile = os.path.basename(inp_name) + '.sweep3' outfp = open(outfile, 'w') diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py index 5bc901b56b..10d2109c77 100755 --- a/scripts/abundance-dist-single.py +++ b/scripts/abundance-dist-single.py @@ -22,6 +22,7 @@ import khmer import threading import textwrap +from khmer import khmer_args from khmer.khmer_args import (build_counting_args, add_threading_args, report_on_config, info) from khmer.kfile import (check_input_files, check_space, @@ -79,8 +80,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches check_input_files(args.input_sequence_filename, args.force) check_space([args.input_sequence_filename], args.force) if args.savetable: - 
check_space_for_hashtable(args.n_tables * args.min_tablesize, - args.force) + check_space_for_hashtable(args, 'countgraph', args.force) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): @@ -95,14 +95,12 @@ def main(): # pylint: disable=too-many-locals,too-many-branches hist_fp_csv.writerow(['abundance', 'count', 'cumulative', 'cumulative_fraction']) - print('making k-mer counting table', file=sys.stderr) - counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize, - args.n_tables) + print('making countgraph', file=sys.stderr) + counting_hash = khmer_args.create_countgraph(args, multiplier=1.1) counting_hash.set_use_bigcount(args.bigcount) print('building k-mer tracking table', file=sys.stderr) - tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize, - args.n_tables) + tracking = khmer_args.create_nodegraph(args, multiplier=1.1) print('kmer_size:', counting_hash.ksize(), file=sys.stderr) print('k-mer counting table sizes:', diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py index 8a95a273b2..e09f95895f 100755 --- a/scripts/annotate-partitions.py +++ b/scripts/annotate-partitions.py @@ -67,7 +67,7 @@ def main(): ksize = args.ksize filenames = args.input_filenames - htable = khmer.new_hashbits(ksize, 1, 1) + htable = khmer.Hashbits(ksize, 1, 1) partitionmap_file = args.graphbase + '.pmap.merged' diff --git a/scripts/count-overlap.py b/scripts/count-overlap.py index 9d4cec8dd5..a8c715d161 100755 --- a/scripts/count-overlap.py +++ b/scripts/count-overlap.py @@ -23,13 +23,10 @@ import csv import khmer import textwrap +from khmer import khmer_args from khmer.kfile import check_input_files, check_space from khmer.khmer_args import (build_hashbits_args, report_on_config, info) -DEFAULT_K = 32 -DEFAULT_N_HT = 4 -DEFAULT_HASHSIZE = 1e6 - def get_parser(): epilog = """ @@ -58,7 +55,7 @@ def get_parser(): def main(): info('count-overlap.py', ['counting']) args = get_parser().parse_args() - 
report_on_config(args, hashtype='hashbits') + report_on_config(args, hashtype='nodegraph') for infile in [args.ptfile, args.fafile]: check_input_files(infile, args.force) @@ -76,7 +73,7 @@ def main(): # write headers: f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer']) - ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables) + ht2 = khmer_args.create_nodegraph(args, ksize=kmer_size) (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1) diff --git a/scripts/do-partition.py b/scripts/do-partition.py index b99d8e56c4..f5021451ec 100755 --- a/scripts/do-partition.py +++ b/scripts/do-partition.py @@ -22,6 +22,7 @@ import os.path import os import textwrap +from khmer import khmer_args from khmer.khmer_args import (build_hashbits_args, report_on_config, info, add_threading_args) import glob @@ -113,7 +114,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements info('do-partition.py', ['graph']) args = get_parser().parse_args() - report_on_config(args, hashtype='hashbits') + report_on_config(args, hashtype='nodegraph') for infile in args.input_filenames: check_input_files(infile, args.force) @@ -131,8 +132,8 @@ def main(): # pylint: disable=too-many-locals,too-many-statements # load-graph - print('making k-mer presence table', file=sys.stderr) - htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) + print('making nodegraph', file=sys.stderr) + htable = khmer_args.create_nodegraph(args) for _, filename in enumerate(args.input_filenames): print('consuming input', filename, file=sys.stderr) @@ -210,7 +211,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements print('loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]), file=sys.stderr) - htable = khmer.new_hashbits(args.ksize, 1, 1) + htable = khmer.Hashbits(args.ksize, 1, 1) for pmap_file in pmap_files: print('merging', pmap_file, file=sys.stderr) diff --git a/scripts/filter-abund-single.py 
b/scripts/filter-abund-single.py index b4d2215e9d..b22a4946bd 100755 --- a/scripts/filter-abund-single.py +++ b/scripts/filter-abund-single.py @@ -24,6 +24,7 @@ import threading import textwrap from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader +from khmer import khmer_args from khmer.khmer_args import (build_counting_args, report_on_config, add_threading_args, info) from khmer.kfile import (check_input_files, check_space, @@ -70,13 +71,11 @@ def main(): check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) if args.savetable: - check_space_for_hashtable( - args.n_tables * args.min_tablesize, args.force) + check_space_for_hashtable(args, 'countgraph', args.force) report_on_config(args) - print('making k-mer counting table', file=sys.stderr) - htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, - args.n_tables) + print('making countgraph', file=sys.stderr) + htable = khmer_args.create_countgraph(args) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile) diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py index fe2fb6d001..d7e87f2255 100755 --- a/scripts/filter-stoptags.py +++ b/scripts/filter-stoptags.py @@ -65,7 +65,7 @@ def main(): check_space(infiles, args.force) print('loading stop tags, with K', args.ksize, file=sys.stderr) - htable = khmer.new_hashbits(args.ksize, 1, 1) + htable = khmer.Hashbits(args.ksize, 1, 1) htable.load_stop_tags(stoptags) def process_fn(record): diff --git a/scripts/find-knots.py b/scripts/find-knots.py index b37fd687a6..d224e064e7 100755 --- a/scripts/find-knots.py +++ b/scripts/find-knots.py @@ -22,7 +22,9 @@ import khmer import sys from khmer.kfile import check_input_files, check_space -from khmer.khmer_args import info +from khmer import khmer_args +from khmer.khmer_args import (build_counting_args, info, add_loadhash_args, + report_on_config) # counting hash parameters. 
DEFAULT_COUNTING_HT_SIZE = 3e6 # number of bytes @@ -62,21 +64,12 @@ def get_parser(): process, and if you eliminate the already-processed pmap files, you can continue where you left off. """ - parser = argparse.ArgumentParser( - description="Find all highly connected k-mers.", - epilog=textwrap.dedent(epilog), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('--n_tables', '-N', type=int, - default=DEFAULT_COUNTING_HT_N, - help='number of k-mer counting tables to use') - parser.add_argument('--min-tablesize', '-x', type=float, - default=DEFAULT_COUNTING_HT_SIZE, help='lower bound on' - ' the size of the k-mer counting table(s)') + parser = build_counting_args( + descr="Find all highly connected k-mers.", + epilog=textwrap.dedent(epilog)) + parser.add_argument('graphbase', help='Basename for the input and output ' 'files.') - parser.add_argument('--version', action='version', version='%(prog)s ' + - khmer.__version__) parser.add_argument('-f', '--force', default=False, action='store_true', help='Continue past warnings') return parser @@ -124,8 +117,7 @@ def main(): # create counting hash ksize = htable.ksize() - counting = khmer.new_counting_hash(ksize, args.min_tablesize, - args.n_tables) + counting = khmer_args.create_countgraph(args, ksize=ksize) # load & merge for index, subset_file in enumerate(pmap_files): diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py index 833901aa28..f907c36723 100755 --- a/scripts/load-into-counting.py +++ b/scripts/load-into-counting.py @@ -20,6 +20,7 @@ import threading import textwrap import khmer +from khmer import khmer_args from khmer.khmer_args import build_counting_args, report_on_config, info,\ add_threading_args from khmer.kfile import check_file_writable @@ -84,7 +85,7 @@ def main(): check_input_files(name, args.force) check_space(args.input_sequence_filename, args.force) - check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) + 
check_space_for_hashtable(args, 'countgraph', args.force) check_file_writable(base) check_file_writable(base + ".info") @@ -97,9 +98,8 @@ def main(): if os.path.exists(base + '.info'): os.remove(base + '.info') - print('making k-mer counting table', file=sys.stderr) - htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, - args.n_tables) + print('making countgraph', file=sys.stderr) + htable = khmer_args.create_countgraph(args) htable.set_use_bigcount(args.bigcount) filename = None @@ -124,9 +124,9 @@ def main(): thread.join() if index > 0 and index % 10 == 0: - check_space_for_hashtable(args.n_tables * args.min_tablesize, - args.force) + check_space_for_hashtable(args, 'countgraph', args.force) print('mid-save', base, file=sys.stderr) + htable.save(base) with open(base + '.info', 'a') as info_fh: print('through', filename, file=info_fh) diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py index 2f31617e37..29a08ef7d7 100755 --- a/scripts/make-initial-stoptags.py +++ b/scripts/make-initial-stoptags.py @@ -16,6 +16,7 @@ import sys import textwrap import khmer +from khmer import khmer_args from khmer.khmer_args import (build_counting_args, info) from khmer.kfile import check_input_files, check_space @@ -96,8 +97,7 @@ def main(): htable.load_tagset(graphbase + '.tagset') ksize = htable.ksize() - counting = khmer.new_counting_hash(ksize, args.min_tablesize, - args.n_tables) + counting = khmer_args.create_countgraph(args) # divide up into SUBSET_SIZE fragments divvy = htable.divide_tags_into_subsets(args.subset_size) diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py index 5ca4941d6b..c77d822b85 100755 --- a/scripts/merge-partitions.py +++ b/scripts/merge-partitions.py @@ -61,7 +61,7 @@ def main(): (len(pmap_files), pmap_files[0]), file=sys.stderr) ksize = args.ksize - htable = khmer.new_hashbits(ksize, 1, 1) + htable = khmer.Hashbits(ksize, 1, 1) for _ in pmap_files: check_input_files(_, args.force) diff --git 
a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 908c1a45ab..3bb2ba7152 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -24,6 +24,7 @@ import os import khmer import textwrap +from khmer import khmer_args from contextlib import contextmanager from khmer.khmer_args import (build_counting_args, add_loadhash_args, @@ -273,8 +274,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: - check_space_for_hashtable( - args.n_tables * args.min_tablesize, args.force) + check_space_for_hashtable(args, 'countgraph', args.force) # load or create counting table. if args.loadtable: @@ -282,9 +282,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) else: - print('making k-mer counting table', file=sys.stderr) - htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, - args.n_tables) + print('making countgraph', file=sys.stderr) + htable = khmer_args.create_countgraph(args) input_filename = None diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 6bab0a4434..741b181775 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -25,6 +25,8 @@ import argparse from screed import Record +from khmer import khmer_args + from khmer.khmer_args import (build_counting_args, info, add_loadhash_args, report_on_config) from khmer.utils import write_record, write_record_pair, broken_paired_reader @@ -124,21 +126,18 @@ def main(): check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: - check_space_for_hashtable( - args.n_tables * args.min_tablesize, args.force) - - K = args.ksize - - CUTOFF = args.cutoff - NORMALIZE_LIMIT = args.normalize_to + check_space_for_hashtable(args, 'countgraph', args.force) if args.loadtable: - 
print('loading k-mer counting table from', - args.loadtable, file=sys.stderr) + print('loading countgraph from', args.loadtable, file=sys.stderr) ct = khmer.load_counting_hash(args.loadtable) else: - print('making k-mer counting table', file=sys.stderr) - ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables) + print('making countgraph', file=sys.stderr) + ct = khmer_args.create_countgraph(args) + + K = ct.ksize() + CUTOFF = args.cutoff + NORMALIZE_LIMIT = args.normalize_to tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; ' diff --git a/tests/test_counting_hash.py b/tests/test_counting_hash.py index 54ef426d9d..5e08abaf7e 100644 --- a/tests/test_counting_hash.py +++ b/tests/test_counting_hash.py @@ -43,7 +43,7 @@ def teardown(): class Test_CountingHash(object): def setup(self): - self.hi = khmer.CountingHash(12, PRIMES_1m) + self.hi = khmer._CountingHash(12, PRIMES_1m) def test_failed_get(self): GG = 'G' * 12 # forward_hash: 11184810 @@ -124,7 +124,7 @@ def test_collision_3(self): def test_get_raw_tables(): - ht = khmer.new_counting_hash(20, 1e5, 4) + ht = khmer.CountingHash(20, 1e5, 4) tables = ht.get_raw_tables() for size, table in zip(ht.hashsizes(), tables): @@ -133,7 +133,7 @@ def test_get_raw_tables(): def test_get_raw_tables_view(): - ht = khmer.new_counting_hash(20, 1e5, 4) + ht = khmer.CountingHash(20, 1e5, 4) tables = ht.get_raw_tables() for tab in tables: assert sum(tab.tolist()) == 0 @@ -145,7 +145,7 @@ def test_get_raw_tables_view(): @attr('huge') def test_toobig(): try: - ct = khmer.new_counting_hash(30, 1e13, 1) + ct = khmer.CountingHash(30, 1e13, 1) assert 0, "this should fail" except MemoryError as err: print(str(err)) @@ -155,7 +155,7 @@ def test_3_tables(): x = list(PRIMES_1m) x.append(1000005) - hi = khmer.CountingHash(12, x) + hi = khmer._CountingHash(12, x) GG = 'G' * 12 # forward_hash: 11184810 assert khmer.forward_hash(GG, 12) == 11184810 @@ -186,7 +186,7 @@ def test_3_tables(): def 
test_simple_median(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") (median, average, stddev) = hi.get_median_count("AAAAAA") @@ -225,7 +225,7 @@ def test_simple_median(): def test_median_too_short(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") try: @@ -236,7 +236,7 @@ def test_median_too_short(): def test_median_at_least(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") assert hi.median_at_least("AAAAAA", 1) @@ -261,7 +261,7 @@ def test_median_at_least(): def test_median_at_least_single_gt(): K = 20 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 1e6, 2) kmers = ['ATCGATCGATCGATCGATCG', 'GTACGTACGTACGTACGTAC', @@ -274,7 +274,7 @@ def test_median_at_least_single_gt(): def test_median_at_least_single_lt(): K = 20 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 1e6, 2) kmers = ['ATCGATCGATCGATCGATCG', 'GTACGTACGTACGTACGTAC', @@ -288,7 +288,7 @@ def test_median_at_least_single_lt(): def test_median_at_least_odd_gt(): # test w/odd number of k-mers K = 20 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 1e6, 2) seqs = ['ATCGATCGATCGATCGATCGCC', 'GTACGTACGTACGTACGTACCC', @@ -301,7 +301,7 @@ def test_median_at_least_odd_gt(): def test_median_at_least_odd_lt(): K = 20 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 1e6, 2) seqs = ['ATCGATCGATCGATCGATCGCC', 'GTACGTACGTACGTACGTACCC', @@ -315,7 +315,7 @@ def test_median_at_least_odd_lt(): # Test median with even number of k-mers def test_median_at_least_even_gt(): K = 20 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 1e6, 2) seqs = ['ATCGATCGATCGATCGATCGCCC', 'GTACGTACGTACGTACGTACCCC', @@ -328,7 +328,7 @@ def test_median_at_least_even_gt(): def test_median_at_least_even_lt(): K = 20 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 
1e6, 2) seqs = ['ATCGATCGATCGATCGATCGCCC', 'GTACGTACGTACGTACGTACCCC', @@ -342,7 +342,7 @@ def test_median_at_least_even_lt(): def test_median_at_least_comp(): K = 20 C = 4 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 1e6, 2) seqs = ['ATCGATCGATCGATCGATCGCCC', 'GTACGTACGTACGTACGTACCCC', @@ -358,7 +358,7 @@ def test_median_at_least_comp(): def test_median_at_least_exception(): - ht = khmer.new_counting_hash(20, 1e6, 2) + ht = khmer.CountingHash(20, 1e6, 2) try: ht.median_at_least('ATGGCTGATCGAT', 1) assert 0, "should have thrown ValueError" @@ -367,25 +367,25 @@ def test_median_at_least_exception(): def test_simple_kadian(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1 - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") hi.consume("ACTGCTATCTCTAGAcCTATG") # ---------------^ x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") assert x == 2, x - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") hi.consume("ACTGCTATCTCTAGAcCTATG") # ---------------^---^ x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") assert x == 2 - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") hi.consume("ACTGCTATCTCTAGtGCTAcG") # --------------^^---^ @@ -394,11 +394,11 @@ def test_simple_kadian(): def test_simple_kadian_2(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1 - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") # hi.consume("ACaGCTATCTCTAGAGCTATG") hi.consume("ACAGCTATCTCTAGAGCTATG") @@ -406,7 +406,7 @@ def test_simple_kadian_2(): x = 
hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") assert x == 2, x - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") # hi.consume("ACaGCTATCTCTAGAcCTATG") hi.consume("ACAGCTATCTCTAGACCTATG") @@ -414,7 +414,7 @@ def test_simple_kadian_2(): x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") assert x == 1, x - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") # hi.consume("ACTGCTATCgCTAGAGCTATG") hi.consume("ACTGCTATCGCTAGAGCTATG") @@ -424,11 +424,11 @@ def test_simple_kadian_2(): def test_2_kadian(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 1 - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") # hi.consume("ACTGCTATCTCTAGAcCTATG") hi.consume("ACTGCTATCTCTAGACCTATG") @@ -436,14 +436,14 @@ def test_2_kadian(): x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) assert x == 2, x - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") # hi.consume("ACTGCTATCTCTAGAcCTAtG") hi.consume("ACTGCTATCTCTAGACCTATG") # ---------------^---^ assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 2 - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") # hi.consume("ACTGCTATCTCTACtcCTAtG") hi.consume("ACTGCTATCTCTACTCCTATG") @@ -451,7 +451,7 @@ def test_2_kadian(): x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) assert x == 2, x - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("ACTGCTATCTCTAGAGCTATG") # hi.consume("ACTGCTgTCTCTACtcCTAtG") hi.consume("ACTGCTGTCTCTACTCCTATG") @@ -461,7 +461,7 @@ def test_2_kadian(): def test_get_kmer_counts_too_short(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = 
khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") counts = hi.get_kmer_counts("A") @@ -469,7 +469,7 @@ def test_get_kmer_counts_too_short(): def test_get_kmer_hashes_too_short(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") hashes = hi.get_kmer_hashes("A") @@ -477,7 +477,7 @@ def test_get_kmer_hashes_too_short(): def test_get_kmers_too_short(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") kmers = hi.get_kmers("A") @@ -485,7 +485,7 @@ def test_get_kmers_too_short(): def test_get_kmer_counts(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") counts = hi.get_kmer_counts("AAAAAA") @@ -522,7 +522,7 @@ def test_get_kmer_counts(): def test_get_kmer_hashes(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume("AAAAAA") hashes = hi.get_kmer_hashes("AAAAAA") @@ -559,7 +559,7 @@ def test_get_kmer_hashes(): def test_get_kmers(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) kmers = hi.get_kmers("AAAAAA") assert kmers == ["AAAAAA"] @@ -574,9 +574,9 @@ def do_test(ctfile): inpath = utils.get_test_data('random-20-a.fa') savepath = utils.get_temp_filename(ctfile) - sizes = khmer.get_n_primes_above_x(1, 2**31) + sizes = khmer.get_n_primes_near_x(1, 2**31 + 1000) - orig = khmer.CountingHash(12, sizes) + orig = khmer._CountingHash(12, sizes) orig.consume_fasta(inpath) orig.save(savepath) @@ -598,11 +598,11 @@ def test_save_load(): sizes = list(PRIMES_1m) sizes.append(1000005) - hi = khmer.CountingHash(12, sizes) + hi = khmer._CountingHash(12, sizes) hi.consume_fasta(inpath) hi.save(savepath) - ht = khmer.CountingHash(12, sizes) + ht = khmer._CountingHash(12, sizes) try: ht.load(savepath) except IOError as err: @@ -625,7 +625,7 @@ def test_load_truncated(): sizes = khmer.get_n_primes_near_x(3, 200) - hi = khmer.CountingHash(12, sizes) + hi = 
khmer._CountingHash(12, sizes) hi.consume_fasta(inpath) hi.save(savepath) @@ -652,7 +652,7 @@ def test_load_gz(): sizes.append(1000005) # save uncompressed hashtable. - hi = khmer.CountingHash(12, sizes) + hi = khmer._CountingHash(12, sizes) hi.consume_fasta(inpath) hi.save(savepath) @@ -664,7 +664,7 @@ def test_load_gz(): in_file.close() # load compressed hashtable. - ht = khmer.CountingHash(12, sizes) + ht = khmer._CountingHash(12, sizes) try: ht.load(loadpath) except IOError as err: @@ -687,11 +687,11 @@ def test_save_load_gz(): sizes = list(PRIMES_1m) sizes.append(1000005) - hi = khmer.CountingHash(12, sizes) + hi = khmer._CountingHash(12, sizes) hi.consume_fasta(inpath) hi.save(savepath) - ht = khmer.CountingHash(12, sizes) + ht = khmer._CountingHash(12, sizes) try: ht.load(savepath) except IOError as err: @@ -708,7 +708,7 @@ def test_save_load_gz(): def test_trim_full(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume(DNA) hi.consume(DNA) @@ -718,7 +718,7 @@ def test_trim_full(): def test_trim_short(): - hi = khmer.new_counting_hash(6, 1e6, 2) + hi = khmer.CountingHash(6, 1e6, 2) hi.consume(DNA) hi.consume(DNA[:50]) @@ -730,7 +730,7 @@ def test_trim_short(): def test_find_spectral_error_positions_1(): - hi = khmer.new_counting_hash(8, 1e6, 2) + hi = khmer.CountingHash(8, 1e6, 2) hi.consume(DNA) hi.consume(DNA[:30]) @@ -743,7 +743,7 @@ def test_find_spectral_error_positions_1(): def test_find_spectral_error_positions_2(): - hi = khmer.new_counting_hash(8, 1e6, 2) + hi = khmer.CountingHash(8, 1e6, 2) hi.consume(DNA) hi.consume(DNA) @@ -753,7 +753,7 @@ def test_find_spectral_error_positions_2(): def test_find_spectral_error_positions_6(): - hi = khmer.new_counting_hash(8, 1e6, 2) + hi = khmer.CountingHash(8, 1e6, 2) hi.consume(DNA) hi.consume(DNA[1:]) @@ -766,7 +766,7 @@ def test_find_spectral_error_positions_6(): def test_find_spectral_error_positions_4(): - hi = khmer.new_counting_hash(8, 1e6, 2) + hi = 
khmer.CountingHash(8, 1e6, 2) hi.consume(DNA) @@ -775,7 +775,7 @@ def test_find_spectral_error_positions_4(): def test_find_spectral_error_positions_5(): - hi = khmer.new_counting_hash(8, 1e6, 2) + hi = khmer.CountingHash(8, 1e6, 2) hi.consume(DNA) hi.consume(DNA[:10]) @@ -787,7 +787,7 @@ def test_find_spectral_error_positions_5(): def test_find_spectral_error_locs7(): K = 8 - hi = khmer.new_counting_hash(K, 1e6, 2) + hi = khmer.CountingHash(K, 1e6, 2) hi.consume(DNA) hi.consume(DNA[K:]) @@ -800,7 +800,7 @@ def test_find_spectral_error_locs7(): def test_find_spectral_error_positions_err(): - hi = khmer.new_counting_hash(8, 1e6, 2) + hi = khmer.CountingHash(8, 1e6, 2) try: posns = hi.find_spectral_error_positions(DNA[:6], 1) @@ -817,7 +817,7 @@ def test_find_spectral_error_positions_err(): def test_maxcount(): # hashtable should saturate at some point so as not to overflow counter - kh = khmer.new_counting_hash(4, 4 ** 4, 4) + kh = khmer.CountingHash(4, 4 ** 4, 4) kh.set_use_bigcount(False) last_count = None @@ -835,7 +835,7 @@ def test_maxcount(): def test_maxcount_with_bigcount(): # hashtable should not saturate, if use_bigcount is set. - kh = khmer.new_counting_hash(4, 4 ** 4, 4) + kh = khmer.CountingHash(4, 4 ** 4, 4) kh.set_use_bigcount(True) last_count = None @@ -853,7 +853,7 @@ def test_maxcount_with_bigcount(): def test_maxcount_with_bigcount_save(): # hashtable should not saturate, if use_bigcount is set. - kh = khmer.new_counting_hash(4, 4 ** 4, 4) + kh = khmer.CountingHash(4, 4 ** 4, 4) kh.set_use_bigcount(True) for i in range(0, 1000): @@ -863,7 +863,7 @@ def test_maxcount_with_bigcount_save(): savepath = utils.get_temp_filename('tempcountingsave.ht') kh.save(savepath) - kh = khmer.new_counting_hash(1, 1, 1) + kh = khmer.CountingHash(1, 1, 1) try: kh.load(savepath) except IOError as err: @@ -876,13 +876,13 @@ def test_maxcount_with_bigcount_save(): def test_bigcount_save(): # hashtable should not saturate, if use_bigcount is set. 
- kh = khmer.new_counting_hash(4, 4 ** 4, 4) + kh = khmer.CountingHash(4, 4 ** 4, 4) kh.set_use_bigcount(True) savepath = utils.get_temp_filename('tempcountingsave.ht') kh.save(savepath) - kh = khmer.new_counting_hash(1, 1, 1) + kh = khmer.CountingHash(1, 1, 1) try: kh.load(savepath) except IOError as err: @@ -900,13 +900,13 @@ def test_bigcount_save(): def test_nobigcount_save(): - kh = khmer.new_counting_hash(4, 4 ** 4, 4) + kh = khmer.CountingHash(4, 4 ** 4, 4) # kh.set_use_bigcount(False) <-- this is the default savepath = utils.get_temp_filename('tempcountingsave.ht') kh.save(savepath) - kh = khmer.new_counting_hash(1, 1, 1) + kh = khmer.CountingHash(1, 1, 1) try: kh.load(savepath) except IOError as err: @@ -924,8 +924,8 @@ def test_nobigcount_save(): def test_bigcount_abund_dist(): - kh = khmer.new_counting_hash(18, 1e2, 4) - tracking = khmer.new_hashbits(18, 1e2, 4) + kh = khmer.CountingHash(18, 1e2, 4) + tracking = khmer.Hashbits(18, 1e2, 4) kh.set_use_bigcount(True) seqpath = utils.get_test_data('test-abund-read-2.fa') @@ -936,12 +936,12 @@ def test_bigcount_abund_dist(): print(kh.get('GGTTGACGGGGCTCAGGG')) pdist = [(i, dist[i]) for i in range(len(dist)) if dist[i]] - assert dist[1001] == 1, pdist + assert dist[1002] == 1, pdist def test_bigcount_abund_dist_2(): - kh = khmer.new_counting_hash(18, 1e7, 4) - tracking = khmer.new_hashbits(18, 1e7, 4) + kh = khmer.CountingHash(18, 1e7, 4) + tracking = khmer.Hashbits(18, 1e7, 4) kh.set_use_bigcount(True) seqpath = utils.get_test_data('test-abund-read.fa') @@ -958,7 +958,7 @@ def test_bigcount_abund_dist_2(): def test_bigcount_overflow(): - kh = khmer.new_counting_hash(18, 1e7, 4) + kh = khmer.CountingHash(18, 1e7, 4) kh.set_use_bigcount(True) for i in range(0, 70000): @@ -968,18 +968,18 @@ def test_bigcount_overflow(): def test_get_ksize(): - kh = khmer.new_counting_hash(22, 1, 1) + kh = khmer.CountingHash(22, 1, 1) assert kh.ksize() == 22 def test_get_hashsizes(): - kh = khmer.new_counting_hash(22, 100, 4) - 
assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes() + kh = khmer.CountingHash(22, 100, 4) + assert kh.hashsizes() == [97L, 89L, 83L, 79L], kh.hashsizes() # def test_collect_high_abundance_kmers(): # seqpath = utils.get_test_data('test-abund-read-2.fa') # -# kh = khmer.new_counting_hash(18, 1e6, 4) +# kh = khmer.CountingHash(18, 1e6, 4) # hb = kh.collect_high_abundance_kmers(seqpath, 2, 4) @@ -989,7 +989,7 @@ def test_get_hashsizes(): def test_load_notexist_should_fail(): savepath = utils.get_temp_filename('tempcountingsave0.ht') - hi = khmer.new_counting_hash(12, 1000) + hi = khmer.CountingHash(12, 1000, 2) try: hi.load(savepath) assert 0, "load should fail" @@ -1001,7 +1001,7 @@ def test_load_truncated_should_fail(): inpath = utils.get_test_data('random-20-a.fa') savepath = utils.get_temp_filename('tempcountingsave0.ht') - hi = khmer.new_counting_hash(12, 1000) + hi = khmer.CountingHash(12, 1000, 2) hi.consume_fasta(inpath) hi.save(savepath) @@ -1013,7 +1013,7 @@ def test_load_truncated_should_fail(): fp.write(data[:1000]) fp.close() - hi = khmer.new_counting_hash(12, 1) + hi = khmer._CountingHash(12, [1]) try: hi.load(savepath) assert 0, "load should fail" @@ -1024,7 +1024,7 @@ def test_load_truncated_should_fail(): def test_load_gz_notexist_should_fail(): savepath = utils.get_temp_filename('tempcountingsave0.ht.gz') - hi = khmer.new_counting_hash(12, 1000) + hi = khmer.CountingHash(12, 1000, 2) try: hi.load(savepath) assert 0, "load should fail" @@ -1036,7 +1036,7 @@ def test_load_gz_truncated_should_fail(): inpath = utils.get_test_data('random-20-a.fa') savepath = utils.get_temp_filename('tempcountingsave0.ht.gz') - hi = khmer.new_counting_hash(12, 1000) + hi = khmer.CountingHash(12, 1000, 2) hi.consume_fasta(inpath) hi.save(savepath) @@ -1048,7 +1048,7 @@ def test_load_gz_truncated_should_fail(): fp.write(data[:1000]) fp.close() - hi = khmer.new_counting_hash(12, 1) + hi = khmer._CountingHash(12, [1]) try: hi.load(savepath) assert 0, "load should 
fail" @@ -1057,7 +1057,7 @@ def test_load_gz_truncated_should_fail(): def test_counting_file_version_check(): - ht = khmer.new_counting_hash(12, 1, 1) + ht = khmer.CountingHash(12, 1, 1) inpath = utils.get_test_data('badversion-k12.ct') @@ -1069,7 +1069,7 @@ def test_counting_file_version_check(): def test_counting_gz_file_version_check(): - ht = khmer.new_counting_hash(12, 1, 1) + ht = khmer.CountingHash(12, 1, 1) inpath = utils.get_test_data('badversion-k12.ct.gz') @@ -1083,7 +1083,7 @@ def test_counting_gz_file_version_check(): def test_counting_file_type_check(): inpath = utils.get_test_data('goodversion-k12.ht') - kh = khmer.new_counting_hash(12, 1, 1) + kh = khmer.CountingHash(12, 1, 1) try: kh.load(inpath) @@ -1093,11 +1093,11 @@ def test_counting_file_type_check(): def test_counting_gz_file_type_check(): - ht = khmer.new_hashbits(12, 1, 1) + ht = khmer.Hashbits(12, 1, 1) inpath = utils.get_test_data('goodversion-k12.ht.gz') - kh = khmer.new_counting_hash(12, 1, 1) + kh = khmer.CountingHash(12, 1, 1) try: kh.load(inpath) @@ -1108,14 +1108,14 @@ def test_counting_gz_file_type_check(): def test_counting_bad_primes_list(): try: - ht = khmer.CountingHash(12, ["a", "b", "c"], 1) + ht = khmer._CountingHash(12, ["a", "b", "c"], 1) assert 0, "bad list of primes should fail" except TypeError as e: print(str(e)) def test_bad_use_bigcount(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) countingtable.set_use_bigcount(True) assert countingtable.get_use_bigcount() try: @@ -1126,7 +1126,7 @@ def test_bad_use_bigcount(): def test_consume_absentfasta(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.consume_fasta("absent_file.fa") assert 0, "This should fail" @@ -1135,7 +1135,7 @@ def test_consume_absentfasta(): def test_consume_absentfasta_with_reads_parser(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = 
khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.consume_fasta_with_reads_parser() assert 0, "this should fail" @@ -1152,7 +1152,7 @@ def test_consume_absentfasta_with_reads_parser(): def test_badconsume(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.consume() assert 0, "this should fail" @@ -1166,7 +1166,7 @@ def test_badconsume(): def test_get_badmin_count(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.get_min_count() assert 0, "this should fail" @@ -1180,7 +1180,7 @@ def test_get_badmin_count(): def test_get_badmax_count(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.get_max_count() assert 0, "this should fail" @@ -1194,7 +1194,7 @@ def test_get_badmax_count(): def test_get_badmedian_count(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.get_median_count() assert 0, "this should fail" @@ -1208,7 +1208,7 @@ def test_get_badmedian_count(): def test_get_badkadian_count(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.get_kadian_count() assert 0, "this should fail" @@ -1222,7 +1222,7 @@ def test_get_badkadian_count(): def test_badget(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.get() assert 0, "this should fail" @@ -1231,7 +1231,7 @@ def test_badget(): def test_badget_2(): - countingtable = khmer.new_counting_hash(6, 1e6) + countingtable = khmer.CountingHash(6, 1e6, 2) countingtable.consume(DNA) @@ -1247,7 +1247,7 @@ def test_badget_2(): def test_badtrim(): - countingtable = khmer.new_counting_hash(6, 1e6, 2) + countingtable = khmer.CountingHash(6, 1e6, 2) countingtable.consume(DNA) 
try: @@ -1259,7 +1259,7 @@ def test_badtrim(): def test_badfasta_count_kmers_by_position(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.fasta_count_kmers_by_position() except TypeError as err: @@ -1279,7 +1279,7 @@ def test_badfasta_count_kmers_by_position(): def test_badload(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.load() assert 0, "this should fail" @@ -1288,7 +1288,7 @@ def test_badload(): def test_badsave(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.save() assert 0, "this should fail" @@ -1297,7 +1297,7 @@ def test_badsave(): def test_badksize(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.ksize(True) assert 0, "this should fail" @@ -1306,7 +1306,7 @@ def test_badksize(): def test_badhashsizes(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.hashsizes(True) assert 0, "this should fail" @@ -1315,7 +1315,7 @@ def test_badhashsizes(): def test_badconsume_and_tag(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.consume_and_tag() assert 0, "this should fail" @@ -1324,7 +1324,7 @@ def test_badconsume_and_tag(): def test_consume_fasta_and_tag(): - countingtable = khmer.new_counting_hash(4, 4 ** 4, 4) + countingtable = khmer.CountingHash(4, 4 ** 4, 4) try: countingtable.consume_fasta_and_tag() assert 0, "this should fail" @@ -1334,7 +1334,7 @@ def test_consume_fasta_and_tag(): def test_consume_and_retrieve_tags_1(): - ct = khmer.new_counting_hash(4, 4 ** 4, 4) + ct = khmer.CountingHash(4, 4 ** 4, 4) # first, for each sequence, build tags. 
for record in screed.open(utils.get_test_data('test-graph2.fa')): @@ -1357,7 +1357,7 @@ def test_consume_and_retrieve_tags_1(): def test_consume_and_retrieve_tags_empty(): - ct = khmer.new_counting_hash(4, 4 ** 4, 4) + ct = khmer.CountingHash(4, 4 ** 4, 4) # load each sequence but do not build tags - everything should be empty. for record in screed.open(utils.get_test_data('test-graph2.fa')): @@ -1381,7 +1381,7 @@ def test_consume_and_retrieve_tags_empty(): def test_find_all_tags_list_error(): - ct = khmer.new_counting_hash(4, 4 ** 4, 4) + ct = khmer.CountingHash(4, 4 ** 4, 4) # load each sequence but do not build tags - everything should be empty. for record in screed.open(utils.get_test_data('test-graph2.fa')): @@ -1435,7 +1435,7 @@ def test_abund_dist_gz_bigcount(): def test_counting_load_bigcount(): - count_table = khmer.new_counting_hash(10, 1e5, 4) + count_table = khmer.CountingHash(10, 1e5, 4) count_table.set_use_bigcount(True) for i in range(500): print(i, count_table.count('ATATATATAT')) diff --git a/tests/test_counting_single.py b/tests/test_counting_single.py index 3848690d14..a76e63c4cd 100644 --- a/tests/test_counting_single.py +++ b/tests/test_counting_single.py @@ -17,7 +17,7 @@ def test_no_collision(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) kh.count('AAAA') assert kh.get('AAAA') == 1 @@ -29,14 +29,14 @@ def test_no_collision(): @attr('huge') def test_toobig(): try: - ct = khmer.new_hashtable(4, 1000000000000) + ct = khmer.CountingHash(4, 1000000000000, 1) assert 0, "this should fail" except MemoryError as err: print(str(err)) def test_collision(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) kh.count('AAAA') assert kh.get('AAAA') == 1 @@ -46,7 +46,7 @@ def test_collision(): def test_badcount(): - countingtable = khmer.new_hashtable(4, 4) + countingtable = khmer._CountingHash(4, [5]) try: countingtable.count() assert 0, "count should require one argument" @@ -60,7 +60,7 @@ def test_badcount(): def 
test_hashtable_n_entries(): - countingtable = khmer.new_hashtable(4, 4) + countingtable = khmer._CountingHash(4, [5]) try: countingtable.n_entries("nope") assert 0, "n_entries should accept no arguments" @@ -69,7 +69,7 @@ def test_hashtable_n_entries(): def test_complete_no_collision(): - kh = khmer.new_hashtable(4, 4 ** 2) + kh = khmer._CountingHash(4, [4**4]) for i in range(0, kh.n_entries()): s = khmer.reverse_hash(i, 4) @@ -89,13 +89,13 @@ def test_complete_no_collision(): n_fwd_filled += 1 assert n_rc_filled == kh.n_entries(), n_rc_filled - assert n_palindromes == 16, n_palindromes # @CTB check this + assert n_palindromes == 16, n_palindromes assert n_fwd_filled == kh.n_entries() // 2 + n_palindromes // 2, \ n_fwd_filled def test_complete_2_collision(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) for i in range(0, kh.n_entries()): s = khmer.reverse_hash(i, 4) @@ -116,7 +116,7 @@ def test_complete_2_collision(): def test_complete_4_collision(): - kh = khmer.new_hashtable(4, 2) + kh = khmer._CountingHash(4, [3]) for i in range(0, kh.n_entries()): s = khmer.reverse_hash(i, 4) @@ -138,7 +138,7 @@ def test_complete_4_collision(): def test_maxcount(): # hashtable should saturate at some point so as not to overflow counter - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) last_count = None for _ in range(0, 10000): @@ -156,7 +156,7 @@ def test_maxcount(): def test_maxcount_with_bigcount(): # hashtable should not saturate, if use_bigcount is set. 
- kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) kh.set_use_bigcount(True) last_count = None @@ -174,7 +174,7 @@ def test_maxcount_with_bigcount(): def test_consume_uniqify_first(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) s = "TTTT" s_rc = "AAAA" @@ -186,7 +186,7 @@ def test_consume_uniqify_first(): def test_maxcount_consume(): # hashtable should saturate at some point so as not to overflow counter - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) s = "A" * 10000 kh.consume(s) @@ -197,7 +197,7 @@ def test_maxcount_consume(): def test_maxcount_consume_with_bigcount(): # use the bigcount hack to avoid saturating the hashtable. - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) kh.set_use_bigcount(True) s = "A" * 10000 @@ -208,21 +208,21 @@ def test_maxcount_consume_with_bigcount(): def test_get_mincount(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) s = "AAAAACGT" kh.consume(s) x = kh.get_min_count(s) - assert x == 1 + assert x == 1, x kh.consume(s) x = kh.get_min_count(s) - assert x == 2 + assert x == 2, x def test_get_maxcount(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [7]) s = "AAAAACGT" kh.consume(s) @@ -236,29 +236,29 @@ def test_get_maxcount(): def test_get_maxcount_rc(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [7]) s = "AAAAACGT" src = "ACGTTTTT" kh.consume(s) x = kh.get_max_count(s) - assert x == 2 + assert x == 2, x kh.consume(src) x = kh.get_max_count(s) - assert x == 4 + assert x == 4, x def test_get_mincount_rc(): - kh = khmer.new_hashtable(4, 4) + kh = khmer._CountingHash(4, [5]) s = "AAAAACGT" src = "ACGTTTTT" kh.consume(s) x = kh.get_min_count(s) - assert x == 1 + assert x == 1, x kh.consume(src) x = kh.get_min_count(s) @@ -266,7 +266,7 @@ def test_get_mincount_rc(): def test_badget(): - kh = khmer.new_hashtable(6, 4 ** 10) + kh = khmer.CountingHash(6, 4 ** 10, 1) DNA = 
"AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG" @@ -284,7 +284,7 @@ def test_badget(): def test_64bitshift(): - kh = khmer.new_hashtable(25, 4) + kh = khmer.CountingHash(25, 4, 1) fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG" substr = "ATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGC" @@ -293,7 +293,7 @@ def test_64bitshift(): def test_64bitshift_2(): - kh = khmer.new_hashtable(25, 4) + kh = khmer.CountingHash(25, 4, 1) fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG" kh.consume(fullstr) @@ -304,12 +304,12 @@ def test_64bitshift_2(): def test_very_short_read(): short_filename = utils.get_test_data('test-short.fa') - kh = khmer.new_hashtable(9, 4) + kh = khmer.CountingHash(9, 4, 1) n_reads, n_kmers = kh.consume_fasta(short_filename) assert n_reads == 1, n_reads assert n_kmers == 0, n_kmers - kh = khmer.new_hashtable(8, 4) + kh = khmer.CountingHash(8, 4, 1) n_reads, n_kmers = kh.consume_fasta(short_filename) assert n_reads == 1, n_reads assert n_kmers == 1, n_kmers @@ -318,7 +318,7 @@ def test_very_short_read(): class Test_ConsumeString(object): def setup(self): - self.kh = khmer.new_hashtable(4, 4 ** 4) + self.kh = khmer._CountingHash(4, [4**4]) def test_n_occupied(self): assert self.kh.n_occupied() == 0 @@ -375,7 +375,7 @@ def test_n_occupied_args(self): assert self.kh.n_occupied() == 0 self.kh.consume('AAAA') assert self.kh.n_occupied(0, 1) == 1 - assert self.kh.n_occupied(1, 4 ** 4) == 0 + assert self.kh.n_occupied(1, 4 ** 4) == 0, self.kh.n_occupied() hashvalue = khmer.forward_hash('AACT', 4) self.kh.consume('AACT') @@ -414,14 +414,14 @@ def test_max_count(self): class Test_AbundanceDistribution(object): def setup(self): - self.kh = khmer.new_hashtable(4, 4) + self.kh = khmer._CountingHash(4, [5]) A_filename = utils.get_test_data('all-A.fa') self.kh.consume_fasta(A_filename) def test_count_A(self): A_filename = utils.get_test_data('all-A.fa') - tracking = khmer.new_hashbits(4, 4, 1) + 
tracking = khmer._Hashbits(4, [5]) dist = self.kh.abundance_distribution(A_filename, tracking) assert sum(dist) == 1 diff --git a/tests/test_filter.py b/tests/test_filter.py index 193ac89a24..2ff9091c6f 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -27,7 +27,7 @@ def load_fa_seq_names(filename): class Test_Filter(object): def test_abund(self): - ht = khmer.new_hashtable(10, 4 ** 10) + ht = khmer.CountingHash(10, 4 ** 10, 1) filename = utils.get_test_data('test-abund-read.fa') outname = utils.get_temp_filename('test_abund.out') diff --git a/tests/test_functions.py b/tests/test_functions.py index 46e7e6b47e..72a747538a 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -78,6 +78,16 @@ def test_get_primes(): assert primes == [19, 17, 13, 11, 7, 5, 3] +def test_get_primes_fal(): + try: + primes = khmer.get_n_primes_near_x(5, 5) + assert 0, "previous statement should fail" + except AssertionError: + raise + except Exception as err: + assert "unable to find 5 prime numbers < 5" in str(err) + + def test_extract_countinghash_info_badfile(): try: khmer.extract_countinghash_info( @@ -90,7 +100,7 @@ def test_extract_countinghash_info_badfile(): def test_extract_countinghash_info(): fn = utils.get_temp_filename('test_extract_counting.ct') for size in [1e6, 2e6, 5e6, 1e7]: - ht = khmer.new_counting_hash(25, size, 4) + ht = khmer.CountingHash(25, size, 4) ht.save(fn) try: diff --git a/tests/test_graph.py b/tests/test_graph.py index 350db5d1e1..5afcb92671 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -21,7 +21,7 @@ def teardown(): class Test_ExactGraphFu(object): def setup(self): - self.ht = khmer.new_hashbits(12, 1e4) + self.ht = khmer.Hashbits(12, 1e4, 2) def test_counts(self): ht = self.ht @@ -115,7 +115,7 @@ def test_graph_links_prev_t(self): class Test_InexactGraphFu(object): def setup(self): - self.ht = khmer.new_hashbits(12, 4 ** 2 + 1) + self.ht = khmer.Hashbits(12, 4 ** 3 + 1, 2) def test_graph_links_next_a(self): ht = 
self.ht @@ -133,7 +133,7 @@ def test_graph_links_next_c(self): ht.consume(word[1:] + "C") x = ht.calc_connected_graph_size(word) - assert x == 2 + assert x == 2, x def test_graph_links_next_g(self): ht = self.ht @@ -199,7 +199,7 @@ def test_output_unassigned(self): filename = utils.get_test_data('random-20-a.fa') - ht = khmer.new_hashbits(21, 4, 4) + ht = khmer._Hashbits(21, [5, 7, 11, 13]) ht.consume_fasta_and_tag(filename) output_file = utils.get_temp_filename('part0test') @@ -216,7 +216,7 @@ def test_not_output_unassigned(self): filename = utils.get_test_data('random-20-a.fa') - ht = khmer.new_hashbits(21, 4, 4) + ht = khmer._Hashbits(21, [5, 7, 11, 13]) ht.consume_fasta_and_tag(filename) output_file = utils.get_temp_filename('parttest') @@ -231,7 +231,7 @@ def test_not_output_unassigned(self): def test_output_fq(self): filename = utils.get_test_data('random-20-a.fq') - ht = khmer.new_hashbits(20, 1e4, 4) + ht = khmer.Hashbits(20, 1e4, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) ht.merge_subset(subset) @@ -247,7 +247,7 @@ def test_output_fq(self): def test_disconnected_20_a(self): filename = utils.get_test_data('random-20-a.fa') - ht = khmer.new_hashbits(21, 1e5, 4) + ht = khmer.Hashbits(21, 1e5, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -257,7 +257,7 @@ def test_disconnected_20_a(self): def test_connected_20_a(self): filename = utils.get_test_data('random-20-a.fa') - ht = khmer.new_hashbits(20, 1e4, 4) + ht = khmer.Hashbits(20, 1e4, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -267,7 +267,7 @@ def test_connected_20_a(self): def test_disconnected_20_b(self): filename = utils.get_test_data('random-20-b.fa') - ht = khmer.new_hashbits(21, 1e4, 4) + ht = khmer.Hashbits(21, 1e4, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -277,7 +277,7 @@ def test_disconnected_20_b(self): def test_connected_20_b(self): filename = 
utils.get_test_data('random-20-b.fa') - ht = khmer.new_hashbits(20, 1e4, 4) + ht = khmer.Hashbits(20, 1e4, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -287,7 +287,7 @@ def test_connected_20_b(self): def test_disconnected_31_c(self): filename = utils.get_test_data('random-31-c.fa') - ht = khmer.new_hashbits(32, 1e6, 4) + ht = khmer.Hashbits(32, 1e6, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -297,7 +297,7 @@ def test_disconnected_31_c(self): def test_connected_31_c(self): filename = utils.get_test_data('random-31-c.fa') - ht = khmer.new_hashbits(31, 1e5, 4) + ht = khmer.Hashbits(31, 1e5, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -310,7 +310,7 @@ def test_connected_31_c(self): class Test_PythonAPI(object): def test_find_all_tags_kmersize(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT" b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT" @@ -330,7 +330,7 @@ def test_find_all_tags_kmersize(self): pass def test_ordered_connect(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT" b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT" diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index 0e5ed16a57..12a4f09606 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -21,8 +21,17 @@ def teardown(): utils.cleanup() +@attr('huge') +def test_toobig(): + try: + pt = khmer.Hashbits(32, 1e13, 1) + assert 0, "This should fail" + except MemoryError as err: + print(str(err)) + + def test__get_set_tag_density(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) orig = ht._get_tag_density() assert orig != 2 @@ -108,13 +117,13 @@ def test_n_occupied_1(): N_HT = 1 # number of hashtables # test modified c++ n_occupied code - ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht1 = khmer.Hashbits(K, HT_SIZE, N_HT) for n, 
record in enumerate(fasta_iter(open(filename))): ht1.consume(record['sequence']) # this number calculated independently - assert ht1.n_occupied() == 3877 + assert ht1.n_occupied() == 3884, ht1.n_occupied() def test_bloom_python_1(): @@ -125,7 +134,7 @@ def test_bloom_python_1(): HT_SIZE = 100000 # size of hashtable N_HT = 3 # number of hashtables - ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht2 = khmer.Hashbits(K, HT_SIZE, N_HT) n_unique = 0 for n, record in enumerate(fasta_iter(open(filename))): @@ -138,8 +147,10 @@ def test_bloom_python_1(): ht2.count(kmer) assert n_unique == 3960 - assert ht2.n_occupied() == 3882 - assert ht2.n_unique_kmers() == 3960 # this number equals to n_unique + assert ht2.n_occupied() == 3885, ht2.n_occupied() + + # this number equals n_unique + assert ht2.n_unique_kmers() == 3960, ht2.n_unique_kmers() def test_bloom_c_1(): @@ -151,12 +162,12 @@ def test_bloom_c_1(): HT_SIZE = 100000 # size of hashtable N_HT = 3 # number of hashtables - ht3 = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht3 = khmer.Hashbits(K, HT_SIZE, N_HT) for n, record in enumerate(fasta_iter(open(filename))): ht3.consume(record['sequence']) - assert ht3.n_occupied() == 3882 + assert ht3.n_occupied() == 3885 assert ht3.n_unique_kmers() == 3960 @@ -165,7 +176,7 @@ def test_n_occupied_2(): # simple one HT_SIZE = 10 # use 11 N_HT = 1 - ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht1 = khmer._Hashbits(K, [11]) ht1.count('AAAA') # 00 00 00 00 = 0 assert ht1.n_occupied() == 1 @@ -176,17 +187,14 @@ def test_n_occupied_2(): # simple one assert ht1.n_occupied() == 2 ht1.count('AGAC') # 00 11 00 10 # collision 2 - assert ht1.n_occupied() == 2 + assert ht1.n_occupied() == 2, ht1.n_occupied() def test_bloom_c_2(): # simple one K = 4 - HT_SIZE = 10 # use 11 - N_HT1 = 1 # hashtable size = 11 - N_HT2 = 2 # hashtable size = 11,13 # use only 1 hashtable, no bloom filter - ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT1) + ht1 = khmer._Hashbits(K, [11]) ht1.count('AAAA') # 00 00 00 00 = 0 
ht1.count('ACTG') # 00 10 01 11 = assert ht1.n_unique_kmers() == 2 @@ -196,7 +204,7 @@ def test_bloom_c_2(): # simple one assert ht1.n_unique_kmers() == 2 # use two hashtables with 11,13 - ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT2) + ht2 = khmer._Hashbits(K, [11, 13]) ht2.count('AAAA') # 00 00 00 00 = 0 ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39 @@ -210,7 +218,7 @@ def test_bloom_c_2(): # simple one def test_filter_if_present(): - ht = khmer.new_hashbits(32, 2, 2) + ht = khmer._Hashbits(32, [3, 5]) maskfile = utils.get_test_data('filter-test-A.fa') inputfile = utils.get_test_data('filter-test-B.fa') @@ -226,7 +234,7 @@ def test_filter_if_present(): def test_combine_pe(): inpfile = utils.get_test_data('combine_parts_1.fa') - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) ht.consume_partitioned_fasta(inpfile) assert ht.count_partitions() == (2, 0) @@ -251,7 +259,7 @@ def test_combine_pe(): def test_load_partitioned(): inpfile = utils.get_test_data('combine_parts_1.fa') - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) ht.consume_partitioned_fasta(inpfile) assert ht.count_partitions() == (2, 0) @@ -268,7 +276,7 @@ def test_load_partitioned(): def test_count_within_radius_simple(): inpfile = utils.get_test_data('all-A.fa') - ht = khmer.new_hashbits(4, 2, 2) + ht = khmer._Hashbits(4, [3, 5]) print(ht.consume_fasta(inpfile)) n = ht.count_kmers_within_radius('AAAA', 1) @@ -280,13 +288,13 @@ def test_count_within_radius_simple(): def test_count_within_radius_big(): inpfile = utils.get_test_data('random-20-a.fa') - ht = khmer.new_hashbits(20, 1e5, 4) + ht = khmer.Hashbits(20, 1e5, 4) ht.consume_fasta(inpfile) n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6)) - assert n == 3960 + assert n == 3961, n - ht = khmer.new_hashbits(21, 1e5, 4) + ht = khmer.Hashbits(21, 1e5, 4) ht.consume_fasta(inpfile) n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6)) assert n == 39 @@ -294,7 +302,7 @@ def 
test_count_within_radius_big(): def test_count_kmer_degree(): inpfile = utils.get_test_data('all-A.fa') - ht = khmer.new_hashbits(4, 2, 2) + ht = khmer._Hashbits(4, [3, 5]) ht.consume_fasta(inpfile) assert ht.kmer_degree('AAAA') == 2 @@ -304,7 +312,7 @@ def test_count_kmer_degree(): def test_save_load_tagset(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) outfile = utils.get_temp_filename('tagset') @@ -326,7 +334,7 @@ def test_save_load_tagset(): def test_save_load_tagset_noclear(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) outfile = utils.get_temp_filename('tagset') @@ -354,7 +362,7 @@ def test_stop_traverse(): HT_SIZE = 1e4 # size of hashtable N_HT = 3 # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) # without tagging/joining across consume, this breaks into two partition; # with, it is one partition. @@ -375,7 +383,7 @@ def test_tag_across_stoptraverse(): HT_SIZE = 1e4 # size of hashtable N_HT = 3 # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) # without tagging/joining across consume, this breaks into two partition; # with, it is one partition. @@ -403,7 +411,7 @@ def test_notag_across_stoptraverse(): HT_SIZE = 1e4 # size of hashtable N_HT = 3 # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) # connecting k-mer at the beginning/end of a read: breaks up into two. 
ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') @@ -418,7 +426,7 @@ def test_notag_across_stoptraverse(): def test_find_stoptags(): - ht = khmer.new_hashbits(5, 1, 1) + ht = khmer._Hashbits(5, [1]) ht.add_stop_tag("AAAAA") assert ht.identify_stoptags_by_position("AAAAA") == [0] @@ -428,7 +436,7 @@ def test_find_stoptags(): def test_find_stoptags2(): - ht = khmer.new_hashbits(4, 1, 1) + ht = khmer._Hashbits(4, [1]) ht.add_stop_tag("ATGC") x = ht.identify_stoptags_by_position("ATGCATGCGCAT") @@ -436,17 +444,17 @@ def test_find_stoptags2(): def test_get_ksize(): - kh = khmer.new_hashbits(22, 1, 1) + kh = khmer._Hashbits(22, [1]) assert kh.ksize() == 22 def test_get_hashsizes(): - kh = khmer.new_hashbits(22, 100, 4) - assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes() + kh = khmer.Hashbits(22, 100, 4) + assert kh.hashsizes() == [97L, 89L, 83L, 79L], kh.hashsizes() def test_extract_unique_paths_0(): - kh = khmer.new_hashbits(10, 4, 4) + kh = khmer._Hashbits(10, [5, 7, 11, 13]) x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG'] @@ -457,7 +465,7 @@ def test_extract_unique_paths_0(): def test_extract_unique_paths_1(): - kh = khmer.new_hashbits(10, 4, 4) + kh = khmer._Hashbits(10, [5, 7, 11, 13]) kh.consume('AGTGGCGATG') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) @@ -466,7 +474,7 @@ def test_extract_unique_paths_1(): def test_extract_unique_paths_2(): - kh = khmer.new_hashbits(10, 4, 4) + kh = khmer._Hashbits(10, [5, 7, 11, 13]) kh.consume('ATGGAGAGAC') x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) @@ -475,7 +483,7 @@ def test_extract_unique_paths_2(): def test_extract_unique_paths_3(): - kh = khmer.new_hashbits(10, 4, 4) + kh = khmer._Hashbits(10, [5, 7, 11, 13]) kh.consume('ATGGAGAGAC') kh.consume('AGTGGCGATG') @@ -486,7 +494,7 @@ def test_extract_unique_paths_3(): def test_extract_unique_paths_4(): - kh = khmer.new_hashbits(10, 4, 4) + kh = 
khmer.Hashbits(10, 1e6, 4) kh.consume('ATGGAGAGAC') kh.consume('AGTGGCGATG') @@ -506,7 +514,7 @@ def test_find_unpart(): HT_SIZE = 1e4 # size of hashtable N_HT = 3 # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -528,7 +536,7 @@ def test_find_unpart_notraverse(): HT_SIZE = 1e4 # size of hashtable N_HT = 3 # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -550,7 +558,7 @@ def test_find_unpart_fail(): HT_SIZE = 1e4 # size of hashtable N_HT = 3 # number of hashtables - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.Hashbits(K, HT_SIZE, N_HT) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -565,7 +573,7 @@ def test_find_unpart_fail(): def test_simple_median(): - hi = khmer.new_hashbits(6, 2, 2) + hi = khmer.Hashbits(6, 1e5, 2) (median, average, stddev) = hi.get_median_count("AAAAAA") print(median, average, stddev) @@ -582,7 +590,7 @@ def test_simple_median(): def test_badget(): - hbts = khmer.new_hashbits(6, 1e6, 1) + hbts = khmer.Hashbits(6, 1e6, 1) dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG" @@ -611,7 +619,7 @@ def test_badget(): def test_load_notexist_should_fail(): savepath = utils.get_temp_filename('temphashbitssave0.ht') - hi = khmer.new_counting_hash(12, 2) + hi = khmer._CountingHash(12, [1]) try: hi.load(savepath) assert 0, "load should fail" @@ -623,7 +631,8 @@ def test_load_truncated_should_fail(): inpath = utils.get_test_data('random-20-a.fa') savepath = utils.get_temp_filename('temphashbitssave0.ct') - hi = khmer.new_counting_hash(12, 1000) + hi = khmer.CountingHash(12, 1000, 2) + hi.consume_fasta(inpath) hi.save(savepath) @@ -635,7 +644,7 @@ def test_load_truncated_should_fail(): fp.write(data[:1000]) fp.close() - hi = 
khmer.new_counting_hash(12, 1) + hi = khmer._CountingHash(12, [1]) try: hi.load(savepath) assert 0, "load should fail" @@ -644,7 +653,7 @@ def test_load_truncated_should_fail(): def test_save_load_tagset_notexist(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) outfile = utils.get_temp_filename('tagset') try: @@ -655,7 +664,7 @@ def test_save_load_tagset_notexist(): def test_save_load_tagset_trunc(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) outfile = utils.get_temp_filename('tagset') @@ -690,13 +699,13 @@ def _build_testfiles(): # hashbits file inpath = utils.get_test_data('random-20-a.fa') - hi = khmer.new_hashbits(12, 2) + hi = khmer.Hashbits(12, 2) hi.consume_fasta(inpath) hi.save('/tmp/goodversion-k12.ht') # tagset file - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) ht.add_tag('A' * 32) ht.add_tag('G' * 32) @@ -706,7 +715,7 @@ def _build_testfiles(): fakelump_fa = utils.get_test_data('fakelump.fa') - ht = khmer.new_hashbits(32, 4, 4) + ht = khmer.Hashbits(32, 4, 4) ht.consume_fasta_and_tag(fakelump_fa) subset = ht.do_subset_partition(0, 0) @@ -715,7 +724,7 @@ def _build_testfiles(): EXCURSION_DISTANCE = 40 EXCURSION_KMER_THRESHOLD = 82 EXCURSION_KMER_COUNT_THRESHOLD = 1 - counting = khmer.new_counting_hash(32, 4, 4) + counting = khmer.CountingHash(32, 4, 4) ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE, @@ -726,7 +735,7 @@ def _build_testfiles(): def test_hashbits_file_version_check(): - ht = khmer.new_hashbits(12, 1, 1) + ht = khmer._Hashbits(12, [1]) inpath = utils.get_test_data('badversion-k12.ht') @@ -738,11 +747,11 @@ def test_hashbits_file_version_check(): def test_hashbits_file_type_check(): - kh = khmer.new_counting_hash(12, 1, 1) + kh = khmer._CountingHash(12, [1]) savepath = utils.get_temp_filename('tempcountingsave0.ct') kh.save(savepath) - ht = khmer.new_hashbits(12, 1, 1) + ht = khmer._Hashbits(12, [1]) try: ht.load(savepath) @@ -752,7 +761,7 @@ def 
test_hashbits_file_type_check(): def test_stoptags_file_version_check(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) inpath = utils.get_test_data('badversion-k32.stoptags') @@ -764,7 +773,7 @@ def test_stoptags_file_version_check(): def test_stoptags_ksize_check(): - ht = khmer.new_hashbits(31, 1, 1) + ht = khmer._Hashbits(31, [1]) inpath = utils.get_test_data('goodversion-k32.stoptags') try: @@ -775,7 +784,7 @@ def test_stoptags_ksize_check(): def test_stop_tags_filetype_check(): - ht = khmer.new_hashbits(31, 1, 1) + ht = khmer._Hashbits(31, [1]) inpath = utils.get_test_data('goodversion-k32.tagset') try: @@ -786,7 +795,7 @@ def test_stop_tags_filetype_check(): def test_tagset_file_version_check(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) inpath = utils.get_test_data('badversion-k32.tagset') @@ -798,7 +807,7 @@ def test_tagset_file_version_check(): def test_stop_tags_truncate_check(): - ht = khmer.new_hashbits(32, 1, 1) + ht = khmer._Hashbits(32, [1]) inpath = utils.get_test_data('goodversion-k32.tagset') data = open(inpath, 'rb').read() @@ -817,7 +826,7 @@ def test_stop_tags_truncate_check(): def test_tagset_ksize_check(): - ht = khmer.new_hashbits(31, 1, 1) + ht = khmer._Hashbits(31, [1]) inpath = utils.get_test_data('goodversion-k32.tagset') try: @@ -828,7 +837,7 @@ def test_tagset_ksize_check(): def test_tagset_filetype_check(): - ht = khmer.new_hashbits(31, 1, 1) + ht = khmer._Hashbits(31, [1]) inpath = utils.get_test_data('goodversion-k32.stoptags') try: @@ -847,7 +856,7 @@ def test_bad_primes_list(): def test_consume_absentfasta_with_reads_parser(): - presencetable = khmer.new_hashbits(31, 1, 1) + presencetable = khmer._Hashbits(31, [1]) try: presencetable.consume_fasta_with_reads_parser() assert 0, "this should fail" @@ -861,3 +870,24 @@ def test_consume_absentfasta_with_reads_parser(): print(str(err)) except ValueError as err: print(str(err)) + + +def test_bad_primes(): + try: + countingtable = 
khmer._Hashbits.__new__( + khmer._Hashbits, 6, ["a", "b", "c"]) + assert 0, "this should fail" + except TypeError as e: + print(str(e)) + + +def test_consume_fasta_and_tag_with_badreads_parser(): + presencetable = khmer.Hashbits(6, 1e6, 2) + try: + readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa")) + presencetable.consume_fasta_and_tag_with_reads_parser(readsparser) + assert 0, "this should fail" + except IOError as e: + print(str(e)) + except ValueError as e: + print(str(e)) diff --git a/tests/test_hashbits_obj.py b/tests/test_hashbits_obj.py deleted file mode 100644 index c23fcebd8d..0000000000 --- a/tests/test_hashbits_obj.py +++ /dev/null @@ -1,565 +0,0 @@ -from __future__ import print_function -from __future__ import absolute_import -# -# This file is part of khmer, https://github.com/dib-lab/khmer/, and is -# Copyright (C) Michigan State University, 2009-2015. It is licensed under -# the three-clause BSD license; see LICENSE. -# Contact: khmer-project@idyll.org -# -# pylint: disable=missing-docstring,protected-access - -# -# This is an exact copy of test_hashbits, with all invocations of -# khmer.new_hashbits replaced by khmer.Hashbits constructor calls -# - -import khmer -from khmer import Hashbits - -from screed.fasta import fasta_iter -import screed - -from . 
import khmer_tst_utils as utils -from nose.plugins.attrib import attr - - -def teardown(): - utils.cleanup() - - -@attr('huge') -def test_toobig(): - try: - pt = khmer.Hashbits(32, 1e13, 1) - assert 0, "This should fail" - except MemoryError as err: - print(str(err)) - - -def test__get_set_tag_density(): - ht = khmer.Hashbits(32, 1, 1) - - orig = ht._get_tag_density() - assert orig != 2 - ht._set_tag_density(2) - assert ht._get_tag_density() == 2 - - -def test_n_occupied_1(): - filename = utils.get_test_data('random-20-a.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 1 # number of hashtables - - # test modified c++ n_occupied code - ht1 = khmer.Hashbits(K, HT_SIZE, N_HT) - - for n, record in enumerate(fasta_iter(open(filename))): - ht1.consume(record['sequence']) - - # this number calculated independently - assert ht1.n_occupied() == 3877 - - -def test_bloom_python_1(): - # test python code to count unique kmers using bloom filter - filename = utils.get_test_data('random-20-a.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht2 = khmer.Hashbits(K, HT_SIZE, N_HT) - - n_unique = 0 - for n, record in enumerate(fasta_iter(open(filename))): - sequence = record['sequence'] - seq_len = len(sequence) - for n in range(0, seq_len + 1 - K): - kmer = sequence[n:n + K] - if (not ht2.get(kmer)): - n_unique += 1 - ht2.count(kmer) - - assert n_unique == 3960 - assert ht2.n_occupied() == 3882 - assert ht2.n_unique_kmers() == 3960 # this number equals to n_unique - - -def test_bloom_c_1(): - # test c++ code to count unique kmers using bloom filter - - filename = utils.get_test_data('random-20-a.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht3 = khmer.Hashbits(K, HT_SIZE, N_HT) - - for n, record in enumerate(fasta_iter(open(filename))): - ht3.consume(record['sequence']) - - assert ht3.n_occupied() == 3882 - assert 
ht3.n_unique_kmers() == 3960 - - -def test_n_occupied_2(): # simple one - K = 4 - HT_SIZE = 10 # use 11 - N_HT = 1 - - ht1 = khmer.Hashbits(K, HT_SIZE, N_HT) - ht1.count('AAAA') # 00 00 00 00 = 0 - assert ht1.n_occupied() == 1 - - ht1.count('ACTG') # 00 10 01 11 = - assert ht1.n_occupied() == 2 - - ht1.count('AACG') # 00 00 10 11 = 11 # collision 1 - - assert ht1.n_occupied() == 2 - ht1.count('AGAC') # 00 11 00 10 # collision 2 - assert ht1.n_occupied() == 2 - - -def test_bloom_c_2(): # simple one - K = 4 - HT_SIZE = 10 # use 11 - N_HT1 = 1 # hashtable size = 11 - N_HT2 = 2 # hashtable size = 11,13 - - # use only 1 hashtable, no bloom filter - ht1 = khmer.Hashbits(K, HT_SIZE, N_HT1) - ht1.count('AAAA') # 00 00 00 00 = 0 - ht1.count('ACTG') # 00 10 01 11 = - assert ht1.n_unique_kmers() == 2 - ht1.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer - assert ht1.n_unique_kmers() == 2 - ht1.count('AGAC') # 00 11 00 10 # collision with 2nd kmer - assert ht1.n_unique_kmers() == 2 - - # use two hashtables with 11,13 - ht2 = khmer.Hashbits(K, HT_SIZE, N_HT2) - ht2.count('AAAA') # 00 00 00 00 = 0 - - ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39 - assert ht2.n_unique_kmers() == 2 - ht2.count('AACG') # 00 00 10 11 = 11 # collision with only 1st kmer - assert ht2.n_unique_kmers() == 3 - ht2.count('AGAC') # 00 11 00 10 3*16 +2 = 50 - # collision with both 2nd and 3rd kmers - - assert ht2.n_unique_kmers() == 3 - - -def test_filter_if_present(): - ht = khmer.Hashbits(32, 1e6, 2) - - maskfile = utils.get_test_data('filter-test-A.fa') - inputfile = utils.get_test_data('filter-test-B.fa') - outfile = utils.get_temp_filename('filter') - - ht.consume_fasta(maskfile) - ht.filter_if_present(inputfile, outfile) - - records = list(fasta_iter(open(outfile))) - assert len(records) == 1 - assert records[0]['name'] == '3' - - -def test_combine_pe(): - inpfile = utils.get_test_data('combine_parts_1.fa') - ht = khmer.Hashbits(32, 1, 1) - - ht.consume_partitioned_fasta(inpfile) - 
assert ht.count_partitions() == (2, 0) - - s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT" - pid1 = ht.get_partition_id(s1) - - s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG" - pid2 = ht.get_partition_id(s2) - - assert pid1 == 2 - assert pid2 == 80293 - - ht.join_partitions(pid1, pid2) - - pid1 = ht.get_partition_id(s1) - pid2 = ht.get_partition_id(s2) - - assert pid1 == pid2 - assert ht.count_partitions() == (1, 0) - - -def test_load_partitioned(): - inpfile = utils.get_test_data('combine_parts_1.fa') - ht = khmer.Hashbits(32, 1, 1) - - ht.consume_partitioned_fasta(inpfile) - assert ht.count_partitions() == (2, 0) - - s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT" - assert ht.get(s1) - - s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG" - assert ht.get(s2) - - s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:] - assert ht.get(s3) - - -def test_count_within_radius_simple(): - inpfile = utils.get_test_data('all-A.fa') - ht = khmer.Hashbits(4, 1e6, 2) - - print(ht.consume_fasta(inpfile)) - n = ht.count_kmers_within_radius('AAAA', 1) - assert n == 1 - - n = ht.count_kmers_within_radius('AAAA', 10) - assert n == 1 - - -def test_count_within_radius_big(): - inpfile = utils.get_test_data('random-20-a.fa') - ht = khmer.Hashbits(20, 1e6, 4) - - ht.consume_fasta(inpfile) - n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6)) - assert n == 3960 - - ht = khmer.Hashbits(21, 1e6, 4) - ht.consume_fasta(inpfile) - n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6)) - assert n == 39 - - -def test_count_kmer_degree(): - inpfile = utils.get_test_data('all-A.fa') - ht = khmer.Hashbits(4, 1e6, 2) - ht.consume_fasta(inpfile) - - assert ht.kmer_degree('AAAA') == 2 - assert ht.kmer_degree('AAAT') == 1 - assert ht.kmer_degree('AATA') == 0 - assert ht.kmer_degree('TAAA') == 1 - - -def test_save_load_tagset(): - ht = khmer.Hashbits(32, 1, 1) - - outfile = utils.get_temp_filename('tagset') - - ht.add_tag('A' * 32) - ht.save_tagset(outfile) - - ht.add_tag('G' * 32) - - 
ht.load_tagset(outfile) # implicitly => clear_tags=True - ht.save_tagset(outfile) - - # if tags have been cleared, then the new tagfile will be larger (34 bytes) - # else smaller (26 bytes). - - fp = open(outfile, 'rb') - data = fp.read() - fp.close() - assert len(data) == 30, len(data) - - -def test_save_load_tagset_noclear(): - ht = khmer.Hashbits(32, 1, 1) - - outfile = utils.get_temp_filename('tagset') - - ht.add_tag('A' * 32) - ht.save_tagset(outfile) - - ht.add_tag('G' * 32) - - ht.load_tagset(outfile, False) # set clear_tags => False; zero tags - ht.save_tagset(outfile) - - # if tags have been cleared, then the new tagfile will be large (34 bytes); - # else small (26 bytes). - - fp = open(outfile, 'rb') - data = fp.read() - fp.close() - assert len(data) == 38, len(data) - - -def test_stop_traverse(): - filename = utils.get_test_data('random-20-a.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht = khmer.Hashbits(K, HT_SIZE, N_HT) - - # without tagging/joining across consume, this breaks into two partition; - # with, it is one partition. - ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') - - ht.consume_fasta_and_tag(filename) # DO NOT join reads across stoptags - subset = ht.do_subset_partition(0, 0, True) - ht.merge_subset(subset) - - n, _ = ht.count_partitions() - assert n == 2, n - - -def test_tag_across_stoptraverse(): - filename = utils.get_test_data('random-20-a.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht = khmer.Hashbits(K, HT_SIZE, N_HT) - - # without tagging/joining across consume, this breaks into two partition; - # with, it is one partition. - ht.add_stop_tag('CCGAATATATAACAGCGACG') - - ht.consume_fasta_and_tag_with_stoptags(filename) # DO join reads across - - subset = ht.do_subset_partition(0, 0) - n, _ = ht.count_partitions() - assert n == 99 # reads only connected by traversal... 
- - n, _ = ht.subset_count_partitions(subset) - assert n == 2 # but need main to cross stoptags. - - ht.merge_subset(subset) - - n, _ = ht.count_partitions() # ta-da! - assert n == 1, n - - -def test_notag_across_stoptraverse(): - filename = utils.get_test_data('random-20-a.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht = khmer.Hashbits(K, HT_SIZE, N_HT) - - # connecting k-mer at the beginning/end of a read: breaks up into two. - ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') - - ht.consume_fasta_and_tag_with_stoptags(filename) - - subset = ht.do_subset_partition(0, 0) - ht.merge_subset(subset) - - n, _ = ht.count_partitions() - assert n == 2, n - - -def test_find_stoptags(): - ht = khmer.Hashbits(5, 1, 1) - ht.add_stop_tag("AAAAA") - - assert ht.identify_stoptags_by_position("AAAAA") == [0] - assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1] - assert ht.identify_stoptags_by_position("TTTTT") == [0] - assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1] - - -def test_find_stoptags2(): - ht = khmer.Hashbits(4, 1, 1) - ht.add_stop_tag("ATGC") - - x = ht.identify_stoptags_by_position("ATGCATGCGCAT") - assert x == [0, 2, 4, 8], x - - -def test_get_ksize(): - kh = khmer.Hashbits(22, 1, 1) - assert kh.ksize() == 22 - - -def test_get_hashsizes(): - kh = khmer.Hashbits(22, 100, 4) - assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes() - - -def test_extract_unique_paths_0(): - kh = khmer.Hashbits(10, 1e5, 4) - - x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG'] - - kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG') - x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - assert not x - - -def test_extract_unique_paths_1(): - kh = khmer.Hashbits(10, 1e5, 4) - - kh.consume('AGTGGCGATG') - x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print(x) - assert x == 
['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer - - -def test_extract_unique_paths_2(): - kh = khmer.Hashbits(10, 1e5, 4) - - kh.consume('ATGGAGAGAC') - x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print(x) - assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer - - -def test_extract_unique_paths_3(): - kh = khmer.Hashbits(10, 1e5, 4) - - kh.consume('ATGGAGAGAC') - kh.consume('AGTGGCGATG') - x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print(x) - # all but the 1st/last k-mer - assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT'] - - -def test_extract_unique_paths_4(): - kh = khmer.Hashbits(10, 1e5, 4) - - kh.consume('ATGGAGAGAC') - kh.consume('AGTGGCGATG') - - kh.consume('ATAGACAGGA') - - x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) - print(x) - assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT'] - - -def test_find_unpart(): - filename = utils.get_test_data('random-20-a.odd.fa') - filename2 = utils.get_test_data('random-20-a.even.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht = khmer.Hashbits(K, HT_SIZE, N_HT) - ht.consume_fasta_and_tag(filename) - - subset = ht.do_subset_partition(0, 0) - ht.merge_subset(subset) - - n, _ = ht.count_partitions() - assert n == 49 - - ht.find_unpart(filename2, True, False) - n, _ = ht.count_partitions() - assert n == 1, n # all sequences connect - - -def test_find_unpart_notraverse(): - filename = utils.get_test_data('random-20-a.odd.fa') - filename2 = utils.get_test_data('random-20-a.even.fa') - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht = khmer.Hashbits(K, HT_SIZE, N_HT) - ht.consume_fasta_and_tag(filename) - - subset = ht.do_subset_partition(0, 0) - ht.merge_subset(subset) - - n, _ = ht.count_partitions() - assert n == 49 - - ht.find_unpart(filename2, False, False) # <-- don't traverse - 
n, _ = ht.count_partitions() - assert n == 99, n # all sequences disconnected - - -def test_find_unpart_fail(): - filename = utils.get_test_data('random-20-a.odd.fa') - filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd - - K = 20 # size of kmer - HT_SIZE = 100000 # size of hashtable - N_HT = 3 # number of hashtables - - ht = khmer.Hashbits(K, HT_SIZE, N_HT) - ht.consume_fasta_and_tag(filename) - - subset = ht.do_subset_partition(0, 0) - ht.merge_subset(subset) - - n, _ = ht.count_partitions() - assert n == 49 - - ht.find_unpart(filename2, True, False) - n, _ = ht.count_partitions() - assert n == 49, n # only 49 sequences worth of tags - - -def test_simple_median(): - hi = khmer.Hashbits(6, 1e6, 2) - - (median, average, stddev) = hi.get_median_count("AAAAAA") - print(median, average, stddev) - assert median == 0 - assert average == 0.0 - assert stddev == 0.0 - - hi.consume("AAAAAA") - (median, average, stddev) = hi.get_median_count("AAAAAA") - print(median, average, stddev) - assert median == 1 - assert average == 1.0 - assert stddev == 0.0 - - -def test_badget(): - hbts = khmer.Hashbits(6, 1e6, 1) - - dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG" - - hbts.consume(dna) - - assert hbts.get("AGCTTT") == 1 - - assert hbts.get("GATGAG") == 0 - - try: - hbts.get("AGCTT") - assert 0, "this should fail" - except ValueError as err: - print(str(err)) - - -def test_bad_primes(): - try: - countingtable = khmer._Hashbits.__new__( - khmer._Hashbits, 6, ["a", "b", "c"]) - assert 0, "this should fail" - except TypeError as e: - print(str(e)) - - -def test_consume_fasta_and_tag_with_badreads_parser(): - presencetable = khmer.Hashbits(6, 1e6, 2) - try: - readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa")) - presencetable.consume_fasta_and_tag_with_reads_parser(readsparser) - assert 0, "this should fail" - except IOError as e: - print(str(e)) - except ValueError as e: - print(str(e)) diff --git a/tests/test_lump.py 
b/tests/test_lump.py index f503bd7dad..c7eeb0dae7 100644 --- a/tests/test_lump.py +++ b/tests/test_lump.py @@ -19,7 +19,7 @@ def test_fakelump_together(): fakelump_fa = utils.get_test_data('fakelump.fa') - ht = khmer.new_hashbits(32, 1e5, 4) + ht = khmer.Hashbits(32, 1e5, 4) ht.consume_fasta_and_tag(fakelump_fa) subset = ht.do_subset_partition(0, 0) @@ -35,7 +35,7 @@ def test_fakelump_stop(): fakelump_fa = utils.get_test_data('fakelump.fa') fakelump_stoptags_txt = utils.get_test_data('fakelump.fa.stoptags.txt') - ht = khmer.new_hashbits(32, 1e5, 4) + ht = khmer.Hashbits(32, 1e5, 4) ht.consume_fasta_and_tag(fakelump_fa) for line in open(fakelump_stoptags_txt): @@ -53,7 +53,7 @@ def test_fakelump_stop(): def test_fakelump_stop2(): fakelump_fa = utils.get_test_data('fakelump.fa') - ht = khmer.new_hashbits(32, 1e5, 4) + ht = khmer.Hashbits(32, 1e5, 4) ht.consume_fasta_and_tag(fakelump_fa) ht.add_stop_tag('GGGGAGGGGTGCAGTTGTGACTTGCTCGAGAG') @@ -71,7 +71,7 @@ def test_fakelump_repartitioning(): fakelump_fa = utils.get_test_data('fakelump.fa') fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo') - ht = khmer.new_hashbits(32, 1e5, 4) + ht = khmer.Hashbits(32, 1e5, 4) ht.consume_fasta_and_tag(fakelump_fa) subset = ht.do_subset_partition(0, 0) @@ -88,7 +88,7 @@ def test_fakelump_repartitioning(): EXCURSION_DISTANCE = 40 EXCURSION_KMER_THRESHOLD = 82 EXCURSION_KMER_COUNT_THRESHOLD = 1 - counting = khmer.new_counting_hash(32, 1e4, 4) + counting = khmer.CountingHash(32, 1e5, 4) ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE, @@ -99,7 +99,7 @@ def test_fakelump_repartitioning(): # ok, now re-do everything with these stop tags, specifically. 
- ht = khmer.new_hashbits(32, 1e5, 4) + ht = khmer.Hashbits(32, 1e5, 4) ht.consume_fasta_and_tag(fakelump_fa) ht.load_stop_tags(fakelump_fa_foo) @@ -107,14 +107,14 @@ def test_fakelump_repartitioning(): ht.merge_subset(subset) (n_partitions, n_singletons) = ht.count_partitions() - assert n_partitions == 3, n_partitions + assert n_partitions == 6, n_partitions def test_fakelump_load_stop_tags_trunc(): fakelump_fa = utils.get_test_data('fakelump.fa') fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo') - ht = khmer.new_hashbits(32, 1e5, 4) + ht = khmer.Hashbits(32, 1e5, 4) ht.consume_fasta_and_tag(fakelump_fa) subset = ht.do_subset_partition(0, 0) @@ -131,7 +131,7 @@ def test_fakelump_load_stop_tags_trunc(): EXCURSION_DISTANCE = 40 EXCURSION_KMER_THRESHOLD = 82 EXCURSION_KMER_COUNT_THRESHOLD = 1 - counting = khmer.new_counting_hash(32, 4, 4) + counting = khmer._CountingHash(32, [5, 7, 11, 13]) ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE, @@ -146,7 +146,7 @@ def test_fakelump_load_stop_tags_trunc(): fp.close() # ok, now try loading these stop tags; should fail. - ht = khmer.new_hashbits(32, 4, 4) + ht = khmer._Hashbits(32, [5, 7, 11, 13]) ht.consume_fasta_and_tag(fakelump_fa) try: @@ -160,7 +160,7 @@ def test_fakelump_load_stop_tags_notexist(): fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo') # ok, now try loading these stop tags; should fail. 
- ht = khmer.new_hashbits(32, 4, 4) + ht = khmer._Hashbits(32, [5, 7, 11, 13]) try: ht.load_stop_tags(fakelump_fa_foo) diff --git a/tests/test_normalize_by_median.py b/tests/test_normalize_by_median.py index fc0cf64b0f..069b08d8ff 100644 --- a/tests/test_normalize_by_median.py +++ b/tests/test_normalize_by_median.py @@ -162,10 +162,10 @@ def test_normalize_by_median_report_fp(): args = ['-C', '1', '-k', '17', '-R', outfile, infile] (status, out, err) = utils.runscript(script, args, in_dir) - assert "fp rate estimated to be 0.626" in err, err + assert "fp rate estimated to be 0.623" in err, err report = open(outfile, 'r') line = report.readline() - assert "100000 25232 0.25232" in line, line + assert "100000 25261 0.25261" in line, line def test_normalize_by_median_unpaired_and_paired(): @@ -183,7 +183,7 @@ def test_normalize_by_median_unpaired_and_paired(): args = ['-C', CUTOFF, '-k', '17', '-u', unpairedfile, '-p', infile] (status, out, err) = utils.runscript(script, args, in_dir) - assert 'Total number of unique k-mers: 4029' in err, err + assert 'Total number of unique k-mers: 4030' in err, err outfile = infile + '.keep' assert os.path.exists(outfile), outfile @@ -420,18 +420,21 @@ def test_normalize_by_median_emptycountingtable(): def test_normalize_by_median_fpr(): - MIN_TABLESIZE_PARAM = 1 + MAX_TABLESIZE_PARAM = 12 infile = utils.get_temp_filename('test-fpr.fq') in_dir = os.path.dirname(infile) shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile) script = 'normalize-by-median.py' - args = ['-f', '-k 17', '-x ' + str(MIN_TABLESIZE_PARAM), infile] + args = ['-f', '-k 17', '-x ' + str(MAX_TABLESIZE_PARAM), infile] (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True) - assert os.path.exists(infile + '.keep') + print(out) + print(err) + + assert os.path.exists(infile + '.keep'), infile assert '** ERROR: the graph structure is too small' in err, err diff --git a/tests/test_read_aligner.py b/tests/test_read_aligner.py index 
f53df5e427..0fa9eeca55 100644 --- a/tests/test_read_aligner.py +++ b/tests/test_read_aligner.py @@ -15,7 +15,7 @@ def eq_(v1, v2): def test_alignnocov(): - ch = khmer.new_counting_hash(10, 1048576, 1) + ch = khmer.CountingHash(10, 1048576, 1) read = "ACCTAGGTTCGACATGTACC" aligner = khmer.ReadAligner(ch, 0, 0) for i in range(20): @@ -29,7 +29,7 @@ def test_alignnocov(): def test_simple_readalign(): - ch = khmer.new_counting_hash(10, 1048576, 1) + ch = khmer.CountingHash(10, 1048576, 1) aligner = khmer.ReadAligner(ch, 2, 0) for i in range(20): ch.consume("AGAGGGAAAGCTAGGTTCGACATGTCCTTGACAGAT") @@ -48,7 +48,7 @@ def test_simple_readalign(): def test_readalign(): - ch = khmer.new_counting_hash(10, 1048576, 1) + ch = khmer.CountingHash(10, 1048576, 1) aligner = khmer.ReadAligner(ch, 1, 0) for i in range(20): ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT") @@ -209,7 +209,7 @@ def test_readalign(): def test_readalign_new(): - ch = khmer.new_counting_hash(32, 1048576, 1) + ch = khmer.CountingHash(32, 1048576, 1) aligner = khmer.ReadAligner(ch, 1, 0) for seq in ht_seqs: ch.consume(seq) diff --git a/tests/test_sandbox_scripts.py b/tests/test_sandbox_scripts.py index d99925c4ff..807f246a2c 100644 --- a/tests/test_sandbox_scripts.py +++ b/tests/test_sandbox_scripts.py @@ -232,3 +232,25 @@ def test_sweep_reads_3(): assert os.path.exists(counts_fn) assert os.path.exists(os.path.join(wdir, 'test.dist.txt')) assert not os.path.exists(os.path.join(wdir, 'test_multi.fa')) + + +def test_collect_reads(): + outfile = utils.get_temp_filename('out.graph') + infile = utils.get_test_data('test-reads.fa') + script = 'collect-reads.py' + args = ['-M', '1e7', outfile, infile] + + status, out, err = utils.runscript(script, args, sandbox=True) + + assert status == 0 + assert os.path.exists(outfile) + + +def test_saturate_by_median(): + infile = utils.get_test_data('test-reads.fa') + script = 'saturate-by-median.py' + args = ['-M', '1e7', infile] + + status, out, err = utils.runscript(script, 
args, sandbox=True) + + assert status == 0 diff --git a/tests/test_script_arguments.py b/tests/test_script_arguments.py index 3369da5555..565d854eb0 100644 --- a/tests/test_script_arguments.py +++ b/tests/test_script_arguments.py @@ -12,9 +12,12 @@ import sys import io +import collections from . import khmer_tst_utils as utils +import argparse import khmer.kfile +from khmer import khmer_args def test_check_space(): @@ -33,9 +36,13 @@ def test_check_space(): def test_check_tablespace(): save_stderr, sys.stderr = sys.stderr, io.StringIO() + + parser = khmer_args.build_counting_args() + args = parser.parse_args(['-M', '1e9']) + try: - khmer.kfile.check_space_for_hashtable( - 1e9, force=False, _testhook_free_space=0) + khmer.kfile.check_space_for_hashtable(args, 'countgraph', force=False, + _testhook_free_space=0) assert 0, "this should fail" except SystemExit as e: print(str(e)) @@ -59,9 +66,13 @@ def test_check_space_force(): def test_check_tablespace_force(): save_stderr, sys.stderr = sys.stderr, io.StringIO() + + parser = khmer_args.build_counting_args() + args = parser.parse_args(['-M', '1e9']) + try: - khmer.kfile.check_space_for_hashtable( - 1e9, force=True, _testhook_free_space=0) + khmer.kfile.check_space_for_hashtable(args, 'countgraph', True, + _testhook_free_space=0) assert True, "this should pass" except SystemExit as e: print(str(e)) @@ -79,3 +90,161 @@ def test_invalid_file_warn(): print(str(e)) finally: sys.stderr = save_stderr + + +FakeArgparseObject = collections.namedtuple('FakeArgs', + ['ksize', 'n_tables', + 'max_tablesize', + 'max_memory_usage']) + + +def test_create_countgraph_1(): + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + countgraph = khmer_args.create_countgraph(args) + assert countgraph.hashsizes() == [2499997L, 2499989L, 2499983L, 2499967L] + assert 
sum(countgraph.hashsizes()) < max_mem, sum(countgraph.hashsizes()) + + +def test_create_countgraph_2(): + # tests overriding ksize by passing into create_nodegraph explicitly. + + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + countgraph = khmer_args.create_countgraph(args, ksize=15) + assert countgraph.ksize() == 15 + + +def test_create_countgraph_3(): + # tests too-big ksize + + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + try: + countgraph = khmer_args.create_countgraph(args, ksize=35) + assert 0, "should not reach this" + except SystemExit as err: + print(str(err)) + + +def test_create_countgraph_4_multiplier(): + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + countgraph = khmer_args.create_countgraph(args, multiplier=2.0) + assert sum(countgraph.hashsizes()) < max_mem / 2.0, \ + sum(countgraph.hashsizes()) + + +def test_create_nodegraph_1(): + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + nodegraph = khmer_args.create_nodegraph(args) + assert nodegraph.hashsizes() == [19999999L, 19999981L, + 19999963L, 19999927L] + + assert sum(nodegraph.hashsizes())/8.0 < max_mem, sum(nodegraph.hashsizes()) + + +def test_create_nodegraph_2(): + # tests overriding ksize by passing into create_nodegraph explicitly. 
+ + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + nodegraph = khmer_args.create_nodegraph(args, ksize=15) + assert nodegraph.ksize() == 15 + + +def test_create_nodegraph_3(): + # tests too-big ksize + + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + try: + nodegraph = khmer_args.create_nodegraph(args, ksize=35) + assert 0, "should not reach this" + except SystemExit as err: + print(str(err)) + + +def test_create_nodegraph_4_multiplier(): + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + nodegraph = khmer_args.create_nodegraph(args, multiplier=2.0) + assert sum(nodegraph.hashsizes())/8.0 < max_mem / 2.0, \ + sum(nodegraph.hashsizes()) + + +def test_report_on_config_bad_hashtype(): + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + try: + khmer_args.report_on_config(args, 'foograph') + assert 0, "the previous statement should raise an exception" + except AssertionError: + raise + except Exception as err: + assert "unknown graph type: foograph" in str(err), str(err) + + +def test_fail_calculate_foograph_size(): + # tests unknown graph type + + ksize = khmer_args.DEFAULT_K + n_tables = khmer_args.DEFAULT_N_TABLES + max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE + max_mem = 1e7 + + args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem) + + try: + nodegraph = khmer_args._calculate_tablesize(args, 'foograph') + 
assert 0, "previous statement should fail" + except AssertionError: + raise + except Exception as err: + assert "unknown graph type: foograph" in str(err), str(err) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index ecd9ff2b4f..6178c959af 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -50,10 +50,26 @@ def test_load_into_counting(): args.extend([outfile, infile]) (status, out, err) = utils.runscript(script, args) - assert 'Total number of unique k-mers: 89' in err, err + assert 'Total number of unique k-mers: 83' in err, err assert os.path.exists(outfile) +def test_load_into_counting_max_memory_usage_parameter(): + script = 'load-into-counting.py' + args = ['-M', '2e3', '-k', '20', '-t'] + + outfile = utils.get_temp_filename('out.ct') + infile = utils.get_test_data('test-abund-read-2.fa') + + args.extend([outfile, infile]) + + (status, out, err) = utils.runscript(script, args) + assert os.path.exists(outfile) + + kh = khmer.load_counting_hash(outfile) + assert sum(kh.hashsizes()) < 3e8 + + def test_load_into_counting_abundance_dist_nobig(): script = 'load-into-counting.py' args = ['-x', '1e3', '-N', '2', '-k', '20', '-t', '-b'] @@ -64,7 +80,7 @@ def test_load_into_counting_abundance_dist_nobig(): args.extend([outfile, infile]) (status, out, err) = utils.runscript(script, args) - assert 'Total number of unique k-mers: 89' in err, err + assert 'Total number of unique k-mers: 83' in err, err assert os.path.exists(outfile) htfile = outfile @@ -180,13 +196,14 @@ def test_load_into_counting_json(): with open(jsonfile) as jsonfh: got_json = json.load(jsonfh) outbase = os.path.basename(outfile) + expected_json = { - "files": [infile], - "ht_name": outbase, - "num_kmers": 95, - "num_reads": 1001, - "fpr": 9.024965705097741e-11, - "mrinfo_version": "0.2.0", + u"files": [infile], + u"ht_name": outbase, + u"num_kmers": 95, + u"num_reads": 1001, + u"fpr": 9.025048735197377e-11, + u"mrinfo_version": "0.2.0", } assert got_json == expected_json, 
got_json @@ -534,7 +551,7 @@ def test_filter_stoptags(): # now, create a file with some stop tags in it -- K = 18 - kh = khmer.new_hashbits(K, 1, 1) + kh = khmer._Hashbits(K, [1]) kh.add_stop_tag('GTTGACGGGGCTCAGGGG') kh.save_stop_tags(stopfile) del kh @@ -565,7 +582,7 @@ def test_filter_stoptags_fq(): # now, create a file with some stop tags in it -- K = 18 - kh = khmer.new_hashbits(K, 1, 1) + kh = khmer._Hashbits(K, [1]) kh.add_stop_tag('GTTGACGGGGCTCAGGGG') kh.save_stop_tags(stopfile) del kh @@ -859,6 +876,30 @@ def test_oxli_build_graph_multithread(): (status, out, err) = utils.runscript(script, args) +def test_load_graph_max_memory_usage_parameter(): + script = 'load-graph.py' + args = ['-M', '2e7', '-k', '20', '-n'] + + outfile = utils.get_temp_filename('out') + infile = utils.get_test_data('random-20-a.fa') + + args.extend([outfile, infile]) + + (status, out, err) = utils.runscript(script, args) + + assert 'Total number of unique k-mers: 3960' in err, err + + ht_file = outfile + '.pt' + assert os.path.exists(ht_file), ht_file + + try: + ht = khmer.load_hashbits(ht_file) + except IOError as err: + assert 0, str(err) + + assert (sum(ht.hashsizes()) / 8.) 
< 2e7, ht.hashsizes() + + def _make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20, do_partition=False, annotate_partitions=False, @@ -1176,13 +1217,13 @@ def test_extract_partitions_header_whitespace(): assert os.path.exists(groupfile) dist = open(distfile).readline() - assert dist.strip() == '1 11957 11957 11957' + assert dist.strip() == '1 11960 11960 11960', dist.strip() parts = [r.name.split('\t')[1] for r in screed.open(partfile, parse_description=False)] assert len(parts) == 13538, len(parts) parts = set(parts) - assert len(parts) == 12601, len(parts) + assert len(parts) == 12602, len(parts) def test_extract_partitions_fq(): @@ -2393,7 +2434,7 @@ def test_count_overlap_invalid_datafile(): htfile = _make_graph(seqfile1, ksize=20) outfile = utils.get_temp_filename('overlap.out', in_dir) script = 'count-overlap.py' - args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000', + args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000', htfile + '.pt', htfile + '.pt', outfile] (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True) if sys.version_info.major == 2: @@ -2412,21 +2453,21 @@ def test_count_overlap(): shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2) htfile = _make_graph(seqfile1, ksize=20) script = 'count-overlap.py' - args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000', + args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000', htfile + '.pt', seqfile2, outfile] (status, out, err) = utils.runscript(script, args, in_dir) assert status == 0 assert os.path.exists(outfile), outfile data = [x.strip() for x in open(outfile)] data = set(data) - assert '# of unique k-mers in dataset2: 759047' in data - assert '# of overlap unique k-mers: 245621' in data + assert '# of unique k-mers in dataset2: 759020' in data, data + assert '# of overlap unique k-mers: 245547' in data assert os.path.exists(curvefile), curvefile data = [x.strip() for x in 
open(curvefile)] data = set(data) - assert '178633 1155' in data - assert '496285 2970' in data - assert '752053 238627' in data + assert '178630 1134' in data, data + assert '496280 2904' in data + assert '752031 238558' in data def test_count_overlap_csv(): @@ -2439,21 +2480,21 @@ def test_count_overlap_csv(): shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2) htfile = _make_graph(seqfile1, ksize=20) script = 'count-overlap.py' - args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', + args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000', '--csv', htfile + '.pt', seqfile2, outfile] (status, out, err) = utils.runscript(script, args, in_dir) assert status == 0 assert os.path.exists(outfile), outfile data = [x.strip() for x in open(outfile)] data = set(data) - assert '# of unique k-mers in dataset2: 759047' in data - assert '# of overlap unique k-mers: 245621' in data + assert '# of unique k-mers in dataset2: 759020' in data + assert '# of overlap unique k-mers: 245547' in data assert os.path.exists(curvefile), curvefile data = [x.strip() for x in open(curvefile)] data = set(data) - assert '178633,1155' in data - assert '496285,2970' in data - assert '752053,238627' in data + assert '178630,1134' in data, data + assert '496280,2904' in data + assert '752031,238558' in data def execute_streaming_diginorm(ifilename): diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py index fc3f470d81..b5a42098fa 100644 --- a/tests/test_subset_graph.py +++ b/tests/test_subset_graph.py @@ -21,7 +21,7 @@ def teardown(): class Test_RandomData(object): def test_3_merge_013(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') @@ -43,7 +43,7 @@ def test_3_merge_013(self): assert n_partitions == 1, n_partitions # combined. 
def test_3_merge_023(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -64,7 +64,7 @@ def test_3_merge_023(self): assert n_partitions == 1, n_partitions # combined. def test_5_merge_046(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph5.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -83,7 +83,7 @@ def test_5_merge_046(self): assert n_partitions == 1, n_partitions # combined. def test_random_20_a_succ(self): - ht = khmer.new_hashbits(20, 4 ** 7 + 1) + ht = khmer.Hashbits(20, 4 ** 7 + 1, 2) filename = utils.get_test_data('random-20-a.fa') outfile = utils.get_temp_filename('out') @@ -102,7 +102,7 @@ def test_random_20_a_succ(self): assert n_partitions == 1, n_partitions def test_random_20_a_succ_II(self): - ht = khmer.new_hashbits(20, 4 ** 7 + 1) + ht = khmer.Hashbits(20, 4 ** 7 + 1, 2) filename = utils.get_test_data('random-20-a.fa') outfile = utils.get_temp_filename('out') @@ -121,7 +121,7 @@ def test_random_20_a_succ_II(self): assert n_partitions == 1, n_partitions def test_random_20_a_succ_III(self): - ht = khmer.new_hashbits(20, 4 ** 7 + 1) + ht = khmer.Hashbits(20, 4 ** 7 + 1, 2) filename = utils.get_test_data('random-20-a.fa') outfile = utils.get_temp_filename('out') @@ -144,7 +144,7 @@ def test_random_20_a_succ_III(self): assert n_partitions == 1, n_partitions def test_random_20_a_succ_IV(self): - ht = khmer.new_hashbits(20, 4 ** 7 + 1) + ht = khmer.Hashbits(20, 4 ** 7 + 1, 2) filename = utils.get_test_data('random-20-a.fa') outfile = utils.get_temp_filename('out') @@ -164,7 +164,7 @@ def test_random_20_a_succ_IV(self): assert n_partitions == 1, n_partitions def test_random_20_a_succ_IV_save(self): - ht = khmer.new_hashbits(20, 4 ** 7 + 1) + ht = khmer.Hashbits(20, 4 ** 7 + 1, 2) filename = 
utils.get_test_data('random-20-a.fa') savefile_ht = utils.get_temp_filename('ht') @@ -177,7 +177,7 @@ def test_random_20_a_succ_IV_save(self): ht.save_tagset(savefile_tags) del ht - ht = khmer.new_hashbits(20, 4 ** 7 + 1) + ht = khmer.Hashbits(20, 4 ** 7 + 1, 2) ht.load(savefile_ht) ht.load_tagset(savefile_tags) @@ -200,7 +200,7 @@ def test_random_20_a_succ_IV_save(self): class Test_SaveLoadPmap(object): def test_save_load_merge(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -233,7 +233,7 @@ def test_save_load_merge(self): assert n_partitions == 1, n_partitions # combined. def test_save_load_merge_truncate(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -270,7 +270,7 @@ def test_save_load_merge_truncate(self): print(str(err), i) def test_save_load_merge_2(self): - ht = khmer.new_hashbits(20, 4 ** 8 + 1) + ht = khmer.Hashbits(20, 4 ** 8 + 1, 2) filename = utils.get_test_data('random-20-a.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -302,7 +302,7 @@ def test_save_load_merge_2(self): assert n_partitions == 1, n_partitions # combined. 
def test_save_load_merge_nexist(self): - ht = khmer.new_hashbits(20, 1) + ht = khmer._Hashbits(20, [1]) try: a = ht.load_subset_partitionmap('this does not exist') assert 0, "this should not succeed" @@ -310,7 +310,7 @@ def test_save_load_merge_nexist(self): print(str(e)) def test_save_merge_from_disk(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -339,7 +339,7 @@ def test_save_merge_from_disk(self): assert n_partitions == 1, n_partitions # combined. def test_save_merge_from_disk_2(self): - ht = khmer.new_hashbits(20, 4 ** 7 + 1) + ht = khmer.Hashbits(20, 4 ** 7 + 1, 2) filename = utils.get_test_data('random-20-a.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -368,7 +368,7 @@ def test_save_merge_from_disk_2(self): assert n_partitions == 1, n_partitions # combined. def test_save_merge_from_disk_file_not_exist(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -389,7 +389,7 @@ def test_save_merge_from_disk_file_not_exist(self): print(str(e)) def test_merge_from_disk_file_bad_type(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) infile = utils.get_test_data('goodversion-k12.ht') try: @@ -399,7 +399,7 @@ def test_merge_from_disk_file_bad_type(self): print(str(e)) def test_merge_from_disk_file_version(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) infile = utils.get_test_data('badversion-k12.ht') try: @@ -409,7 +409,7 @@ def test_merge_from_disk_file_version(self): print(str(e)) def test_save_merge_from_disk_ksize(self): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') 
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -424,7 +424,7 @@ def test_save_merge_from_disk_ksize(self): ht.save_subset_partitionmap(x, outfile1) del x - ht = khmer.new_hashbits(19, 1, 1) + ht = khmer._Hashbits(19, [1]) try: ht.merge_subset_from_disk(outfile1) assert 0, "this should fail" @@ -433,7 +433,7 @@ def test_save_merge_from_disk_ksize(self): def test_save_load_merge_on_graph(): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -466,7 +466,7 @@ def test_save_load_merge_on_graph(): def test_save_load_on_graph_truncate(): - ht = khmer.new_hashbits(20, 4 ** 4 + 1) + ht = khmer.Hashbits(20, 4 ** 4 + 1, 2) filename = utils.get_test_data('test-graph2.fa') (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename) @@ -506,7 +506,7 @@ def test_save_load_on_graph_truncate(): def test_output_partitions(): filename = utils.get_test_data('test-output-partitions.fa') - ht = khmer.new_hashbits(10, 1, 1) + ht = khmer._Hashbits(10, [1]) ht.set_partition_id('TTAGGACTGC', 2) ht.set_partition_id('TGCGTTTCAA', 3) ht.set_partition_id('ATACTGTAAA', 4) @@ -531,7 +531,7 @@ def test_output_partitions(): def test_tiny_real_partitions(): filename = utils.get_test_data('real-partition-tiny.fa') - ht = khmer.new_hashbits(32, 8e1, 4) + ht = khmer.Hashbits(32, 8e2, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -558,7 +558,7 @@ def test_tiny_real_partitions(): def test_small_real_partitions(): filename = utils.get_test_data('real-partition-small.fa') - ht = khmer.new_hashbits(32, 2e2, 4) + ht = khmer.Hashbits(32, 2e3, 4) ht.consume_fasta_and_tag(filename) subset = ht.do_subset_partition(0, 0) @@ -600,7 +600,7 @@ def test_small_real_partitions(): def test_partition_on_abundance_1(): print((a,)) print((b,)) - kh = khmer.new_counting_hash(20, 1e3, 4) + kh = khmer.CountingHash(20, 1e3, 
4) for i in range(10): print(kh.consume_and_tag(a)) @@ -614,7 +614,7 @@ def test_partition_on_abundance_1(): def test_partition_on_abundance_2(): - kh = khmer.new_counting_hash(20, 1e3, 4) + kh = khmer.CountingHash(20, 1e3, 4) for i in range(10): print(kh.consume_and_tag(a)) @@ -628,7 +628,7 @@ def test_partition_on_abundance_2(): def test_partition_on_abundance_3(): - kh = khmer.new_counting_hash(20, 1e4, 4) + kh = khmer.CountingHash(20, 1e4, 4) for i in range(10): print(kh.consume_and_tag(a)) @@ -647,7 +647,7 @@ def test_partition_on_abundance_3(): def test_partition_overlap_1(): - kh = khmer.new_counting_hash(20, 1e3, 4) + kh = khmer.CountingHash(20, 1e3, 4) for i in range(10): kh.consume_and_tag(a) @@ -668,7 +668,7 @@ def test_partition_overlap_1(): def test_partition_overlap_2(): - kh = khmer.new_counting_hash(20, 1e4, 4) + kh = khmer.CountingHash(20, 1e4, 4) for i in range(10): kh.consume_and_tag(a)