dib-lab · mr-c · Jun 27, 2015 · May 31, 2015 · May 31, 2015 · May 31, 2015
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,38 @@
+2015-06-19  Titus Brown  <titus@idyll.org>
+
+   * khmer/__init__.py: split CountingHash into _CountingHash (CPython) and
+   CountingHash to mimic Hashbits behavior; pass IOError through
+   extract_countinghash_info and extract_hashbits_info so that
+   file-does-not-exist errors are correctly reported; fixed FP rate reporting;
+   changed to using get_n_primes_near_x to build hashtable sizes; removed
+   get_n_primes_above_x, new_hashbits, and new_counting_hash functions.
+   * khmer/_khmer.cc: changed tp_flags for KCountingHash so that it could
+   be a base class.
+   * khmer/khmer_args.py: removed environment variable override for hash size
+   defaults; added -M/--max-memory-usage, and functions create_nodegraph()
+   and create_countgraph().  Also renamed --min-tablesize to --max-tablesize.
+   * khmer/kfile.py: fixed check_space_for_hashtable to depend on args obj.
+   * oxli/build_graph.py, scripts/{annotate-partitions.py,count-overlap.py,
+   do-partition.py,filter-stoptags.py,
+   merge-partitions.py}, sandbox/{assembly-diff.py,assembly-diff-2.py,
+   bloom-count-intersection.py,bloom-count.py,build-sparse-graph.py,
+   collect-reads.py,saturate-by-median.py, graph-size.py,print-stoptags.py,
+   print-tagset.py,stoptags-by-position.py, subset-report.py,
+   sweep-out-reads-with-contigs.py,sweep-reads2.py,sweep-reads3.py}: changed
+   hashtype over to 'nodegraph' and 'countgraph' in call to report_on_config;
+   replaced counting hash/hashbits creation with new khmer_args create*
+   functions, and/or new_counting_hash/new_hashbits with CountingHash/Hashbits.
+   * doc/scripts.rst: updated hashtable size help text.
+   * doc/whats-new-2.0.rst: updated with description of -M/--max-memory-usage.
+   * tests/test*.py: switched from new_counting_hash to CountingHash, and
+   new_hashbits to Hashbits; adjusted tests for new behavior of hashtable
+   size calculation.
+   * tests/test_hashbits_obj.py: merged into test_hashbits.py and removed file.
+   * tests/test_script_arguments.py: updated for new check_space_for_hashtable
+   behavior; added tests for create_countgraph and create_nodegraph.
+   * tests/test_counting_single.py: fixed countgraph size & palindrome testing
+   behavior in test_complete_no_collision.
+
 2015-06-19  Titus Brown  <titus@idyll.org>
 
    * Makefile: temporarily disable 'huge' tests on Linux.

diff --git a/doc/user/choosing-table-sizes.rst b/doc/user/choosing-table-sizes.rst
@@ -1,53 +1,55 @@
 .. vim: set filetype=rst
 
-==============================
-Choosing table sizes for khmer
-==============================
+==========================
+Setting khmer memory usage
+==========================
 
 If you look at the documentation for the scripts (:doc:`scripts`) you'll
-see two mysterious parameters -- :option:`-N` and :option:`-x`, or, more
-verbosely, :option:`-n_tables` and :option:`--tablesize`.  What are these, and
-how do you specify them?
+see a :option:`-M` parameter that sets the maximum memory usage for
+any script that uses k-mer counting tables or k-mer graphs.  What is this?
+
+khmer uses a special data structure that lets it store counting tables
+and k-mer graphs in very low memory; the trick is that you must fix
+the amount of memory khmer can use before running it. (See `Pell et
+al., 2012 <http://www.ncbi.nlm.nih.gov/pubmed/22847406>`__ and `Zhang
+et al., 2014 <http://www.ncbi.nlm.nih.gov/pubmed/25062443>`__ for the
+details.)  This is what the :option:`-M` parameter does.
+
+If you set it too low, khmer will warn you at the end of the run to set
+it higher.  See below for some good choices for various kinds of data.
+
+**Note for khmer 1.x users:** as of khmer 2.0, the :option:`-M`
+parameter sets the :option:`-N`/:option:`--n_tables` and
+:option:`-x`/:option:`--max-tablesize` parameters automatically.
+You can still set these parameters directly if you wish.
 
 The really short version
 ========================
 
 There is no way (except for experience, rules of thumb, and intuition) to
-know what these parameters should be up front.  So, make the product of
-these two parameters be the size of your available memory::
+know what this parameter should be up front.  So, use the maximum
+available memory::
 
-  -N 4 -x 4e9
+  -M 16e9
 
-for a machine with 16 GB of free memory, for example.  Also see
-the rules of thumb, below.
+for a machine with 16 GB of free memory, for example.
 
 The short version
 =================
 
-These parameters specify the maximum memory usage of the primary data
+This parameter specifies the maximum memory usage of the primary data
 structure in khmer, which is basically N big hash tables of size x.
 The **product** of the number of hash tables and the size of the hash
-tables specifies the total amount of memory used.
+tables specifies the total amount of memory used, which is what the
+:option:`-M` parameter sets.
 
-This table is used to track k-mers.  If it is too small, khmer
-will fail in various ways (and should complain), but there is no harm
+These tables are used to track k-mers.  If they are too small, khmer
+will fail in various ways (and will complain), but there is no harm
 in making it too large. So, **the absolute safest thing to do is to
 specify as much memory as is available**.  Most scripts will inform
 you of the total memory usage, and (at the end) will complain if it's
 too small.
 
-For normalize-by-median, khmer uses one byte per hash entry, so: if
-you had 16 GB of available RAM, you should specify something like ``-N
-4 -x 4e9``, which multiplies out to about 16 GB.
-
-For the graph partitioning stuff, khmer uses only 1 bit per k-mer, so
-you can multiple your available memory by 8: for 16 GB of RAM, you could
-use ::
-
-   -N 4 -x 32e9
-
-which multiplies out to 128 Gbits of RAM, or 16 Gbytes.
-
 Life is a bit more complicated than this, however, because some scripts --
 load-into-counting and load-graph -- keep ancillary information that will
 consume memory beyond this table data structure.  So if you run out of
@@ -124,26 +126,24 @@ an error-code.
 Rules of thumb
 --------------
 
-Just use -N 4, always, and vary the -x parameter.
-
 For digital normalization, we recommend:
 
- - ``-x 2e9`` for any amount of sequencing for a single microbial genome,
+ - ``-M 8e9`` for any amount of sequencing for a single microbial genome,
    MDA-amplified or single colony.
 
- - ``-x 4e9`` for up to a billion mRNAseq reads from any organism.  Past that,
+ - ``-M 16e9`` for up to a billion mRNAseq reads from any organism.  Past that,
    increase it.
 
- - ``-x 8e9`` for most eukaryotic genome samples.
+ - ``-M 32e9`` for most eukaryotic genome samples.
 
- - ``-x 8e9`` will also handle most "simple" metagenomic samples (HMP on down)
+ - ``-M 32e9`` will also handle most "simple" metagenomic samples (HMP on down)
 
  - For metagenomic samples that are more complex, such as soil or marine,
-   start as high as possible.  For example, we are using ``-x 64e9`` for
+   start as high as possible.  For example, we are using ``-M 256e9`` for
    ~300 Gbp of soil reads.
 
 For partitioning of complex metagenome samples, we recommend starting
 as high as you can -- something like half your system memory.  So if
-you have 256 GB of RAM, use ``-N 4 -x 256e9`` which will use 4 x 256 /
-8 = 128 GB of RAM for the basic graph storage, leaving other memory
-for the ancillary data structures.
+you have 256 GB of RAM, use ``-M 128e9`` which will use 128 GB of RAM
+for the basic graph storage, leaving other memory for the ancillary
+data structures.
diff --git a/doc/user/scripts.rst b/doc/user/scripts.rst
@@ -10,14 +10,10 @@ distribution.  Below is our documentation for these scripts.  Note
 that all scripts can be given :option:`-h` which will print out
 a list of arguments taken by that script.
 
-Many scripts take :option:`-x` and :option:`-N` parameters, which drive khmer's
-memory usage. These parameters depend on details of your data set; for more information
-on how to choose them, see :doc:`choosing-table-sizes`.
-
-You can also override the default values of :option:`--ksize`/:option:`-k`,
-:option:`--n_tables`/:option:`-N`, and :option:`--min-tablesize`/:option:`-x` with
-the environment variables `KHMER_KSIZE`, `KHMER_N_TABLES`, and
-`KHMER_MIN_TABLESIZE` respectively.
+Scripts that use k-mer counting tables or k-mer graphs take an
+:option:`-M` parameter, which sets the maximum memory usage in bytes.
+This should generally be set as high as possible; see
+:doc:`choosing-table-sizes` for more information.
 
 1. :ref:`scripts-counting`
 2. :ref:`scripts-partitioning`

diff --git a/doc/whats-new-2.0.rst b/doc/whats-new-2.0.rst
@@ -3,10 +3,27 @@
 What's New In khmer 2.0?
 ########################
 
-All binary khmer formats (presence tables, counting tables, tag sets, stop tags,
-and partition subsets) have changed. Files are
-now pre-pended with the string ``OXLI`` to indicate that they are from this
-project.
+Incompatible changes
+====================
+
+New parameter to set table size and number of tables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There is now a :option:`-M`/:option:`--max-memory-usage` parameter
+that sets the number of tables (:option:`-N`/:option:`--n_tables`)
+and tablesize (:option:`-x`/:option:`--max-tablesize`) parameters
+automatically to match the desired memory usage.
+
+(:option:`--min-tablesize` was also renamed to
+:option:`--max-tablesize` to reflect this more desirable behavior.)
+
+Binary file formats have changed!
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All binary khmer formats (presence tables, counting tables, tag sets,
+stop tags, and partition subsets) have changed. Files are now
+pre-pended with the string ``OXLI`` to indicate that they are from
+this project.
 
 Files of the above types made in previous versions of khmer are not compatible
 with v2.0; the reverse is also true.
diff --git a/khmer/__init__.py b/khmer/__init__.py
@@ -8,7 +8,7 @@
 
 from __future__ import print_function
 
-from khmer._khmer import CountingHash
+from khmer._khmer import CountingHash as _CountingHash
 from khmer._khmer import LabelHash as _LabelHash
 from khmer._khmer import Hashbits as _Hashbits
 from khmer._khmer import HLLCounter as _HLLCounter
@@ -45,36 +45,6 @@
 del get_versions
 
 
-def new_hashbits(k, starting_size, n_tables=2):
-    """Return a new hashbits object. Deprecated.
-
-    This factory method is deprecated in favor of creating a Hashbits object
-    directly via 'new Hashbits(...)'.
-
-    Keyword argument:
-    k -- kmer size to use
-    starting_size -- lower bound on hashsize to use
-    n_tables -- number of hash tables to use (default = 2)
-    """
-    primes = get_n_primes_above_x(n_tables, starting_size)
-
-    return _Hashbits(k, primes)
-
-
-def new_counting_hash(k, starting_size, n_tables=2):
-    """Return a new countinghash object.
-
-    Keyword arguments:
-    k -- kmer size to use
-    starting_size -- lower bound on hashsize to use
-    n_tables -- number of hash tables to use (default = 2)
-    n_threads  -- number of simultaneous threads to execute (default = 1)
-    """
-    primes = get_n_primes_above_x(n_tables, starting_size)
-
-    return CountingHash(k, primes)
-
-
 def load_hashbits(filename):
     """Load a hashbits object from the given filename and return it.
 
@@ -93,7 +63,7 @@ def load_counting_hash(filename):
     Keyword argument:
     filename -- the name of the counting_hash file
     """
-    hashtable = CountingHash(1, [1])
+    hashtable = _CountingHash(1, [1])
     hashtable.load(filename)
 
     return hashtable
@@ -192,13 +162,19 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2):
 
     if fp_all > max_false_pos:
         print("**", file=sys.stderr)
-        print(
-            "** ERROR: the graph structure is too small for ", file=sys.stderr)
-        print(
-            "this data set.  Increase k-mer presence table ", file=sys.stderr)
-        print("size/num of tables.", file=sys.stderr)
+        print("** ERROR: the graph structure is too small for ",
+              file=sys.stderr)
+        print("** this data set.  Increase data structure size",
+              file=sys.stderr)
+        print("** with --max_memory_usage/-M.", file=sys.stderr)
+        print("**", file=sys.stderr)
         print("** Do not use these results!!", file=sys.stderr)
         print("**", file=sys.stderr)
+        print("** (estimated false positive rate of %.3f;" % fp_all,
+              file=sys.stderr)
+        print("max allowable %.3f" % max_false_pos, file=sys.stderr)
+        print("**", file=sys.stderr)
+
         if not force:
             sys.exit(1)
 
@@ -229,6 +205,9 @@ def get_n_primes_near_x(number, target):
     number -- the number of primes to find
     target -- the number to step backwards from
     """
+    if target == 1 and number == 1:
+        return [1]
+
     primes = []
     i = target - 1
     if i % 2 == 0:
@@ -237,27 +216,11 @@ def get_n_primes_near_x(number, target):
         if is_prime(i):
             primes.append(i)
         i -= 2
-    return primes
 
+    if len(primes) != number:
+        raise Exception("unable to find %d prime numbers < %d" % (number,
+                                                                  target))
 
-def get_n_primes_above_x(number, target):
-    """Forward-find primes smaller than target.
-
-    Step forwards until a number of primes (other than 2) have been
-    found that are smaller than the target and return them.
-
-    Keyword arguments:
-    number -- the number of primes to find
-    target -- the number to step forwards from
-    """
-    primes = []
-    i = target + 1
-    if i % 2 == 0:
-        i += 1
-    while len(primes) != number and i > 0:
-        if is_prime(i):
-            primes.append(i)
-        i += 2
     return primes
 
 
@@ -267,6 +230,15 @@ def get_n_primes_above_x(number, target):
 # Additional functionality can be added to these classes as appropriate.
 
 
+class CountingHash(_CountingHash):
+
+    def __new__(cls, k, starting_size, n_tables):
+        primes = get_n_primes_near_x(n_tables, starting_size)
+        c = _CountingHash.__new__(cls, k, primes)
+        c.primes = primes
+        return c
+
+
 class LabelHash(_LabelHash):
 
     def __new__(cls, k, starting_size, n_tables):
@@ -279,8 +251,8 @@ def __new__(cls, k, starting_size, n_tables):
 class CountingLabelHash(_LabelHash):
 
     def __new__(cls, k, starting_size, n_tables):
-        primes = get_n_primes_above_x(n_tables, starting_size)
-        hb = CountingHash(k, primes)
+        primes = get_n_primes_near_x(n_tables, starting_size)
+        hb = _CountingHash(k, primes)
         c = _LabelHash.__new__(cls, hb)
         c.graph = hb
         return c
@@ -289,7 +261,7 @@ def __new__(cls, k, starting_size, n_tables):
 class Hashbits(_Hashbits):
 
     def __new__(cls, k, starting_size, n_tables):
-        primes = get_n_primes_above_x(n_tables, starting_size)
+        primes = get_n_primes_near_x(n_tables, starting_size)
         c = _Hashbits.__new__(cls, k, primes)
         c.primes = primes
         return c

diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc
@@ -3103,7 +3103,7 @@ static PyTypeObject khmer_KCountingHash_Type
 CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object")
 = {
     PyVarObject_HEAD_INIT(NULL, 0)       /* init & ob_size */
-    "_khmer.KCountingHash",              /*tp_name*/
+    "_khmer.CountingHash",              /*tp_name*/
     sizeof(khmer_KCountingHash_Object),  /*tp_basicsize*/
     0,                                   /*tp_itemsize*/
     (destructor)khmer_counting_dealloc,  /*tp_dealloc*/
@@ -3121,7 +3121,7 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object")
     0,                                   /*tp_getattro*/
     0,                                   /*tp_setattro*/
     0,                                   /*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT,                  /*tp_flags*/
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,                  /*tp_flags*/
     "counting hash object",              /* tp_doc */
     0,                                   /* tp_traverse */
     0,                                   /* tp_clear */