Fix infinite loop in CountingHash gzipped IO #1043

Merged · 6 commits · Jun 14, 2015
6 changes: 6 additions & 0 deletions ChangeLog
@@ -1,3 +1,9 @@
+2015-06-14  Kevin Murray  <spam@kdmurray.id.au>
+
+	* lib/counting.cc: Fix infinite loop in gzipped CountingHash I/O
+	* tests/test_counting_hash.py: Add test of large CountingHash I/O
+	* setup.cfg: Skip tests with the 'huge' label by default
+
 2015-06-13  Michael R. Crusoe  <crusoe@ucdavis.edu>
 
 	* Makefile, build-jenkins.sh: unify sphinx dependencies
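The infinite loop named in the first entry came from the old single-shot write in lib/counting.cc: `(int) (save_tablesize - written)` truncates once a table grows past INT_MAX bytes, gzwrite() returns 0 for a length it cannot represent in its int return value, and `written += 0` never advances. A minimal sketch of the truncation, using a hypothetical 4 GiB table size (the exact truncated value is implementation-defined before C++20, but is 0 on common two's-complement platforms):

```cpp
#include <climits>
#include <iostream>

int main()
{
    // Hypothetical table size of 4 GiB, beyond what a 32-bit int can hold.
    unsigned long long save_tablesize = 4ULL * 1024 * 1024 * 1024;
    unsigned long long written = 0;

    // The old code handed this cast to gzwrite() as the chunk length:
    int requested = (int) (save_tablesize - written);
    std::cout << "requested = " << requested << "\n"; // prints 0 here

    // gzwrite() returns 0 for a length it cannot satisfy, so the old
    // `written += gzwrite(...)` loop added 0 forever and never terminated.
    return 0;
}
```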
2 changes: 1 addition & 1 deletion Makefile
@@ -22,7 +22,7 @@ UNAME := $(shell uname)
 ifeq ($(UNAME),Linux)
 	TESTATTR='!known_failing,!jenkins'
 else
-	TESTATTR='!known_failing,!jenkins,!linux'
+	TESTATTR='!known_failing,!jenkins,!huge'
 endif
 
 ## all         : default task; compile C++ code, build shared object library
45 changes: 41 additions & 4 deletions lib/counting.cc
@@ -685,8 +685,15 @@ CountingHashGzFileReader::CountingHashGzFileReader(
 
         HashIntoType loaded = 0;
         while (loaded != tablesize) {
-            read_b = gzread(infile, (char *) ht._counts[i],
-                            (unsigned) (tablesize - loaded));
+            unsigned long long to_read_ll = tablesize - loaded;
+            unsigned int to_read_int;
+            // Zlib can only read chunks of at most INT_MAX bytes.
+            if (to_read_ll > INT_MAX) {
+                to_read_int = INT_MAX;
+            } else {
+                to_read_int = to_read_ll;
+            }
+            read_b = gzread(infile, (char *) ht._counts[i], to_read_int);
 
             if (read_b <= 0) {
                 std::string gzerr = gzerror(infile, &read_b);
@@ -845,8 +852,38 @@ CountingHashGzFileWriter::CountingHashGzFileWriter(
                 sizeof(save_tablesize));
         unsigned long long written = 0;
         while (written != save_tablesize) {
-            written += gzwrite(outfile, (const char *) ht._counts[i],
-                               (int) (save_tablesize - written));
+            unsigned long long to_write_ll = save_tablesize - written;
+            unsigned int to_write_int;
+            int gz_result;
+            // Zlib can only write chunks of at most INT_MAX bytes.
+            if (to_write_ll > INT_MAX) {
+                to_write_int = INT_MAX;
+            } else {
+                to_write_int = to_write_ll;
+            }
+            gz_result = gzwrite(outfile, (const char *) ht._counts[i],
+                                to_write_int);
+            // Zlib returns 0 on error
+            if (gz_result == 0) {
+                int errcode = 0;
+                const char *err_msg;
+                std::ostringstream msg;
+
+                msg << "gzwrite failed while writing counting hash: ";
+                // Get zlib error
+                err_msg = gzerror(outfile, &errcode);
+                if (errcode != Z_ERRNO) {
+                    // Zlib error, not stdlib
+                    msg << err_msg;
+                    gzclearerr(outfile);
+                } else {
+                    // stdlib error
+                    msg << strerror(errno);
+                }
+                gzclose(outfile);
+                throw khmer_file_exception(msg.str().c_str());
+            }
+            written += gz_result;
         }
     }

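Both hunks above apply the same chunked-I/O pattern: clamp each request to INT_MAX, loop until the whole table has been transferred, and treat a zero return from zlib as an error instead of silently retrying. A standalone sketch of the write side, assuming only zlib (the helper name and std::runtime_error are stand-ins, not khmer's khmer_file_exception machinery):

```cpp
#include <zlib.h>
#include <cerrno>
#include <climits>
#include <cstring>
#include <stdexcept>
#include <string>

// Write `len` bytes to a gzip stream in chunks of at most INT_MAX bytes,
// since gzwrite() cannot represent a larger count in its int return value.
static void gzwrite_all(gzFile out, const char *buf, unsigned long long len)
{
    unsigned long long written = 0;
    while (written < len) {
        unsigned long long remaining = len - written;
        unsigned int chunk =
            remaining > INT_MAX ? INT_MAX : (unsigned int) remaining;

        int n = gzwrite(out, buf + written, chunk);
        if (n == 0) { // gzwrite() signals failure by returning 0
            int errcode = 0;
            const char *err_msg = gzerror(out, &errcode);
            std::string msg = "gzwrite failed: ";
            // Z_ERRNO means the underlying stdio call failed; consult errno.
            msg += (errcode == Z_ERRNO) ? strerror(errno) : err_msg;
            throw std::runtime_error(msg);
        }
        written += n; // accumulate the actual count, as the patched loop does
    }
}
```

The sketch advances `buf + written` on each pass so a retry resumes where the previous chunk stopped, and accumulating the return value rather than the requested size keeps the loop honest about how much zlib actually consumed.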
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,7 +1,7 @@
 [nosetests]
 verbosity = 2
 stop = TRUE
-attr = !known_failing,!jenkins,!linux
+attr = !known_failing,!jenkins,!huge
 #processes = -1 # breaks xunit output
 
 [build_ext]
25 changes: 24 additions & 1 deletion tests/test_counting_hash.py
@@ -142,7 +142,7 @@ def test_get_raw_tables_view():
     assert sum(tab.tolist()) == 1
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         ct = khmer.new_counting_hash(30, 1e13, 1)
@@ -568,6 +568,29 @@ def test_get_kmers():
     assert kmers == ["AAAAAA", "AAAAAT"]
 
 
+@attr("huge")
+def test_save_load_large():
+    def do_test(ctfile):
+        inpath = utils.get_test_data('random-20-a.fa')
+        savepath = utils.get_temp_filename(ctfile)
+
+        sizes = khmer.get_n_primes_above_x(1, 2**31)
+
+        orig = khmer.CountingHash(12, sizes)
+        orig.consume_fasta(inpath)
+        orig.save(savepath)
+
+        loaded = khmer.load_counting_hash(savepath)
+
+        orig_count = orig.n_occupied()
+        loaded_count = loaded.n_occupied()
+        assert orig_count == 3966, orig_count
+        assert loaded_count == orig_count, loaded_count
+
+    for ctfile in ['temp.ct.gz', 'temp.ct']:
+        do_test(ctfile)
+
+
 def test_save_load():
     inpath = utils.get_test_data('random-20-a.fa')
     savepath = utils.get_temp_filename('tempcountingsave0.ht')
2 changes: 1 addition & 1 deletion tests/test_counting_single.py
@@ -26,7 +26,7 @@ def test_no_collision():
     assert kh.get('TTTT') == 2
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         ct = khmer.new_hashtable(4, 1000000000000)
2 changes: 1 addition & 1 deletion tests/test_hashbits_obj.py
@@ -27,7 +27,7 @@ def teardown():
     utils.cleanup()
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         pt = khmer.Hashbits(32, 1e13, 1)
2 changes: 1 addition & 1 deletion tests/test_labelhash.py
@@ -25,7 +25,7 @@ def teardown():
 # * thread-safety
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         lh = LabelHash(20, 1e13, 1)
2 changes: 1 addition & 1 deletion tests/test_scripts.py
@@ -72,7 +72,7 @@ def test_load_into_counting_nonwritable():
     assert status == 1, status
 
 
-@attr('linux')
+@attr('huge')
 def test_load_into_counting_toobig():
     script = 'load-into-counting.py'
     args = ['-x', '1e12', '-N', '2', '-k', '20', '-t', '--force']