Fix infinite loop in CountingHash gzipped IO #1043

Merged · 6 commits · Jun 14, 2015
6 changes: 6 additions & 0 deletions ChangeLog
@@ -1,3 +1,9 @@
+2015-06-14  Kevin Murray  <spam@kdmurray.id.au>
+
+	* lib/counting.cc: Fix infinite loop in gzipped CountingHash I/O
+	* tests/test_counting_hash.py: Add test of large CountingHash I/O
+	* setup.cfg: Skip tests with the 'huge' label by default
+
 2015-06-13  Michael R. Crusoe  <crusoe@ucdavis.edu>
 
 	* Makefile, build-jenkins.sh: unify sphinx dependencies
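The infinite loop named in the first entry came from the old single-shot write in lib/counting.cc: `(int) (save_tablesize - written)` truncates once a table grows past INT_MAX bytes, gzwrite() returns 0 for a length it cannot represent in its int return value, and `written += 0` never advances. A minimal sketch of the truncation, using a hypothetical 4 GiB table size (the exact truncated value is implementation-defined before C++20, but is 0 on common two's-complement platforms):

```cpp
#include <climits>
#include <iostream>

int main()
{
    // Hypothetical table size of 4 GiB, beyond what a 32-bit int can hold.
    unsigned long long save_tablesize = 4ULL * 1024 * 1024 * 1024;
    unsigned long long written = 0;

    // The old code handed this cast to gzwrite() as the chunk length:
    int requested = (int) (save_tablesize - written);
    std::cout << "requested = " << requested << "\n"; // prints 0 here

    // gzwrite() returns 0 for a length it cannot satisfy, so the old
    // `written += gzwrite(...)` loop added 0 forever and never terminated.
    return 0;
}
```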
2 changes: 1 addition & 1 deletion Makefile
@@ -22,7 +22,7 @@ UNAME := $(shell uname)
 ifeq ($(UNAME),Linux)
 	TESTATTR='!known_failing,!jenkins'
 else
-	TESTATTR='!known_failing,!jenkins,!linux'
+	TESTATTR='!known_failing,!jenkins,!huge'
 endif
 
 ## all         : default task; compile C++ code, build shared object library
45 changes: 41 additions & 4 deletions lib/counting.cc
@@ -685,8 +685,15 @@ CountingHashGzFileReader::CountingHashGzFileReader(
 
         HashIntoType loaded = 0;
         while (loaded != tablesize) {
-            read_b = gzread(infile, (char *) ht._counts[i],
-                            (unsigned) (tablesize - loaded));
+            unsigned long long to_read_ll = tablesize - loaded;
+            unsigned int to_read_int;
+            // Zlib can only read chunks of at most INT_MAX bytes.
+            if (to_read_ll > INT_MAX) {
+                to_read_int = INT_MAX;
+            } else {
+                to_read_int = to_read_ll;
+            }
+            read_b = gzread(infile, (char *) ht._counts[i], to_read_int);
 
             if (read_b <= 0) {
                 std::string gzerr = gzerror(infile, &read_b);
@@ -845,8 +852,38 @@ CountingHashGzFileWriter::CountingHashGzFileWriter(
                 sizeof(save_tablesize));
         unsigned long long written = 0;
         while (written != save_tablesize) {
-            written += gzwrite(outfile, (const char *) ht._counts[i],
-                               (int) (save_tablesize - written));
+            unsigned long long to_write_ll = save_tablesize - written;
+            unsigned int to_write_int;
+            int gz_result;
+            // Zlib can only write chunks of at most INT_MAX bytes.
+            if (to_write_ll > INT_MAX) {
+                to_write_int = INT_MAX;
+            } else {
+                to_write_int = to_write_ll;
+            }
+            gz_result = gzwrite(outfile, (const char *) ht._counts[i],
+                                to_write_int);
+            // Zlib returns 0 on error
+            if (gz_result == 0) {
+                int errcode = 0;
+                const char *err_msg;
+                std::ostringstream msg;
+
+                msg << "gzwrite failed while writing counting hash: ";
+                // Get zlib error
+                err_msg = gzerror(outfile, &errcode);
+                if (errcode != Z_ERRNO) {
+                    // Zlib error, not stdlib
+                    msg << err_msg;
+                    gzclearerr(outfile);
+                } else {
+                    // stdlib error
+                    msg << strerror(errno);
+                }
+                gzclose(outfile);
+                throw khmer_file_exception(msg.str().c_str());
+            }
+            written += gz_result;
         }
     }

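Both hunks above apply the same chunked-I/O pattern: clamp each request to INT_MAX, loop until the whole table has been transferred, and treat a zero return from zlib as an error instead of silently retrying. A standalone sketch of the write side, assuming only zlib (the helper name and std::runtime_error are stand-ins, not khmer's khmer_file_exception machinery):

```cpp
#include <zlib.h>
#include <cerrno>
#include <climits>
#include <cstring>
#include <stdexcept>
#include <string>

// Write `len` bytes to a gzip stream in chunks of at most INT_MAX bytes,
// since gzwrite() cannot represent a larger count in its int return value.
static void gzwrite_all(gzFile out, const char *buf, unsigned long long len)
{
    unsigned long long written = 0;
    while (written < len) {
        unsigned long long remaining = len - written;
        unsigned int chunk =
            remaining > INT_MAX ? INT_MAX : (unsigned int) remaining;

        int n = gzwrite(out, buf + written, chunk);
        if (n == 0) { // gzwrite() signals failure by returning 0
            int errcode = 0;
            const char *err_msg = gzerror(out, &errcode);
            std::string msg = "gzwrite failed: ";
            // Z_ERRNO means the underlying stdio call failed; consult errno.
            msg += (errcode == Z_ERRNO) ? strerror(errno) : err_msg;
            throw std::runtime_error(msg);
        }
        written += n; // accumulate the actual count, as the patched loop does
    }
}
```

The sketch advances `buf + written` on each pass so a retry resumes where the previous chunk stopped, and accumulating the return value rather than the requested size keeps the loop honest about how much zlib actually consumed.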
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,7 +1,7 @@
 [nosetests]
 verbosity = 2
 stop = TRUE
-attr = !known_failing,!jenkins,!linux
+attr = !known_failing,!jenkins,!huge
 #processes = -1 # breaks xunit output
 
 [build_ext]
25 changes: 24 additions & 1 deletion tests/test_counting_hash.py
@@ -142,7 +142,7 @@ def test_get_raw_tables_view():
     assert sum(tab.tolist()) == 1
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         ct = khmer.new_counting_hash(30, 1e13, 1)
@@ -568,6 +568,29 @@ def test_get_kmers():
     assert kmers == ["AAAAAA", "AAAAAT"]
 
 
+@attr("huge")
+def test_save_load_large():
+    def do_test(ctfile):
+        inpath = utils.get_test_data('random-20-a.fa')
+        savepath = utils.get_temp_filename(ctfile)
+
+        sizes = khmer.get_n_primes_above_x(1, 2**31)
+
+        orig = khmer.CountingHash(12, sizes)
+        orig.consume_fasta(inpath)
+        orig.save(savepath)
+
+        loaded = khmer.load_counting_hash(savepath)
+
+        orig_count = orig.n_occupied()
+        loaded_count = loaded.n_occupied()
+        assert orig_count == 3966, orig_count
+        assert loaded_count == orig_count, loaded_count
+
+    for ctfile in ['temp.ct.gz', 'temp.ct']:
+        do_test(ctfile)
+
+
 def test_save_load():
     inpath = utils.get_test_data('random-20-a.fa')
     savepath = utils.get_temp_filename('tempcountingsave0.ht')
2 changes: 1 addition & 1 deletion tests/test_counting_single.py
@@ -26,7 +26,7 @@ def test_no_collision():
     assert kh.get('TTTT') == 2
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         ct = khmer.new_hashtable(4, 1000000000000)
2 changes: 1 addition & 1 deletion tests/test_hashbits_obj.py
@@ -27,7 +27,7 @@ def teardown():
     utils.cleanup()
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         pt = khmer.Hashbits(32, 1e13, 1)
2 changes: 1 addition & 1 deletion tests/test_labelhash.py
@@ -25,7 +25,7 @@ def teardown():
 # * thread-safety
 
 
-@attr('linux')
+@attr('huge')
 def test_toobig():
     try:
         lh = LabelHash(20, 1e13, 1)
2 changes: 1 addition & 1 deletion tests/test_scripts.py
@@ -72,7 +72,7 @@ def test_load_into_counting_nonwritable():
     assert status == 1, status
 
 
-@attr('linux')
+@attr('huge')
 def test_load_into_counting_toobig():
     script = 'load-into-counting.py'
     args = ['-x', '1e12', '-N', '2', '-k', '20', '-t', '--force']