From af9cfcac07dfa9473044bc7b55aa3aae677e0cbf Mon Sep 17 00:00:00 2001 From: "Brent Pedersen (brentp)" Date: Thu, 14 May 2015 08:30:09 -0600 Subject: [PATCH 1/2] fix bug for small grabix files. add tests --- grabix.cpp | 65 ++++++++++++++++++++++++++----------------------- grabix_main.cpp | 5 +++- test.sh | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 31 deletions(-) create mode 100644 test.sh diff --git a/grabix.cpp b/grabix.cpp index 32d837f..5a3cd30 100644 --- a/grabix.cpp +++ b/grabix.cpp @@ -46,7 +46,7 @@ bool bgzf_getline_counting(BGZF * stream) if (c == -1) return true; else if (c == 10) // \n - return false; + return false; } } @@ -62,11 +62,11 @@ int create_grabix_index(string bgzf_file) cerr << "[grabix] could not open file:" << bgzf_file << endl; exit (1); } - + // create an index for writing string index_file_name = bgzf_file + ".gbi"; ofstream index_file(index_file_name.c_str(), ios::out); - + // add the offset for the end of the header to the index int status; @@ -74,7 +74,7 @@ int create_grabix_index(string bgzf_file) line->s = '\0'; line->l = 0; line->m = 0; - + int64_t prev_offset = 0; int64_t offset = 0; while ((status = bgzf_getline(bgzf_fp, '\n', line)) >= 0) @@ -89,7 +89,7 @@ int create_grabix_index(string bgzf_file) // add the offsets for each CHUNK_SIZE // set of records to the index size_t chunk_count = 0; - int64_t total_lines = 0; + int64_t total_lines = 1; vector chunk_positions; chunk_positions.push_back (prev_offset); bool eof = false; @@ -98,20 +98,25 @@ int create_grabix_index(string bgzf_file) // grab the next line and store the offset eof = bgzf_getline_counting(bgzf_fp); offset = bgzf_tell (bgzf_fp); - chunk_count++; - total_lines++; // stop if we have encountered an empty line if (eof) { - if (bgzf_check_EOF(bgzf_fp) == 1) + if (bgzf_check_EOF(bgzf_fp) == 1) { + if (offset > prev_offset) { + total_lines++; + //prev_offset = offset; + } break; + } } // store the offset of this chunk start - else if (chunk_count == CHUNK_SIZE) + else if (chunk_count == CHUNK_SIZE) { chunk_positions.push_back(prev_offset); chunk_count = 0; } + chunk_count++; + total_lines++; prev_offset = offset; } chunk_positions.push_back (prev_offset); @@ -138,14 +143,14 @@ void load_index(string bgzf_file, index_info &index) ifstream index_file(index_file_name.c_str(), ios::in); if ( !index_file ) { - cerr << "[grabix] coould not find index file: " << index_file_name << ". Exiting!" << endl; + cerr << "[grabix] could not find index file: " << index_file_name << ". Exiting!" << endl; exit (1); } else { string line; getline (index_file, line); index.header_end = atol(line.c_str()); - + getline (index_file, line); index.num_lines = atol(line.c_str()); @@ -167,25 +172,25 @@ int grab(string bgzf_file, int64_t from_line, int64_t to_line) index_info index; load_index(bgzf_file, index); - if (((int) from_line > index.num_lines) - || + if (((int) from_line > index.num_lines) + || ((int) to_line > index.num_lines)) { cerr << "[grabix] requested lines exceed the number of lines in the file." << endl; exit(1); } - else if (from_line < 0) + else if (from_line < 0) { cerr << "[grabix] indexes must be positive numbers." << endl; exit(1); } - else if (from_line > to_line) + else if (from_line > to_line) { cerr << "[grabix] requested end line is less than the requested begin line." << endl; exit(1); } else { - + // load the BGZF file BGZF *bgzf_fp = bgzf_open(bgzf_file.c_str(), "r"); if (bgzf_fp == NULL) @@ -193,13 +198,13 @@ int grab(string bgzf_file, int64_t from_line, int64_t to_line) cerr << "[grabix] could not open file:" << bgzf_file << endl; exit (1); } - + // dump the header if there is one int status; kstring_t *line = new kstring_t; line->s = '\0'; - line->l = 0; - line->m = 0; + line->l = 0; + line->m = 0; while ((status = bgzf_getline(bgzf_fp, '\n', line)) != 0) { @@ -207,17 +212,17 @@ int grab(string bgzf_file, int64_t from_line, int64_t to_line) printf("%s\n", line->s); else break; } - + // easier to work in 0-based space int64_t from_line_0 = from_line - 1; // get the chunk index for the requested line int64_t requested_chunk = from_line_0 / CHUNK_SIZE; // derive the first line in that chunk int64_t chunk_line_start = (requested_chunk * CHUNK_SIZE); - + // jump to the correct offset for the relevant chunk - // and fast forward until we find the requested line - bgzf_seek (bgzf_fp, index.chunk_offsets[requested_chunk], SEEK_SET); + // and fast forward until we find the requested line + bgzf_seek (bgzf_fp, index.chunk_offsets[requested_chunk], SEEK_SET); while (chunk_line_start <= from_line_0) { status = bgzf_getline(bgzf_fp, '\n', line); @@ -244,7 +249,7 @@ int random(string bgzf_file, uint64_t K) index_info index; load_index(bgzf_file, index); - if ((int64_t) K > index.num_lines) + if ((int64_t) K > index.num_lines) { cerr << "[grabix] warning: requested more lines than in the file." << endl; exit(1); @@ -257,11 +262,11 @@ int random(string bgzf_file, uint64_t K) cerr << "[grabix] could not open file:" << bgzf_file << endl; exit (1); } - + // seed our random number generator size_t seed = (unsigned)time(0)+(unsigned)getpid(); srand(seed); - + // reservoir sample uint64_t s = 0; uint64_t N = 0; @@ -270,7 +275,7 @@ int random(string bgzf_file, uint64_t K) int status; kstring_t *line = new kstring_t; line->s = '\0'; - line->l = 0; + line->l = 0; line->m = 0; while ((status = bgzf_getline(bgzf_fp, '\n', line)) != 0) @@ -283,7 +288,7 @@ int random(string bgzf_file, uint64_t K) while ((status = bgzf_getline(bgzf_fp, '\n', line)) != 0) { N++; - + if (status < 0) break; @@ -292,7 +297,7 @@ int random(string bgzf_file, uint64_t K) sample.push_back(line->s); result_size++; } - else + else { s = (int) ((double)rand()/(double)RAND_MAX * N); if (s < K) @@ -300,7 +305,7 @@ int random(string bgzf_file, uint64_t K) } } bgzf_close(bgzf_fp); - + // report the sample for (size_t i = 0; i < sample.size(); ++i) printf("%s\n", sample[i].c_str()); diff --git a/grabix_main.cpp b/grabix_main.cpp index 184c398..0454dfb 100644 --- a/grabix_main.cpp +++ b/grabix_main.cpp @@ -10,7 +10,7 @@ using namespace std; int main (int argc, char **argv) { if (argc == 1) { return usage(); } - if (argc >= 3) + if (argc >= 3) { // create input file for the purpose of the example string bgzf_file = argv[2]; @@ -41,6 +41,9 @@ int main (int argc, char **argv) else if (sub_command == "size") { cout << size(bgzf_file) << "\n"; + } else { + cout << "unknown command:" << sub_command << endl; + cout << "available commands are: index, grab, random, check, size" << endl; } } diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..ee83f1d --- /dev/null +++ b/test.sh @@ -0,0 +1,64 @@ +make + +for V in \ + test.PLs.vcf \ + test.auto_dom.no_parents.2.vcf \ + test.auto_dom.no_parents.3.vcf \ + test.auto_dom.no_parents.4.vcf \ + test.auto_dom.no_parents.5.vcf \ + test.auto_dom.no_parents.vcf \ + test.auto_dom.vcf \ + test.auto_rec.no_parents.2.vcf \ + test.auto_rec.no_parents.3.vcf \ + test.auto_rec.no_parents.4.vcf \ + test.auto_rec.no_parents.5.vcf \ + test.auto_rec.no_parents.vcf \ + test.auto_rec.vcf \ + test.burden.vcf \ + test.cadd.vcf \ + test.clinvar.vcf \ + test.comp_het.2.vcf \ + test.comp_het.3.vcf \ + test.comp_het.4.vcf \ + test.comp_het.5.vcf \ + test.comp_het.6.vcf \ + test.comp_het.7.vcf \ + test.comp_het.vcf \ + test.cosmic.vcf \ + test.de_novo.affected.and.unaffected.vcf \ + test.de_novo.vcf \ + test.esp.zero.vcf \ + test.exac.vcf \ + test.family.vcf \ + test.fusions.vcf \ + test.mendel.vcf \ + test.multiple-alts.decomp.snpeff.vcf \ + test.passonly.vcf \ + test.query.vcf \ + test.region.vep.vcf \ + test.roh.vcf \ + test.snpeff.vcf \ + test.somatic.vcf \ + test.vcf_id.snpeff.vcf \ + test1.snpeff.vcf \ + test2.snpeff.vcf \ + test3.snpeff.vcf \ + test4.vep.snpeff.vcf \ + test5.vep.snpeff.vcf; do + + rm -f ${V}.* + wget --quiet https://raw.githubusercontent.com/arq5x/gemini/master/test/$V + bgzip -f $V + ./grabix index ${V}.gz + sleep 1 + exp=$(zgrep -cv "#" $V.gz) + obs=$(./grabix size $V.gz) + + if [[ "$exp" != "$obs" ]]; then + echo "FAIL: $V: expected $exp lines found $obs" + else + echo "OK $V" + fi + rm -f ${V}.* + +done From 1ef0db1efe98ceb6dd84385a31ccfc8329fd307b Mon Sep 17 00:00:00 2001 From: "Brent Pedersen (brentp)" Date: Thu, 14 May 2015 08:31:53 -0600 Subject: [PATCH 2/2] space --- grabix_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grabix_main.cpp b/grabix_main.cpp index 0454dfb..8de1ac5 100644 --- a/grabix_main.cpp +++ b/grabix_main.cpp @@ -42,7 +42,7 @@ int main (int argc, char **argv) { cout << size(bgzf_file) << "\n"; } else { - cout << "unknown command:" << sub_command << endl; + cout << "unknown command: " << sub_command << endl; cout << "available commands are: index, grab, random, check, size" << endl; } }