Skip to content

Commit

Permalink
Merge pull request #19 from brentp/master
Browse files Browse the repository at this point in the history
add tests and fix off-by 1 bug for small files.
  • Loading branch information
arq5x committed May 14, 2015
2 parents d13743a + 1ef0db1 commit 8d81fed
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 31 deletions.
65 changes: 35 additions & 30 deletions grabix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ bool bgzf_getline_counting(BGZF * stream)
if (c == -1)
return true;
else if (c == 10) // \n
return false;
return false;
}
}

Expand All @@ -62,19 +62,19 @@ int create_grabix_index(string bgzf_file)
cerr << "[grabix] could not open file:" << bgzf_file << endl;
exit (1);
}

// create an index for writing
string index_file_name = bgzf_file + ".gbi";
ofstream index_file(index_file_name.c_str(), ios::out);

// add the offset for the end of the header to the index

int status;
kstring_t *line = new kstring_t;
line->s = '\0';
line->l = 0;
line->m = 0;

int64_t prev_offset = 0;
int64_t offset = 0;
while ((status = bgzf_getline(bgzf_fp, '\n', line)) >= 0)
Expand All @@ -89,7 +89,7 @@ int create_grabix_index(string bgzf_file)
// add the offsets for each CHUNK_SIZE
// set of records to the index
size_t chunk_count = 0;
int64_t total_lines = 0;
int64_t total_lines = 1;
vector<int64_t> chunk_positions;
chunk_positions.push_back (prev_offset);
bool eof = false;
Expand All @@ -98,20 +98,25 @@ int create_grabix_index(string bgzf_file)
// grab the next line and store the offset
eof = bgzf_getline_counting(bgzf_fp);
offset = bgzf_tell (bgzf_fp);
chunk_count++;
total_lines++;
// stop if we have encountered an empty line
if (eof)
{
if (bgzf_check_EOF(bgzf_fp) == 1)
if (bgzf_check_EOF(bgzf_fp) == 1) {
if (offset > prev_offset) {
total_lines++;
//prev_offset = offset;
}
break;
}
}
// store the offset of this chunk start
else if (chunk_count == CHUNK_SIZE)
else if (chunk_count == CHUNK_SIZE)
{
chunk_positions.push_back(prev_offset);
chunk_count = 0;
}
chunk_count++;
total_lines++;
prev_offset = offset;
}
chunk_positions.push_back (prev_offset);
Expand All @@ -138,14 +143,14 @@ void load_index(string bgzf_file, index_info &index)
ifstream index_file(index_file_name.c_str(), ios::in);

if ( !index_file ) {
cerr << "[grabix] coould not find index file: " << index_file_name << ". Exiting!" << endl;
cerr << "[grabix] could not find index file: " << index_file_name << ". Exiting!" << endl;
exit (1);
}
else {
string line;
getline (index_file, line);
index.header_end = atol(line.c_str());

getline (index_file, line);
index.num_lines = atol(line.c_str());

Expand All @@ -167,57 +172,57 @@ int grab(string bgzf_file, int64_t from_line, int64_t to_line)
index_info index;
load_index(bgzf_file, index);

if (((int) from_line > index.num_lines)
||
if (((int) from_line > index.num_lines)
||
((int) to_line > index.num_lines))
{
cerr << "[grabix] requested lines exceed the number of lines in the file." << endl;
exit(1);
}
else if (from_line < 0)
else if (from_line < 0)
{
cerr << "[grabix] indexes must be positive numbers." << endl;
exit(1);
}
else if (from_line > to_line)
else if (from_line > to_line)
{
cerr << "[grabix] requested end line is less than the requested begin line." << endl;
exit(1);
}
else {

// load the BGZF file
BGZF *bgzf_fp = bgzf_open(bgzf_file.c_str(), "r");
if (bgzf_fp == NULL)
{
cerr << "[grabix] could not open file:" << bgzf_file << endl;
exit (1);
}

// dump the header if there is one
int status;
kstring_t *line = new kstring_t;
line->s = '\0';
line->l = 0;
line->m = 0;
line->l = 0;
line->m = 0;

while ((status = bgzf_getline(bgzf_fp, '\n', line)) != 0)
{
if (line->s[0] == '#')
printf("%s\n", line->s);
else break;
}

// easier to work in 0-based space
int64_t from_line_0 = from_line - 1;
// get the chunk index for the requested line
int64_t requested_chunk = from_line_0 / CHUNK_SIZE;
// derive the first line in that chunk
int64_t chunk_line_start = (requested_chunk * CHUNK_SIZE);

// jump to the correct offset for the relevant chunk
// and fast forward until we find the requested line
bgzf_seek (bgzf_fp, index.chunk_offsets[requested_chunk], SEEK_SET);
// and fast forward until we find the requested line
bgzf_seek (bgzf_fp, index.chunk_offsets[requested_chunk], SEEK_SET);
while (chunk_line_start <= from_line_0)
{
status = bgzf_getline(bgzf_fp, '\n', line);
Expand All @@ -244,7 +249,7 @@ int random(string bgzf_file, uint64_t K)
index_info index;
load_index(bgzf_file, index);

if ((int64_t) K > index.num_lines)
if ((int64_t) K > index.num_lines)
{
cerr << "[grabix] warning: requested more lines than in the file." << endl;
exit(1);
Expand All @@ -257,11 +262,11 @@ int random(string bgzf_file, uint64_t K)
cerr << "[grabix] could not open file:" << bgzf_file << endl;
exit (1);
}

// seed our random number generator
size_t seed = (unsigned)time(0)+(unsigned)getpid();
srand(seed);

// reservoir sample
uint64_t s = 0;
uint64_t N = 0;
Expand All @@ -270,7 +275,7 @@ int random(string bgzf_file, uint64_t K)
int status;
kstring_t *line = new kstring_t;
line->s = '\0';
line->l = 0;
line->l = 0;
line->m = 0;

while ((status = bgzf_getline(bgzf_fp, '\n', line)) != 0)
Expand All @@ -283,7 +288,7 @@ int random(string bgzf_file, uint64_t K)
while ((status = bgzf_getline(bgzf_fp, '\n', line)) != 0)
{
N++;

if (status < 0)
break;

Expand All @@ -292,15 +297,15 @@ int random(string bgzf_file, uint64_t K)
sample.push_back(line->s);
result_size++;
}
else
else
{
s = (int) ((double)rand()/(double)RAND_MAX * N);
if (s < K)
sample[s] = line->s;
}
}
bgzf_close(bgzf_fp);

// report the sample
for (size_t i = 0; i < sample.size(); ++i)
printf("%s\n", sample[i].c_str());
Expand Down
5 changes: 4 additions & 1 deletion grabix_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ using namespace std;
int main (int argc, char **argv)
{
if (argc == 1) { return usage(); }
if (argc >= 3)
if (argc >= 3)
{
// create input file for the purpose of the example
string bgzf_file = argv[2];
Expand Down Expand Up @@ -41,6 +41,9 @@ int main (int argc, char **argv)
else if (sub_command == "size")
{
cout << size(bgzf_file) << "\n";
} else {
cout << "unknown command: " << sub_command << endl;
cout << "available commands are: index, grab, random, check, size" << endl;
}
}

Expand Down
64 changes: 64 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
make

for V in \
test.PLs.vcf \
test.auto_dom.no_parents.2.vcf \
test.auto_dom.no_parents.3.vcf \
test.auto_dom.no_parents.4.vcf \
test.auto_dom.no_parents.5.vcf \
test.auto_dom.no_parents.vcf \
test.auto_dom.vcf \
test.auto_rec.no_parents.2.vcf \
test.auto_rec.no_parents.3.vcf \
test.auto_rec.no_parents.4.vcf \
test.auto_rec.no_parents.5.vcf \
test.auto_rec.no_parents.vcf \
test.auto_rec.vcf \
test.burden.vcf \
test.cadd.vcf \
test.clinvar.vcf \
test.comp_het.2.vcf \
test.comp_het.3.vcf \
test.comp_het.4.vcf \
test.comp_het.5.vcf \
test.comp_het.6.vcf \
test.comp_het.7.vcf \
test.comp_het.vcf \
test.cosmic.vcf \
test.de_novo.affected.and.unaffected.vcf \
test.de_novo.vcf \
test.esp.zero.vcf \
test.exac.vcf \
test.family.vcf \
test.fusions.vcf \
test.mendel.vcf \
test.multiple-alts.decomp.snpeff.vcf \
test.passonly.vcf \
test.query.vcf \
test.region.vep.vcf \
test.roh.vcf \
test.snpeff.vcf \
test.somatic.vcf \
test.vcf_id.snpeff.vcf \
test1.snpeff.vcf \
test2.snpeff.vcf \
test3.snpeff.vcf \
test4.vep.snpeff.vcf \
test5.vep.snpeff.vcf; do

rm -f ${V}.*
wget --quiet https://raw.githubusercontent.com/arq5x/gemini/master/test/$V
bgzip -f $V
./grabix index ${V}.gz
sleep 1
exp=$(zgrep -cv "#" $V.gz)
obs=$(./grabix size $V.gz)

if [[ "$exp" != "$obs" ]]; then
echo "FAIL: $V: expected $exp lines found $obs"
else
echo "OK $V"
fi
rm -f ${V}.*

done

0 comments on commit 8d81fed

Please sign in to comment.