From fdd61439d54f0a5347a3443d46590876f5a9e77e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 9 Apr 2024 17:13:46 +0100 Subject: [PATCH] Increase the input block size for bgzip. Commit e495718 changed bgzip from raw POSIX read() calls to hread(). Unfortunately, hread() gets its buffer size from a stat of the input file descriptor, which can be 4KB for a pipe. We're reading 0xff00 bytes, so this mostly ends up being split over two reads, with one or both involving additional memcpys. This makes the buffered I/O perform worse than non-buffered I/O. In the most extreme cases (cat data | bgzip -l0 > /dev/null) this is a twofold slowdown. The easy solution is just to increase the buffer size to something sensible. It's a little messy as we have to use hfile_internal.h to get hfile_set_blksize, but it works. I'm not sure why we didn't elect to make that API more public. Probably simply out of caution. Fixes #1767 --- bgzip.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bgzip.c b/bgzip.c index 687b29d47..e920a5d92 100644 --- a/bgzip.c +++ b/bgzip.c @@ -39,6 +39,7 @@ #include "htslib/bgzf.h" #include "htslib/hts.h" #include "htslib/hfile.h" +#include "hfile_internal.h" // for hfile_set_blksize #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN @@ -337,6 +338,9 @@ int main(int argc, char **argv) return 1; } + // Increase block size to reduce performance bottlenecks on pipes + hfile_set_blksize(f_src, 128*1024); + if (write_fname) { if (!exp_out_open) { // only open this file once for writing, close at the end if ((fp = bgzf_open(write_fname, out_mode)) == NULL) {