diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h
index 7719215c1..357684012 100644
--- a/htslib/hts_defs.h
+++ b/htslib/hts_defs.h
@@ -58,6 +58,17 @@ DEALINGS IN THE SOFTWARE. */
 #define HTS_NORETURN
 #endif
 
+// Enable optimisation level 3, especially for gcc. To be used
+// where we want to force vectorisation in hot loops and the default -O2
+// just doesn't cut it.
+// The attribute is spelt __optimize__ so that a macro named "optimize"
+// defined by the including code cannot interfere with it.
+#if HTS_COMPILER_HAS(__optimize__) || HTS_GCC_AT_LEAST(4,4)
+#define HTS_OPT3 __attribute__((__optimize__("O3")))
+#else
+#define HTS_OPT3
+#endif
+
 // GCC introduced warn_unused_result in 3.4 but added -Wno-unused-result later
 #if HTS_COMPILER_HAS(__warn_unused_result__) || HTS_GCC_AT_LEAST(4,5)
 #define HTS_RESULT_USED __attribute__ ((__warn_unused_result__))
diff --git a/sam.c b/sam.c
index 80c391939..1f2ecb7ba 100644
--- a/sam.c
+++ b/sam.c
@@ -4217,6 +4217,20 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
     return pass_filter < 0 ? -2 : ret;
 }
 
+// Writes b[i]+33 to a[i] for each i in [0, len); a non-positive len
+// copies nothing.
+//
+// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
+// this code isn't vectorised and runs far slower than is necessary (even
+// with the restrict keyword being used).
+static inline void HTS_OPT3
+add33(uint8_t *a, const uint8_t * b, int32_t len) {
+    // Signed index: with an unsigned one, a negative len would be converted
+    // to a huge unsigned bound by the comparison and overrun both buffers.
+    int32_t i;
+    for (i = 0; i < len; i++)
+        a[i] = b[i]+33;
+}
 
 static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
 {
@@ -4267,10 +4281,8 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st
         if (s[0] == 0xff) {
             cp[i++] = '*';
         } else {
-            // local copy of c->l_qseq to aid unrolling
-            uint32_t lqseq = c->l_qseq;
-            for (i = 0; i < lqseq; ++i)
-                cp[i]=s[i]+33;
+            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
+            i = c->l_qseq;
         }
         cp[i] = 0;
         cp += i;