From 93518026204808858462f3a99ef74d167c96ce05 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Mon, 22 Apr 2024 13:36:00 +0200 Subject: [PATCH 1/8] Move nibble2base_ssse3() SIMD code to new simd.c source file This function is always called via a function pointer, hence it is never inlined so need not be in a header. [Commit marked as authored by RV so git blame attributes the moved code to Ruben, as it is unchanged from the original code. Temporarily include simd.c from the header so this commit builds and runs. -JM] --- sam_internal.h | 65 +------------------------------------------------- simd.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 64 deletions(-) create mode 100644 simd.c diff --git a/sam_internal.h b/sam_internal.h index aed503925..00be2c215 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -102,70 +102,7 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { #if HTS_BUILD_IS_X86_64 \ && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ && HAVE_BUILTIN_CPU_SUPPORT_SSSE3 -#include "immintrin.h" -/* - * Convert a nibble encoded BAM sequence to a string of bases. - * - * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be - * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can - * be converted to the IUPAC characters. - * It falls back on the nibble2base_default function for the remainder. - */ - -__attribute__((target("ssse3"))) -static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { - seq[0] = 0; - const char *seq_end_ptr = seq + len; - char *seq_cursor = seq; - uint8_t *nibble_cursor = nib; - const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i)); - __m128i first_upper_shuffle = _mm_setr_epi8( - 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1); - __m128i first_lower_shuffle = _mm_setr_epi8( - -1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7); - __m128i second_upper_shuffle = _mm_setr_epi8( - 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1); - __m128i second_lower_shuffle = _mm_setr_epi8( - -1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15); - __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str); - /* Work on 16 encoded characters at the time resulting in 32 decoded characters - Examples are given for 8 encoded characters A until H to keep it readable. - Encoded stored as |AB|CD|EF|GH| - Shuffle into |AB|00|CD|00|EF|00|GH|00| and - |00|AB|00|CD|00|EF|00|GH| - Shift upper to the right resulting into - |0A|B0|0C|D0|0E|F0|0G|H0| and - |00|AB|00|CD|00|EF|00|GH| - Merge with or resulting into (X stands for garbage) - |0A|XB|0C|XD|0E|XF|0G|XH| - Bitwise and with 0b1111 leads to: - |0A|0B|0C|0D|0E|0F|0G|0H| - We can use the resulting 4-bit integers as indexes for the shuffle of - the nucleotide lookup. */ - while (seq_cursor < seq_vec_end_ptr) { - __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor); - - __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle); - __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle); - __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4); - __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower); - __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15)); - __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes); - _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides); - - __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle); - __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle); - __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4); - __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower); - __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15)); - __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes); - _mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides); - - nibble_cursor += sizeof(__m128i); - seq_cursor += 2 * sizeof(__m128i); - } - nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor); -} +#include "simd.c" static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; diff --git a/simd.c b/simd.c new file mode 100644 index 000000000..65e64cd2d --- /dev/null +++ b/simd.c @@ -0,0 +1,64 @@ +#include "immintrin.h" +/* + * Convert a nibble encoded BAM sequence to a string of bases. + * + * Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be + * unpacked into 32 indexes from 0-15. Using the pshufb instruction these can + * be converted to the IUPAC characters. + * It falls back on the nibble2base_default function for the remainder. + */ + +__attribute__((target("ssse3"))) +static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { + seq[0] = 0; + const char *seq_end_ptr = seq + len; + char *seq_cursor = seq; + uint8_t *nibble_cursor = nib; + const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i)); + __m128i first_upper_shuffle = _mm_setr_epi8( + 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1); + __m128i first_lower_shuffle = _mm_setr_epi8( + -1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7); + __m128i second_upper_shuffle = _mm_setr_epi8( + 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1); + __m128i second_lower_shuffle = _mm_setr_epi8( + -1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15); + __m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str); + /* Work on 16 encoded characters at the time resulting in 32 decoded characters + Examples are given for 8 encoded characters A until H to keep it readable. + Encoded stored as |AB|CD|EF|GH| + Shuffle into |AB|00|CD|00|EF|00|GH|00| and + |00|AB|00|CD|00|EF|00|GH| + Shift upper to the right resulting into + |0A|B0|0C|D0|0E|F0|0G|H0| and + |00|AB|00|CD|00|EF|00|GH| + Merge with or resulting into (X stands for garbage) + |0A|XB|0C|XD|0E|XF|0G|XH| + Bitwise and with 0b1111 leads to: + |0A|0B|0C|0D|0E|0F|0G|0H| + We can use the resulting 4-bit integers as indexes for the shuffle of + the nucleotide lookup. */ + while (seq_cursor < seq_vec_end_ptr) { + __m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor); + + __m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle); + __m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle); + __m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4); + __m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower); + __m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15)); + __m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes); + _mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides); + + __m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle); + __m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle); + __m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4); + __m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower); + __m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15)); + __m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes); + _mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides); + + nibble_cursor += sizeof(__m128i); + seq_cursor += 2 * sizeof(__m128i); + } + nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor); +} From 87186f8a6201256840c701d9c4a59ca6429286a5 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 16 May 2024 18:10:51 +0100 Subject: [PATCH 2/8] Move nibble2base_resolve() infrastructure to simd.c Also add copyright boilerplate to the new file. [Commit marked as authored by RMD so git blame attributes the boilerplate and moved code to Rob, as the latter is unchanged from the original code. -JM] --- sam_internal.h | 10 ---------- simd.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/sam_internal.h b/sam_internal.h index 00be2c215..32c3bff97 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -103,16 +103,6 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ && HAVE_BUILTIN_CPU_SUPPORT_SSSE3 #include "simd.c" - -static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; - -__attribute__((constructor)) -static void nibble2base_resolve(void) { - if (__builtin_cpu_supports("ssse3")) { - nibble2base = nibble2base_ssse3; - } -} - #else static inline void nibble2base(uint8_t *nib, char *seq, int len) { nibble2base_default(nib, seq, len); diff --git a/simd.c b/simd.c index 65e64cd2d..b95d0a930 100644 --- a/simd.c +++ b/simd.c @@ -1,3 +1,25 @@ +/* simd.c -- SIMD optimised versions of various internal functions. + + Copyright (C) 2024 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + #include "immintrin.h" /* * Convert a nibble encoded BAM sequence to a string of bases. @@ -62,3 +84,12 @@ static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { } nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor); } + +static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; + +__attribute__((constructor)) +static void nibble2base_resolve(void) { + if (__builtin_cpu_supports("ssse3")) { + nibble2base = nibble2base_ssse3; + } +} From d562256b5965c2770d2810fd293f9cfed6e975ea Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 17 May 2024 21:47:02 +1200 Subject: [PATCH 3/8] Complete the refactoring of nibble2base_ssse3() as an ordinary source file Build simd.c as a source file, so there is one copy of nibble2base_ssse3() rather than potentially one per translation unit that uses it. When everything was static, the names needed no prefix. However now that there is a non-static function pointer, it needs an htslib-specific prefix to avoid polluting the namespace. (In libhts.so it will usually be a non-visible symbol, but in libhts.a it needs a prefix.) --- Makefile | 2 ++ sam_internal.h | 12 +++++++++--- simd.c | 25 ++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 54bca03d0..7d3f371b7 100644 --- a/Makefile +++ b/Makefile @@ -209,6 +209,7 @@ LIBHTS_OBJS = \ region.o \ sam.o \ sam_mods.o \ + simd.o \ synced_bcf_reader.o \ vcf_sweep.o \ tbx.o \ @@ -458,6 +459,7 @@ hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) +simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) diff --git a/sam_internal.h b/sam_internal.h index 32c3bff97..505dc9979 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -102,12 +102,18 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { #if HTS_BUILD_IS_X86_64 \ && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ && HAVE_BUILTIN_CPU_SUPPORT_SSSE3 -#include "simd.c" -#else +#define BUILDING_SIMD_NIBBLE2BASE +#endif + static inline void nibble2base(uint8_t *nib, char *seq, int len) { +#ifdef BUILDING_SIMD_NIBBLE2BASE + extern void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len); + htslib_nibble2base(nib, seq, len); +#else nibble2base_default(nib, seq, len); -} #endif +} + #ifdef __cplusplus } #endif diff --git a/simd.c b/simd.c index b95d0a930..539677762 100644 --- a/simd.c +++ b/simd.c @@ -20,7 +20,16 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +#include "htslib/sam.h" +#include "sam_internal.h" + #include "immintrin.h" + +#ifdef BUILDING_SIMD_NIBBLE2BASE + /* * Convert a nibble encoded BAM sequence to a string of bases. * @@ -31,7 +40,7 @@ DEALINGS IN THE SOFTWARE. */ */ __attribute__((target("ssse3"))) -static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { +static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { seq[0] = 0; const char *seq_end_ptr = seq + len; char *seq_cursor = seq; @@ -85,11 +94,21 @@ static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor); } -static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; +void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; __attribute__((constructor)) static void nibble2base_resolve(void) { if (__builtin_cpu_supports("ssse3")) { - nibble2base = nibble2base_ssse3; + htslib_nibble2base = nibble2base_ssse3; } } + +#endif // BUILDING_SIMD_NIBBLE2BASE + +// Potentially useful diagnostic, and prevents "empty translation unit" errors +const char htslib_simd[] = + "SIMD functions present:" +#ifdef BUILDING_SIMD_NIBBLE2BASE + " nibble2base" +#endif + "."; From 115c7ccd528be44928d0234642a21afa1fcd1beb Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 19 May 2024 10:09:23 +1200 Subject: [PATCH 4/8] Move __attribute__((target)) check to configure too Configure checks for __builtin_cpu_supports("ssse3"), so the check for __attribute__((target)) might as well be there too. This enables this internal-only infrastructure to be removed from htslib/hts_defs.h, which is primarily for defines needed in the public headers. Add a configure check for __attribute__((constructor)) too, as we will likely be using it on other SIMD-using platforms as well. --- Makefile | 2 ++ configure.ac | 24 ++++++++++++++++++++++++ htslib/hts_defs.h | 18 ------------------ sam_internal.h | 7 +++---- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 7d3f371b7..2c9445b53 100644 --- a/Makefile +++ b/Makefile @@ -298,6 +298,8 @@ config.h: echo '#define HAVE_AVX512 1' >> $@ ; \ fi echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@ + echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@ + echo '#define HAVE_ATTRIBUTE_TARGET 1' >> $@ echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@ echo '#endif' >> $@ diff --git a/configure.ac b/configure.ac index 13d91d218..cc2b0023d 100644 --- a/configure.ac +++ b/configure.ac @@ -173,6 +173,21 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([],[ AC_MSG_RESULT([no]) ]) +dnl Check for function attribute used in conjunction with __builtin_cpu_supports +AC_MSG_CHECKING([for __attribute__((target))]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + __attribute__((target("ssse3"))) + int zero(void) { + return 0; + } +]], [[zero();]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_ATTRIBUTE_TARGET], 1, + [Define if __attribute__((target(...))) is available.]) +], [ + AC_MSG_RESULT([no]) +]) + ]) dnl End of AS_IF(hts_have_cpuid) dnl Avoid chicken-and-egg problem where pkg-config supplies the @@ -316,6 +331,15 @@ AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic]) # Darwin has a dubious fdatasync() symbol, but no declaration in AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)]) +AC_MSG_CHECKING([for __attribute__((constructor))]) +AC_LINK_IFELSE([AC_LANG_PROGRAM([[ + static __attribute__((constructor)) void noop(void) {} +]], [])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_ATTRIBUTE_CONSTRUCTOR], 1, + [Define if __attribute__((constructor)) is available.]) +], [AC_MSG_RESULT([no])]) + if test $enable_plugins != no; then AC_SEARCH_LIBS([dlsym], [dl], [], [MSG_ERROR([dlsym() not found diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h index 0c5f8957a..e714e8fda 100644 --- a/htslib/hts_defs.h +++ b/htslib/hts_defs.h @@ -34,10 +34,6 @@ DEALINGS IN THE SOFTWARE. */ #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute) #endif -#ifdef __has_builtin -#define HTS_COMPILER_HAS_BUILTIN(function) __has_builtin(function) -#endif - #elif defined __GNUC__ #define HTS_GCC_AT_LEAST(major, minor) \ (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) @@ -46,10 +42,6 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTS_COMPILER_HAS #define HTS_COMPILER_HAS(attribute) 0 #endif -#ifndef HTS_COMPILER_HAS_BUILTIN -#define HTS_COMPILER_HAS_BUILTIN(function) 0 -#endif - #ifndef HTS_GCC_AT_LEAST #define HTS_GCC_AT_LEAST(major, minor) 0 #endif @@ -126,16 +118,6 @@ DEALINGS IN THE SOFTWARE. */ #define HTS_FORMAT(type, idx, first) #endif -#define HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ - ((HTS_COMPILER_HAS(target) && HTS_COMPILER_HAS_BUILTIN(__builtin_cpu_supports)) \ - || HTS_GCC_AT_LEAST(4, 8)) - -#if (defined(__x86_64__) || defined(_M_X64)) -#define HTS_BUILD_IS_X86_64 1 -#else -#define HTS_BUILD_IS_X86_64 0 -#endif - #if defined(_WIN32) || defined(__CYGWIN__) #if defined(HTS_BUILDING_LIBRARY) #define HTSLIB_EXPORT __declspec(dllexport) diff --git a/sam_internal.h b/sam_internal.h index 505dc9979..17a6b794e 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -25,7 +25,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include "htslib/hts_defs.h" + #include "htslib/sam.h" #ifdef __cplusplus @@ -99,9 +99,8 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { seq[i] = seq_nt16_str[bam_seqi(nib, i)]; } -#if HTS_BUILD_IS_X86_64 \ - && HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ - && HAVE_BUILTIN_CPU_SUPPORT_SSSE3 +#if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \ + defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3 #define BUILDING_SIMD_NIBBLE2BASE #endif From a5ba1a65c0c1bdedb91805414a91bd2c47958da7 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 2 Jun 2024 23:39:21 +1200 Subject: [PATCH 5/8] Minor improvements to Intel implementation Notably the adjustment to seq_vec_end_ptr stops it from reverting to the scalar code prematurely: at present it drops back to scalar when <=32 entries remain, rather than <=31. --- simd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/simd.c b/simd.c index 539677762..33a053ecb 100644 --- a/simd.c +++ b/simd.c @@ -26,7 +26,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" #include "sam_internal.h" -#include "immintrin.h" +#include #ifdef BUILDING_SIMD_NIBBLE2BASE @@ -41,11 +41,10 @@ DEALINGS IN THE SOFTWARE. */ __attribute__((target("ssse3"))) static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { - seq[0] = 0; const char *seq_end_ptr = seq + len; char *seq_cursor = seq; uint8_t *nibble_cursor = nib; - const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i)); + const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i) - 1); __m128i first_upper_shuffle = _mm_setr_epi8( 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1); __m128i first_lower_shuffle = _mm_setr_epi8( From 2c48f9f2d8828eb8aaf4760d62bf79ed9f908a52 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 23 Jun 2024 20:32:11 +1200 Subject: [PATCH 6/8] Add correctness and speed tests for nibble2base() routines --- .gitignore | 1 + Makefile | 6 ++ configure.ac | 10 +++ test/test_nibbles.c | 164 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+) create mode 100644 test/test_nibbles.c diff --git a/.gitignore b/.gitignore index 8b4d74ca1..96b4d386d 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,7 @@ shlib-exports-*.txt /test/test_kfunc /test/test_kstring /test/test_mod +/test/test_nibbles /test/test-parse-reg /test/test_realn /test/test-regidx diff --git a/Makefile b/Makefile index 2c9445b53..de65a774c 100644 --- a/Makefile +++ b/Makefile @@ -87,6 +87,7 @@ BUILT_TEST_PROGRAMS = \ test/test_kfunc \ test/test_kstring \ test/test_mod \ + test/test_nibbles \ test/test_realn \ test/test-regidx \ test/test_str2int \ @@ -603,6 +604,7 @@ check test: all $(HTSCODECS_TEST_TARGETS) test/test_expr test/test_kfunc test/test_kstring + test/test_nibbles -v test/test_str2int test/test_time_funcs test/fieldarith test/fieldarith.sam @@ -671,6 +673,9 @@ test/test_kstring: test/test_kstring.o libhts.a test/test_mod: test/test_mod.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread +test/test_nibbles: test/test_nibbles.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_nibbles.o libhts.a $(LIBS) -lpthread + test/test_realn: test/test_realn.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread @@ -773,6 +778,7 @@ test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h) +test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h) test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h) test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) diff --git a/configure.ac b/configure.ac index cc2b0023d..19b48b5e3 100644 --- a/configure.ac +++ b/configure.ac @@ -340,6 +340,16 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[ [Define if __attribute__((constructor)) is available.]) ], [AC_MSG_RESULT([no])]) +AC_MSG_CHECKING([for clock_gettime with CLOCK_PROCESS_CPUTIME_ID]) +AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], [[ + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); +]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_CLOCK_GETTIME_CPUTIME], 1, + [Define if clock_gettime exists and accepts CLOCK_PROCESS_CPUTIME_ID.]) +], [AC_MSG_RESULT([no])]) + if test $enable_plugins != no; then AC_SEARCH_LIBS([dlsym], [dl], [], [MSG_ERROR([dlsym() not found diff --git a/test/test_nibbles.c b/test/test_nibbles.c new file mode 100644 index 000000000..1ef3456ea --- /dev/null +++ b/test/test_nibbles.c @@ -0,0 +1,164 @@ +/* test/test_nibbles.c -- Test SIMD optimised function implementations. + + Copyright (C) 2024 Centre for Population Genomics. + + Author: John Marshall + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include +#include + +#ifdef HAVE_CLOCK_GETTIME_CPUTIME +#include +#else +#include +#endif + +#include "../htslib/sam.h" +#include "../sam_internal.h" + +long long gettime(void) { +#ifdef HAVE_CLOCK_GETTIME_CPUTIME + struct timespec ts; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + return ts.tv_sec * 1000000000LL + ts.tv_nsec; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000LL + tv.tv_usec; +#endif +} + +char *fmttime(long long elapsed) { + static char buf[64]; + +#ifdef HAVE_CLOCK_GETTIME_CPUTIME + long long sec = elapsed / 1000000000; + long long nsec = elapsed % 1000000000; + sprintf(buf, "%lld.%09lld processor seconds", sec, nsec); +#else + long long sec = elapsed / 1000000; + long long usec = elapsed % 1000000; + sprintf(buf, "%lld.%06lld wall-time seconds", sec, usec); +#endif + + return buf; +} + +void nibble2base_single(uint8_t *nib, char *seq, int len) { + int i; + for (i = 0; i < len; i++) + seq[i] = seq_nt16_str[bam_seqi(nib, i)]; +} + +unsigned char nibble[5000]; +char buf[10000]; + +int validate_nibble2base(void) { + char defbuf[500]; + int i, start, len; + unsigned long long total = 0, failed = 0; + + for (i = 0; i < sizeof nibble; i++) + nibble[i] = i % 256; + + for (start = 0; start < 80; start++) + for (len = 0; len < 400; len++) { + memset(defbuf, '\0', sizeof defbuf); + nibble2base_single(&nibble[start], defbuf, len); + + memset(buf, '\0', sizeof defbuf); + nibble2base(&nibble[start], buf, len); + + total++; + if (strcmp(defbuf, buf) != 0) { + printf("%s expected\n%s FAIL\n\n", defbuf, buf); + failed++; + } + } + + if (failed > 0) { + fprintf(stderr, "Failures: %llu (out of %llu tests)\n", failed, total); + return 1; + } + + return 0; +} + +int time_nibble2base(int length, unsigned long count) { + unsigned long i, total = 0; + + for (i = 0; i < length; i++) + nibble[i] = i % 256; + + printf("Timing %lu nibble2base iterations with read length %d...\n", count, length); + long long start = gettime(); + + for (i = 0; i < count; i++) { + nibble2base(nibble, buf, length); + total += buf[i % length]; + } + + long long stop = gettime(); + printf("%s (summing to %lu)\n", fmttime(stop - start), total); + return 0; +} + +int main(int argc, char **argv) { + int readlen = 5000; + unsigned long count = 1000000; + int status = 0; + int c; + + if (argc == 1) + printf( +"Usage: test_nibbles [-c NUM] [-r NUM] [-n|-v]...\n" +"Options:\n" +" -c NUM Specify number of iterations [%lu]\n" +" -n Run nibble2base speed tests\n" +" -r NUM Specify read length [%d]\n" +" -v Run all validation tests\n" +"", count, readlen); + + while ((c = getopt(argc, argv, "c:nr:v")) >= 0) + switch (c) { + case 'c': + count = strtoul(optarg, NULL, 0); + break; + + case 'n': + status += time_nibble2base(readlen, count); + break; + + case 'r': + readlen = atoi(optarg); + break; + + case 'v': + status += validate_nibble2base(); + break; + } + + return status; +} From be6a8fb5a0dc00089d0c507fc1227c1c986444d6 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 23 Jun 2024 22:05:59 +1200 Subject: [PATCH 7/8] Add ARM Neon nibble2base_neon() implementation As ARM CPU capability instructions are privileged, we also need to add cpu_supports_neon() and implement it to query for Neon/AdvSIMD in various platform-dependent ways. (On 32-bit ARM, glibc helpfully renamed the HWCAP_* macros to HWCAP_ARM_*, so accept both on Linux.) 32-bit ARM GCC erroneously does not define vst1q_u8_x2() (bug 71233, fixed in v14.1 in this case), so avoid it on 32-bit ARM by writing interleaved via vst2q_u8() instead. --- sam_internal.h | 3 +- simd.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 3 deletions(-) diff --git a/sam_internal.h b/sam_internal.h index 17a6b794e..8f701f337 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -100,7 +100,8 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) { } #if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \ - defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3 + ((defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3) || \ + (defined __ARM_NEON)) #define BUILDING_SIMD_NIBBLE2BASE #endif diff --git a/simd.c b/simd.c index 33a053ecb..ba3edea7b 100644 --- a/simd.c +++ b/simd.c @@ -23,13 +23,91 @@ DEALINGS IN THE SOFTWARE. */ #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include +// These must be defined before the first system include to ensure that legacy +// BSD types needed by remain defined when _XOPEN_SOURCE is set. +#if defined __APPLE__ +#define _DARWIN_C_SOURCE +#elif defined __NetBSD__ +#define _NETBSD_SOURCE +#endif + #include "htslib/sam.h" #include "sam_internal.h" +#if defined __x86_64__ #include +#elif defined __ARM_NEON +#include +#endif + +#if defined __arm__ || defined __aarch64__ + +#if defined __linux__ || defined __FreeBSD__ +#include +#elif defined __APPLE__ +#include +#include +#elif defined __NetBSD__ +#include +#include +#include +#ifdef __aarch64__ +#include +#else +#include +#endif +#elif defined _WIN32 +#include +#endif + +static inline int cpu_supports_neon(void) { +#if defined __linux__ && defined __arm__ && defined HWCAP_NEON + return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0; +#elif defined __linux__ && defined __arm__ && defined HWCAP_ARM_NEON + return (getauxval(AT_HWCAP) & HWCAP_ARM_NEON) != 0; +#elif defined __linux__ && defined __aarch64__ && defined HWCAP_ASIMD + return (getauxval(AT_HWCAP) & HWCAP_ASIMD) != 0; +#elif defined __APPLE__ && defined __aarch64__ + int32_t ctl; + size_t ctlsize = sizeof ctl; + if (sysctlbyname("hw.optional.AdvSIMD", &ctl, &ctlsize, NULL, 0) != 0) return 0; + if (ctlsize != sizeof ctl) return 0; + return ctl; +#elif defined __FreeBSD__ && defined __arm__ && defined HWCAP_NEON + unsigned long cap; + if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; + return (cap & HWCAP_NEON) != 0; +#elif defined __FreeBSD__ && defined __aarch64__ && defined HWCAP_ASIMD + unsigned long cap; + if (elf_aux_info(AT_HWCAP, &cap, sizeof cap) != 0) return 0; + return (cap & HWCAP_ASIMD) != 0; +#elif defined __NetBSD__ && defined __arm__ && defined ARM_MVFR0_ASIMD_MASK + uint32_t buf[16]; + size_t buflen = sizeof buf; + if (sysctlbyname("machdep.id_mvfr", buf, &buflen, NULL, 0) != 0) return 0; + if (buflen < sizeof(uint32_t)) return 0; + return (buf[0] & ARM_MVFR0_ASIMD_MASK) == 0x00000002; +#elif defined __NetBSD__ && defined __aarch64__ && defined ID_AA64PFR0_EL1_ADVSIMD + struct aarch64_sysctl_cpu_id buf; + size_t buflen = sizeof buf; + if (sysctlbyname("machdep.cpu0.cpu_id", &buf, &buflen, NULL, 0) != 0) return 0; + if (buflen < offsetof(struct aarch64_sysctl_cpu_id, ac_aa64pfr0) + sizeof(uint64_t)) return 0; + return (buf.ac_aa64pfr0 & ID_AA64PFR0_EL1_ADVSIMD & 0x00e00000) == 0; +#elif defined _WIN32 + return IsProcessorFeaturePresent(PF_ARM_V8_INSTRUCTIONS_AVAILABLE) != 0; +#else + return 0; +#endif +} + +#endif #ifdef BUILDING_SIMD_NIBBLE2BASE +void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; + +#if defined __x86_64__ + /* * Convert a nibble encoded BAM sequence to a string of bases. * @@ -93,8 +171,6 @@ static void nibble2base_ssse3(uint8_t *nib, char *seq, int len) { nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor); } -void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default; - __attribute__((constructor)) static void nibble2base_resolve(void) { if (__builtin_cpu_supports("ssse3")) { @@ -102,6 +178,61 @@ static void nibble2base_resolve(void) { } } +#elif defined __ARM_NEON + +static void nibble2base_neon(uint8_t *nib, char *seq0, int len) { + uint8x16_t low_nibbles_mask = vdupq_n_u8(0x0f); + uint8x16_t nuc_lookup_vec = vld1q_u8((const uint8_t *) seq_nt16_str); +#ifndef __aarch64__ + uint8x8x2_t nuc_lookup_vec2 = {{ vget_low_u8(nuc_lookup_vec), vget_high_u8(nuc_lookup_vec) }}; +#endif + + uint8_t *seq = (uint8_t *) seq0; + int blocks; + + for (blocks = len / 32; blocks > 0; --blocks) { + uint8x16_t encoded = vld1q_u8(nib); + nib += 16; + + /* Translate the high and low nibbles to nucleotide letters separately, + then interleave them back together via vzipq for writing. */ + + uint8x16_t high_nibbles = vshrq_n_u8(encoded, 4); + uint8x16_t low_nibbles = vandq_u8(encoded, low_nibbles_mask); + +#ifdef __aarch64__ + uint8x16_t high_nucleotides = vqtbl1q_u8(nuc_lookup_vec, high_nibbles); + uint8x16_t low_nucleotides = vqtbl1q_u8(nuc_lookup_vec, low_nibbles); +#else + uint8x8_t high_low = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(high_nibbles)); + uint8x8_t high_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(high_nibbles)); + uint8x16_t high_nucleotides = vcombine_u8(high_low, high_high); + + uint8x8_t low_low = vtbl2_u8(nuc_lookup_vec2, vget_low_u8(low_nibbles)); + uint8x8_t low_high = vtbl2_u8(nuc_lookup_vec2, vget_high_u8(low_nibbles)); + uint8x16_t low_nucleotides = vcombine_u8(low_low, low_high); +#endif + +#ifdef __aarch64__ + vst1q_u8_x2(seq, vzipq_u8(high_nucleotides, low_nucleotides)); +#else + // Avoid vst1q_u8_x2 as GCC erroneously omits it on 32-bit ARM + uint8x16x2_t nucleotides = {{ high_nucleotides, low_nucleotides }}; + vst2q_u8(seq, nucleotides); +#endif + seq += 32; + } + + if (len % 32 != 0) + nibble2base_default(nib, (char *) seq, len % 32); +} + +static __attribute__((constructor)) void nibble2base_resolve(void) { + if (cpu_supports_neon()) htslib_nibble2base = nibble2base_neon; +} + +#endif + #endif // BUILDING_SIMD_NIBBLE2BASE // Potentially useful diagnostic, and prevents "empty translation unit" errors From 552905df430dd86c45707eececbde48f0ff15456 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 1 Jul 2024 23:54:09 +1200 Subject: [PATCH 8/8] Define HAVE_ATTRIBUTE_CONSTRUCTOR on ARM too for non-configure builds --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index de65a774c..e5527e2fb 100644 --- a/Makefile +++ b/Makefile @@ -298,8 +298,10 @@ config.h: if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \ echo '#define HAVE_AVX512 1' >> $@ ; \ fi - echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@ + echo '#if defined __x86_64__ || defined __arm__ || defined __aarch64__' >> $@ echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@ + echo '#endif' >> $@ + echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@ echo '#define HAVE_ATTRIBUTE_TARGET 1' >> $@ echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@ echo '#endif' >> $@