Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor SIMD code and add ARM Neon nibble2base() implementation #1795

Merged
merged 8 commits into from
Jul 4, 2024
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ shlib-exports-*.txt
/test/test_kfunc
/test/test_kstring
/test/test_mod
/test/test_nibbles
/test/test-parse-reg
/test/test_realn
/test/test-regidx
Expand Down
12 changes: 12 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ BUILT_TEST_PROGRAMS = \
test/test_kfunc \
test/test_kstring \
test/test_mod \
test/test_nibbles \
test/test_realn \
test/test-regidx \
test/test_str2int \
Expand Down Expand Up @@ -209,6 +210,7 @@ LIBHTS_OBJS = \
region.o \
sam.o \
sam_mods.o \
simd.o \
synced_bcf_reader.o \
vcf_sweep.o \
tbx.o \
Expand Down Expand Up @@ -296,7 +298,11 @@ config.h:
if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \
echo '#define HAVE_AVX512 1' >> $@ ; \
fi
echo '#if defined __x86_64__ || defined __arm__ || defined __aarch64__' >> $@
echo '#define HAVE_ATTRIBUTE_CONSTRUCTOR 1' >> $@
echo '#endif' >> $@
echo '#if (defined(__x86_64__) || defined(_M_X64))' >> $@
echo '#define HAVE_ATTRIBUTE_TARGET 1' >> $@
echo '#define HAVE_BUILTIN_CPU_SUPPORT_SSSE3 1' >> $@
echo '#endif' >> $@

Expand Down Expand Up @@ -458,6 +464,7 @@ hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c
vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h)
simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h)
tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h)
faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h)
bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h)
Expand Down Expand Up @@ -599,6 +606,7 @@ check test: all $(HTSCODECS_TEST_TARGETS)
test/test_expr
test/test_kfunc
test/test_kstring
test/test_nibbles -v
test/test_str2int
test/test_time_funcs
test/fieldarith test/fieldarith.sam
Expand Down Expand Up @@ -667,6 +675,9 @@ test/test_kstring: test/test_kstring.o libhts.a
test/test_mod: test/test_mod.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread

test/test_nibbles: test/test_nibbles.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_nibbles.o libhts.a $(LIBS) -lpthread

test/test_realn: test/test_realn.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread

Expand Down Expand Up @@ -769,6 +780,7 @@ test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h)
test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h)
test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h)
test/test_nibbles.o: test/test_nibbles.c config.h $(htslib_sam_h) $(sam_internal_h)
test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h)
test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h)
Expand Down
34 changes: 34 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,21 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([],[
AC_MSG_RESULT([no])
])

dnl Check for function attribute used in conjunction with __builtin_cpu_supports
AC_MSG_CHECKING([for __attribute__((target))])
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
__attribute__((target("ssse3")))
int zero(void) {
return 0;
}
]], [[zero();]])], [
AC_MSG_RESULT([yes])
AC_DEFINE([HAVE_ATTRIBUTE_TARGET], 1,
[Define if __attribute__((target(...))) is available.])
], [
AC_MSG_RESULT([no])
])

]) dnl End of AS_IF(hts_have_cpuid)

dnl Avoid chicken-and-egg problem where pkg-config supplies the
Expand Down Expand Up @@ -316,6 +331,25 @@ AC_CHECK_FUNCS([gmtime_r fsync drand48 srand48_deterministic])
# Darwin has a dubious fdatasync() symbol, but no declaration in <unistd.h>
AC_CHECK_DECL([fdatasync(int)], [AC_CHECK_FUNCS(fdatasync)])

AC_MSG_CHECKING([for __attribute__((constructor))])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
static __attribute__((constructor)) void noop(void) {}
]], [])], [
AC_MSG_RESULT([yes])
AC_DEFINE([HAVE_ATTRIBUTE_CONSTRUCTOR], 1,
[Define if __attribute__((constructor)) is available.])
], [AC_MSG_RESULT([no])])

AC_MSG_CHECKING([for clock_gettime with CLOCK_PROCESS_CPUTIME_ID])
AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <time.h>]], [[
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
]])], [
AC_MSG_RESULT([yes])
AC_DEFINE([HAVE_CLOCK_GETTIME_CPUTIME], 1,
[Define if clock_gettime exists and accepts CLOCK_PROCESS_CPUTIME_ID.])
], [AC_MSG_RESULT([no])])

if test $enable_plugins != no; then
AC_SEARCH_LIBS([dlsym], [dl], [],
[MSG_ERROR([dlsym() not found
Expand Down
18 changes: 0 additions & 18 deletions htslib/hts_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ DEALINGS IN THE SOFTWARE. */
#define HTS_COMPILER_HAS(attribute) __has_attribute(attribute)
#endif

#ifdef __has_builtin
#define HTS_COMPILER_HAS_BUILTIN(function) __has_builtin(function)
#endif

#elif defined __GNUC__
#define HTS_GCC_AT_LEAST(major, minor) \
(__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
Expand All @@ -46,10 +42,6 @@ DEALINGS IN THE SOFTWARE. */
#ifndef HTS_COMPILER_HAS
#define HTS_COMPILER_HAS(attribute) 0
#endif
#ifndef HTS_COMPILER_HAS_BUILTIN
#define HTS_COMPILER_HAS_BUILTIN(function) 0
#endif

#ifndef HTS_GCC_AT_LEAST
#define HTS_GCC_AT_LEAST(major, minor) 0
#endif
Expand Down Expand Up @@ -126,16 +118,6 @@ DEALINGS IN THE SOFTWARE. */
#define HTS_FORMAT(type, idx, first)
#endif

#define HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
((HTS_COMPILER_HAS(target) && HTS_COMPILER_HAS_BUILTIN(__builtin_cpu_supports)) \
|| HTS_GCC_AT_LEAST(4, 8))

#if (defined(__x86_64__) || defined(_M_X64))
#define HTS_BUILD_IS_X86_64 1
#else
#define HTS_BUILD_IS_X86_64 0
#endif

#if defined(_WIN32) || defined(__CYGWIN__)
#if defined(HTS_BUILDING_LIBRARY)
#define HTSLIB_EXPORT __declspec(dllexport)
Expand Down
91 changes: 12 additions & 79 deletions sam_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ DEALINGS IN THE SOFTWARE. */

#include <errno.h>
#include <stdint.h>
#include "htslib/hts_defs.h"

#include "htslib/sam.h"

#ifdef __cplusplus
Expand Down Expand Up @@ -99,88 +99,21 @@ static inline void nibble2base_default(uint8_t *nib, char *seq, int len) {
seq[i] = seq_nt16_str[bam_seqi(nib, i)];
}

#if HTS_BUILD_IS_X86_64 \
&& HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \
&& HAVE_BUILTIN_CPU_SUPPORT_SSSE3
#include "immintrin.h"
/*
* Convert a nibble encoded BAM sequence to a string of bases.
*
* Using SSSE3 instructions, 16 codepoints that hold 2 bases each can be
* unpacked into 32 indexes from 0-15. Using the pshufb instruction these can
* be converted to the IUPAC characters.
* It falls back on the nibble2base_default function for the remainder.
*/

__attribute__((target("ssse3")))
static inline void nibble2base_ssse3(uint8_t *nib, char *seq, int len) {
seq[0] = 0;
const char *seq_end_ptr = seq + len;
char *seq_cursor = seq;
uint8_t *nibble_cursor = nib;
const char *seq_vec_end_ptr = seq_end_ptr - (2 * sizeof(__m128i));
__m128i first_upper_shuffle = _mm_setr_epi8(
0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1);
__m128i first_lower_shuffle = _mm_setr_epi8(
-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7);
__m128i second_upper_shuffle = _mm_setr_epi8(
8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1);
__m128i second_lower_shuffle = _mm_setr_epi8(
-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15);
__m128i nuc_lookup_vec = _mm_lddqu_si128((__m128i *)seq_nt16_str);
/* Work on 16 encoded characters at the time resulting in 32 decoded characters
Examples are given for 8 encoded characters A until H to keep it readable.
Encoded stored as |AB|CD|EF|GH|
Shuffle into |AB|00|CD|00|EF|00|GH|00| and
|00|AB|00|CD|00|EF|00|GH|
Shift upper to the right resulting into
|0A|B0|0C|D0|0E|F0|0G|H0| and
|00|AB|00|CD|00|EF|00|GH|
Merge with or resulting into (X stands for garbage)
|0A|XB|0C|XD|0E|XF|0G|XH|
Bitwise and with 0b1111 leads to:
|0A|0B|0C|0D|0E|0F|0G|0H|
We can use the resulting 4-bit integers as indexes for the shuffle of
the nucleotide lookup. */
while (seq_cursor < seq_vec_end_ptr) {
__m128i encoded = _mm_lddqu_si128((__m128i *)nibble_cursor);

__m128i first_upper = _mm_shuffle_epi8(encoded, first_upper_shuffle);
__m128i first_lower = _mm_shuffle_epi8(encoded, first_lower_shuffle);
__m128i shifted_first_upper = _mm_srli_epi64(first_upper, 4);
__m128i first_merged = _mm_or_si128(shifted_first_upper, first_lower);
__m128i first_indexes = _mm_and_si128(first_merged, _mm_set1_epi8(15));
__m128i first_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, first_indexes);
_mm_storeu_si128((__m128i *)seq_cursor, first_nucleotides);

__m128i second_upper = _mm_shuffle_epi8(encoded, second_upper_shuffle);
__m128i second_lower = _mm_shuffle_epi8(encoded, second_lower_shuffle);
__m128i shifted_second_upper = _mm_srli_epi64(second_upper, 4);
__m128i second_merged = _mm_or_si128(shifted_second_upper, second_lower);
__m128i second_indexes = _mm_and_si128(second_merged, _mm_set1_epi8(15));
__m128i second_nucleotides = _mm_shuffle_epi8(nuc_lookup_vec, second_indexes);
_mm_storeu_si128((__m128i *)(seq_cursor + 16), second_nucleotides);

nibble_cursor += sizeof(__m128i);
seq_cursor += 2 * sizeof(__m128i);
}
nibble2base_default(nibble_cursor, seq_cursor, seq_end_ptr - seq_cursor);
}

static void (*nibble2base)(uint8_t *nib, char *seq, int len) = nibble2base_default;

__attribute__((constructor))
static void nibble2base_resolve(void) {
if (__builtin_cpu_supports("ssse3")) {
nibble2base = nibble2base_ssse3;
}
}
#if defined HAVE_ATTRIBUTE_CONSTRUCTOR && \
((defined __x86_64__ && defined HAVE_ATTRIBUTE_TARGET && defined HAVE_BUILTIN_CPU_SUPPORT_SSSE3) || \
(defined __ARM_NEON))
#define BUILDING_SIMD_NIBBLE2BASE
#endif

#else
static inline void nibble2base(uint8_t *nib, char *seq, int len) {
#ifdef BUILDING_SIMD_NIBBLE2BASE
extern void (*htslib_nibble2base)(uint8_t *nib, char *seq, int len);
htslib_nibble2base(nib, seq, len);
#else
nibble2base_default(nib, seq, len);
}
#endif
}

#ifdef __cplusplus
}
#endif
Expand Down
Loading