lib/x86/adler32: remove the AVX-512BW implementation
This code doesn't seem worthwhile to keep around, given that using zmm
registers incurs a frequency drop, which on some CPU models is very
substantial. It might look like a win in microbenchmarks, but the
benefit is lost to the frequency drop's effect on other workloads.
Also, starting with Alder Lake, Intel is removing AVX-512 from its
client CPUs. Finally, there's still room for improvement in the AVX2
implementation.

This code could come back later, but let's drop it for now.
ebiggers committed Jun 11, 2022
1 parent 471d331 commit 416bac3
Showing 5 changed files with 4 additions and 114 deletions.
79 changes: 3 additions & 76 deletions lib/x86/adler32_impl.h
@@ -168,7 +168,7 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* HAVE_SSE2_INTRIN */

/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
/* AVX2 implementation, processes 32 bytes at a time */
#if HAVE_AVX2_INTRIN
# define adler32_avx2 adler32_avx2
# define FUNCNAME adler32_avx2
@@ -233,87 +233,14 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */

/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
#if HAVE_AVX512BW_INTRIN
# define adler32_avx512bw adler32_avx512bw
# define FUNCNAME adler32_avx512bw
# define FUNCNAME_CHUNK adler32_avx512bw_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_LEN 64
# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
# if HAVE_AVX512BW_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES __attribute__((target("avx512bw")))
# endif
# include <immintrin.h>
static forceinline ATTRIBUTES void
adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
                       u32 *s1, u32 *s2)
{
        const __m512i zeroes = _mm512_setzero_si512();
        const __v64qi multipliers = (__v64qi){
                64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
                48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
                32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
                16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
        };
        const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
        __v16si v_s1 = (__v16si)zeroes;
        __v16si v_s1_sums = (__v16si)zeroes;
        __v16si v_s2 = (__v16si)zeroes;

        do {
                /* Load the next 64-byte segment. */
                __m512i bytes = *p++;
                /*
                 * Multiply the bytes by 64...1 (the number of times they need
                 * to be added to s2) and add adjacent products.
                 */
                __v32hi sums = (__v32hi)_mm512_maddubs_epi16(
                                        bytes, (__m512i)multipliers);
                /*
                 * Keep sum of all previous s1 counters, for adding to s2 later.
                 * This allows delaying the multiplication by 64 to the end.
                 */
                v_s1_sums += v_s1;
                /*
                 * Add the sum of each group of 8 bytes to the corresponding s1
                 * counter.
                 */
                v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
                /*
                 * Add the sum of each group of 4 products of the bytes by
                 * 64...1 to the corresponding s2 counter.
                 */
                v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
                                                   (__m512i)ones);
        } while (p != end);

        /*
         * Finish the s2 counters by adding the sum of the s1 values at the
         * beginning of each segment, multiplied by the segment length (64).
         */
        v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);

        /* Add the counters to the real s1 and s2. */
        ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512BW_INTRIN */

#if defined(adler32_avx512bw) && HAVE_AVX512BW_NATIVE
#define DEFAULT_IMPL adler32_avx512bw
#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
#define DEFAULT_IMPL adler32_avx2
#else
static inline adler32_func_t
arch_select_adler32_func(void)
{
        const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef adler32_avx512bw
        if (HAVE_AVX512BW(features))
                return adler32_avx512bw;
#endif
#ifdef adler32_avx2
        if (HAVE_AVX2(features))
                return adler32_avx2;
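
For reference, the removed vector routine computes, per 64-byte segment, the same update a scalar Adler-32 loop would. Below is a hedged scalar sketch of that computation; the function name and standalone framing are illustrative, not from the commit. Like the vector code, it defers multiplying the running s1 sums by the segment length (64) until the end, and like the real code it assumes the caller caps the chunk length so the u32 counters cannot overflow (MAX_CHUNK_LEN) and performs the reductions mod 65521 afterwards, as adler32_vec_template.h does.

#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;

/*
 * Scalar equivalent of one vectorized chunk: p..end spans a whole number
 * of 64-byte segments.  *s1 and *s2 are the unreduced Adler-32 sums.
 */
static void
adler32_chunk_scalar(const u8 *p, const u8 *const end, u32 *s1, u32 *s2)
{
        u32 v_s1 = 0, v_s1_sums = 0, v_s2 = 0;

        /* Every byte in the chunk adds the incoming s1 to s2 exactly once. */
        *s2 += *s1 * (u32)(end - p);

        do {
                u32 byte_sum = 0;       /* vector code: _mm512_sad_epu8 */
                u32 weighted = 0;       /* vector code: maddubs by 64...1, then madd */
                int i;

                for (i = 0; i < 64; i++) {
                        byte_sum += p[i];
                        /* byte i is added to s2 (64 - i) times within this segment */
                        weighted += (u32)p[i] * (u32)(64 - i);
                }
                v_s1_sums += v_s1;      /* s1 counter value at segment start */
                v_s1 += byte_sum;
                v_s2 += weighted;
                p += 64;
        } while (p != end);

        /*
         * Each deferred s1 value feeds into s2 once per later byte, i.e. 64
         * times per later segment; hence the single shift by 6 at the end.
         */
        v_s2 += v_s1_sums << 6;

        *s1 += v_s1;
        *s2 += v_s2;
}
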
12 changes: 0 additions & 12 deletions lib/x86/cpu_features.c
@@ -80,7 +80,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
        {X86_CPU_FEATURE_AVX, "avx"},
        {X86_CPU_FEATURE_AVX2, "avx2"},
        {X86_CPU_FEATURE_BMI2, "bmi2"},
        {X86_CPU_FEATURE_AVX512BW, "avx512bw"},
};

volatile u32 libdeflate_x86_cpu_features = 0;
@@ -93,7 +92,6 @@ void libdeflate_init_x86_cpu_features(void)
        u32 max_function;
        u32 features_1, features_2, features_3, features_4;
        bool os_avx_support = false;
        bool os_avx512_support = false;

        /* Get maximum supported function */
        cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
@@ -115,13 +113,6 @@
                os_avx_support = IS_ALL_SET(xcr0,
                                            XCR0_BIT_SSE |
                                            XCR0_BIT_AVX);

                os_avx512_support = IS_ALL_SET(xcr0,
                                               XCR0_BIT_SSE |
                                               XCR0_BIT_AVX |
                                               XCR0_BIT_OPMASK |
                                               XCR0_BIT_ZMM_HI256 |
                                               XCR0_BIT_HI16_ZMM);
        }

        if (os_avx_support && IS_SET(features_2, 28))
@@ -139,9 +130,6 @@
        if (IS_SET(features_3, 8))
                features |= X86_CPU_FEATURE_BMI2;

        if (os_avx512_support && IS_SET(features_3, 30))
                features |= X86_CPU_FEATURE_AVX512BW;

out:
        disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
                                         ARRAY_LEN(x86_cpu_feature_table));
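
The deleted lines above were the OS-support half of AVX-512 detection: beyond the CPUID feature bit, XCR0 must show that the OS context-switches the opmask registers and the upper ZMM state. A minimal sketch of such a check follows, assuming GNU-style inline asm on x86; the helper and function names are invented for illustration, and the real code defines its own XCR0_BIT_* constants and xgetbv wrapper in this file and runs the check only after CPUID reports OSXSAVE.

#include <stdbool.h>
#include <stdint.h>

/*
 * Read extended control register 'xcr'; XCR0 holds the OS-enabled state
 * components.  Raw-byte encoding of xgetbv, so no compiler flag is needed.
 */
static inline uint64_t
read_xcr(uint32_t xcr)
{
        uint32_t lo, hi;

        __asm__ (".byte 0x0f, 0x01, 0xd0" /* xgetbv */
                 : "=a" (lo), "=d" (hi) : "c" (xcr));
        return ((uint64_t)hi << 32) | lo;
}

/* XCR0 bit assignments, per the Intel SDM: */
#define XCR0_BIT_SSE            0x02    /* XMM registers */
#define XCR0_BIT_AVX            0x04    /* upper 128 bits of YMM */
#define XCR0_BIT_OPMASK         0x20    /* k0-k7 mask registers */
#define XCR0_BIT_ZMM_HI256      0x40    /* upper 256 bits of ZMM0-15 */
#define XCR0_BIT_HI16_ZMM       0x80    /* ZMM16-31 */

static bool
os_saves_avx512_state(void)
{
        const uint64_t needed = XCR0_BIT_SSE | XCR0_BIT_AVX |
                                XCR0_BIT_OPMASK | XCR0_BIT_ZMM_HI256 |
                                XCR0_BIT_HI16_ZMM;

        return (read_xcr(0) & needed) == needed;
}
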
21 changes: 0 additions & 21 deletions lib/x86/cpu_features.h
@@ -44,14 +44,12 @@
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512BW 0x00000020

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL))
#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW))

#if HAVE_DYNAMIC_X86_CPU_FEATURES
#define X86_CPU_FEATURES_KNOWN 0x80000000
@@ -159,25 +157,6 @@ typedef char __v64qi __attribute__((__vector_size__(64)));
        (HAVE_DYNAMIC_X86_CPU_FEATURES && \
         (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)))

/* AVX-512BW */
#ifdef __AVX512BW__
# define HAVE_AVX512BW_NATIVE 1
#else
# define HAVE_AVX512BW_NATIVE 0
#endif
#define HAVE_AVX512BW_TARGET \
        (HAVE_DYNAMIC_X86_CPU_FEATURES && \
         (GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512)))
/*
* clang originally added AVX-512BW support without defining
* __builtin_ia32_psadbw512 and the corresponding _mm512_sad_epu8 intrinsic. So
* the condition below is a bit different from usual; it ensures we do the
* __has_builtin check even if __AVX512BW__ is defined.
*/
#define HAVE_AVX512BW_INTRIN \
        (HAVE_AVX512BW_TARGET && \
         (HAVE_TARGET_INTRINSICS || HAVE_AVX512BW_NATIVE))

#endif /* __i386__ || __x86_64__ */

#endif /* LIB_X86_CPU_FEATURES_H */
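
For context on the three macros deleted above: every SIMD feature in this header follows a three-level pattern, distinguishing "compiled in natively", "buildable behind a target attribute", and "intrinsics actually usable". Here is a hedged sketch of that pattern for a hypothetical feature FOO; the names __FOO__, -mfoo, and COMPILER_SUPPORTS_FOO are placeholders, and the deleted AVX512BW macros instantiated the same shape plus the clang __has_builtin workaround noted in the removed comment.

/* 1. Native: the whole build enables the feature (e.g. -mfoo defines
 *    __FOO__), so it may be used unconditionally and can become
 *    DEFAULT_IMPL. */
#ifdef __FOO__
#  define HAVE_FOO_NATIVE       1
#else
#  define HAVE_FOO_NATIVE       0
#endif

/* 2. Target: the compiler can build a single function with the feature via
 *    __attribute__((target("foo"))), enabling runtime dispatch from a
 *    generic build.  COMPILER_SUPPORTS_FOO stands in for the real compiler
 *    version and __has_builtin checks. */
#define HAVE_FOO_TARGET \
        (HAVE_DYNAMIC_X86_CPU_FEATURES && COMPILER_SUPPORTS_FOO)

/* 3. Intrinsics: the intrinsics themselves compile, either because target
 *    attributes work with intrinsics on this compiler
 *    (HAVE_TARGET_INTRINSICS) or because the build is native.
 *    Implementations are guarded by this macro. */
#define HAVE_FOO_INTRIN \
        (HAVE_FOO_TARGET && (HAVE_TARGET_INTRINSICS || HAVE_FOO_NATIVE))
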
4 changes: 0 additions & 4 deletions scripts/checksum_benchmarks.sh
@@ -136,10 +136,6 @@ echo
{
case $ARCH in
i386|x86_64)
        if have_cpu_feature avx512bw; then
                do_benchmark "AVX-512BW"
                disable_cpu_feature "avx512bw" "-mno-avx512bw"
        fi
        if have_cpu_feature avx2; then
                do_benchmark "AVX2"
                disable_cpu_feature "avx2" "-mno-avx2"
2 changes: 1 addition & 1 deletion scripts/run_tests.sh
@@ -128,7 +128,7 @@ build_and_run_tests() {
if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
case "$ARCH" in
i386|x86_64)
features+=(avx512bw avx2 avx bmi2 pclmul sse2)
features+=(avx2 avx bmi2 pclmul sse2)
;;
arm*|aarch*)
features+=(sha3 crc32 pmull neon)
