lib/x86/adler32: remove the AVX-512BW implementation
This code doesn't seem worthwhile to keep around, given that using zmm
registers incurs a frequency drop, which on some CPU models is very
substantial. It might look like a win in microbenchmarks, but the
benefit is lost to the frequency drop's effect on other workloads.
Also, starting with Alder Lake, Intel is removing AVX-512 from its
client CPUs. Finally, there's still room for improvement in the AVX2
implementation.

This code could come back later, but let's drop it for now.
ebiggers committed Jun 11, 2022
1 parent 471d331 commit 416bac3
Showing 5 changed files with 4 additions and 114 deletions.
79 changes: 3 additions & 76 deletions lib/x86/adler32_impl.h
@@ -168,7 +168,7 @@ adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* HAVE_SSE2_INTRIN */

/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
/* AVX2 implementation, processes 32 bytes at a time */
#if HAVE_AVX2_INTRIN
# define adler32_avx2 adler32_avx2
# define FUNCNAME adler32_avx2
@@ -233,87 +233,14 @@ adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX2_INTRIN */

/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
#if HAVE_AVX512BW_INTRIN
# define adler32_avx512bw adler32_avx512bw
# define FUNCNAME adler32_avx512bw
# define FUNCNAME_CHUNK adler32_avx512bw_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_LEN 64
# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
# if HAVE_AVX512BW_NATIVE
# define ATTRIBUTES
# else
# define ATTRIBUTES __attribute__((target("avx512bw")))
# endif
# include <immintrin.h>
static forceinline ATTRIBUTES void
adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
                       u32 *s1, u32 *s2)
{
        const __m512i zeroes = _mm512_setzero_si512();
        const __v64qi multipliers = (__v64qi){
                64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
                48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
                32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
                16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
        };
        const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
        __v16si v_s1 = (__v16si)zeroes;
        __v16si v_s1_sums = (__v16si)zeroes;
        __v16si v_s2 = (__v16si)zeroes;

        do {
                /* Load the next 64-byte segment. */
                __m512i bytes = *p++;
                /*
                 * Multiply the bytes by 64...1 (the number of times they need
                 * to be added to s2) and add adjacent products.
                 */
                __v32hi sums = (__v32hi)_mm512_maddubs_epi16(
                                        bytes, (__m512i)multipliers);
                /*
                 * Keep sum of all previous s1 counters, for adding to s2 later.
                 * This allows delaying the multiplication by 64 to the end.
                 */
                v_s1_sums += v_s1;
                /*
                 * Add the sum of each group of 8 bytes to the corresponding s1
                 * counter.
                 */
                v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
                /*
                 * Add the sum of each group of 4 products of the bytes by
                 * 64...1 to the corresponding s2 counter.
                 */
                v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
                                                   (__m512i)ones);
        } while (p != end);

        /*
         * Finish the s2 counters by adding the sum of the s1 values at the
         * beginning of each segment, multiplied by the segment length (64).
         */
        v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);

        /* Add the counters to the real s1 and s2. */
        ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* HAVE_AVX512BW_INTRIN */

#if defined(adler32_avx512bw) && HAVE_AVX512BW_NATIVE
#define DEFAULT_IMPL adler32_avx512bw
#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
#define DEFAULT_IMPL adler32_avx2
#else
static inline adler32_func_t
arch_select_adler32_func(void)
{
        const u32 features MAYBE_UNUSED = get_x86_cpu_features();

#ifdef adler32_avx512bw
        if (HAVE_AVX512BW(features))
                return adler32_avx512bw;
#endif
#ifdef adler32_avx2
        if (HAVE_AVX2(features))
                return adler32_avx2;
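
For reference, the removed vector routine computes, per 64-byte segment, the same update a scalar Adler-32 loop would. Below is a hedged scalar sketch of that computation; the function name and standalone framing are illustrative, not from the commit. Like the vector code, it defers multiplying the running s1 sums by the segment length (64) until the end, and like the real code it assumes the caller caps the chunk length so the u32 counters cannot overflow (MAX_CHUNK_LEN) and performs the reductions mod 65521 afterwards, as adler32_vec_template.h does.

#include <stdint.h>

typedef uint8_t u8;
typedef uint32_t u32;

/*
 * Scalar equivalent of one vectorized chunk: p..end spans a whole number
 * of 64-byte segments.  *s1 and *s2 are the unreduced Adler-32 sums.
 */
static void
adler32_chunk_scalar(const u8 *p, const u8 *const end, u32 *s1, u32 *s2)
{
        u32 v_s1 = 0, v_s1_sums = 0, v_s2 = 0;

        /* Every byte in the chunk adds the incoming s1 to s2 exactly once. */
        *s2 += *s1 * (u32)(end - p);

        do {
                u32 byte_sum = 0;       /* vector code: _mm512_sad_epu8 */
                u32 weighted = 0;       /* vector code: maddubs by 64...1, then madd */
                int i;

                for (i = 0; i < 64; i++) {
                        byte_sum += p[i];
                        /* byte i is added to s2 (64 - i) times within this segment */
                        weighted += (u32)p[i] * (u32)(64 - i);
                }
                v_s1_sums += v_s1;      /* s1 counter value at segment start */
                v_s1 += byte_sum;
                v_s2 += weighted;
                p += 64;
        } while (p != end);

        /*
         * Each deferred s1 value feeds into s2 once per later byte, i.e. 64
         * times per later segment; hence the single shift by 6 at the end.
         */
        v_s2 += v_s1_sums << 6;

        *s1 += v_s1;
        *s2 += v_s2;
}
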
12 changes: 0 additions & 12 deletions lib/x86/cpu_features.c
@@ -80,7 +80,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
        {X86_CPU_FEATURE_AVX, "avx"},
        {X86_CPU_FEATURE_AVX2, "avx2"},
        {X86_CPU_FEATURE_BMI2, "bmi2"},
        {X86_CPU_FEATURE_AVX512BW, "avx512bw"},
};

volatile u32 libdeflate_x86_cpu_features = 0;
@@ -93,7 +92,6 @@ void libdeflate_init_x86_cpu_features(void)
        u32 max_function;
        u32 features_1, features_2, features_3, features_4;
        bool os_avx_support = false;
        bool os_avx512_support = false;

        /* Get maximum supported function */
        cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
@@ -115,13 +113,6 @@
                os_avx_support = IS_ALL_SET(xcr0,
                                            XCR0_BIT_SSE |
                                            XCR0_BIT_AVX);

                os_avx512_support = IS_ALL_SET(xcr0,
                                               XCR0_BIT_SSE |
                                               XCR0_BIT_AVX |
                                               XCR0_BIT_OPMASK |
                                               XCR0_BIT_ZMM_HI256 |
                                               XCR0_BIT_HI16_ZMM);
        }

        if (os_avx_support && IS_SET(features_2, 28))
@@ -139,9 +130,6 @@
        if (IS_SET(features_3, 8))
                features |= X86_CPU_FEATURE_BMI2;

        if (os_avx512_support && IS_SET(features_3, 30))
                features |= X86_CPU_FEATURE_AVX512BW;

out:
        disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
                                         ARRAY_LEN(x86_cpu_feature_table));
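
The deleted lines above were the OS-support half of AVX-512 detection: beyond the CPUID feature bit, XCR0 must show that the OS context-switches the opmask registers and the upper ZMM state. A minimal sketch of such a check follows, assuming GNU-style inline asm on x86; the helper and function names are invented for illustration, and the real code defines its own XCR0_BIT_* constants and xgetbv wrapper in this file and runs the check only after CPUID reports OSXSAVE.

#include <stdbool.h>
#include <stdint.h>

/*
 * Read extended control register 'xcr'; XCR0 holds the OS-enabled state
 * components.  Raw-byte encoding of xgetbv, so no compiler flag is needed.
 */
static inline uint64_t
read_xcr(uint32_t xcr)
{
        uint32_t lo, hi;

        __asm__ (".byte 0x0f, 0x01, 0xd0" /* xgetbv */
                 : "=a" (lo), "=d" (hi) : "c" (xcr));
        return ((uint64_t)hi << 32) | lo;
}

/* XCR0 bit assignments, per the Intel SDM: */
#define XCR0_BIT_SSE            0x02    /* XMM registers */
#define XCR0_BIT_AVX            0x04    /* upper 128 bits of YMM */
#define XCR0_BIT_OPMASK         0x20    /* k0-k7 mask registers */
#define XCR0_BIT_ZMM_HI256      0x40    /* upper 256 bits of ZMM0-15 */
#define XCR0_BIT_HI16_ZMM       0x80    /* ZMM16-31 */

static bool
os_saves_avx512_state(void)
{
        const uint64_t needed = XCR0_BIT_SSE | XCR0_BIT_AVX |
                                XCR0_BIT_OPMASK | XCR0_BIT_ZMM_HI256 |
                                XCR0_BIT_HI16_ZMM;

        return (read_xcr(0) & needed) == needed;
}
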
21 changes: 0 additions & 21 deletions lib/x86/cpu_features.h
@@ -44,14 +44,12 @@
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512BW 0x00000020

#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL))
#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
#define HAVE_AVX512BW(features) (HAVE_AVX512BW_NATIVE || ((features) & X86_CPU_FEATURE_AVX512BW))

#if HAVE_DYNAMIC_X86_CPU_FEATURES
#define X86_CPU_FEATURES_KNOWN 0x80000000
@@ -159,25 +157,6 @@ typedef char __v64qi __attribute__((__vector_size__(64)));
        (HAVE_DYNAMIC_X86_CPU_FEATURES && \
         (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)))

/* AVX-512BW */
#ifdef __AVX512BW__
# define HAVE_AVX512BW_NATIVE 1
#else
# define HAVE_AVX512BW_NATIVE 0
#endif
#define HAVE_AVX512BW_TARGET \
        (HAVE_DYNAMIC_X86_CPU_FEATURES && \
         (GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512)))
/*
* clang originally added AVX-512BW support without defining
* __builtin_ia32_psadbw512 and the corresponding _mm512_sad_epu8 intrinsic. So
* the condition below is a bit different from usual; it ensures we do the
* __has_builtin check even if __AVX512BW__ is defined.
*/
#define HAVE_AVX512BW_INTRIN \
        (HAVE_AVX512BW_TARGET && \
         (HAVE_TARGET_INTRINSICS || HAVE_AVX512BW_NATIVE))

#endif /* __i386__ || __x86_64__ */

#endif /* LIB_X86_CPU_FEATURES_H */
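
For context on the three macros deleted above: every SIMD feature in this header follows a three-level pattern, distinguishing "compiled in natively", "buildable behind a target attribute", and "intrinsics actually usable". Here is a hedged sketch of that pattern for a hypothetical feature FOO; the names __FOO__, -mfoo, and COMPILER_SUPPORTS_FOO are placeholders, and the deleted AVX512BW macros instantiated the same shape plus the clang __has_builtin workaround noted in the removed comment.

/* 1. Native: the whole build enables the feature (e.g. -mfoo defines
 *    __FOO__), so it may be used unconditionally and can become
 *    DEFAULT_IMPL. */
#ifdef __FOO__
#  define HAVE_FOO_NATIVE       1
#else
#  define HAVE_FOO_NATIVE       0
#endif

/* 2. Target: the compiler can build a single function with the feature via
 *    __attribute__((target("foo"))), enabling runtime dispatch from a
 *    generic build.  COMPILER_SUPPORTS_FOO stands in for the real compiler
 *    version and __has_builtin checks. */
#define HAVE_FOO_TARGET \
        (HAVE_DYNAMIC_X86_CPU_FEATURES && COMPILER_SUPPORTS_FOO)

/* 3. Intrinsics: the intrinsics themselves compile, either because target
 *    attributes work with intrinsics on this compiler
 *    (HAVE_TARGET_INTRINSICS) or because the build is native.
 *    Implementations are guarded by this macro. */
#define HAVE_FOO_INTRIN \
        (HAVE_FOO_TARGET && (HAVE_TARGET_INTRINSICS || HAVE_FOO_NATIVE))
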
4 changes: 0 additions & 4 deletions scripts/checksum_benchmarks.sh
@@ -136,10 +136,6 @@ echo
{
case $ARCH in
i386|x86_64)
        if have_cpu_feature avx512bw; then
                do_benchmark "AVX-512BW"
                disable_cpu_feature "avx512bw" "-mno-avx512bw"
        fi
        if have_cpu_feature avx2; then
                do_benchmark "AVX2"
                disable_cpu_feature "avx2" "-mno-avx2"
2 changes: 1 addition & 1 deletion scripts/run_tests.sh
@@ -128,7 +128,7 @@ build_and_run_tests() {
if ! [[ "$CFLAGS" =~ "-march=native" ]] && ! $quick; then
case "$ARCH" in
i386|x86_64)
features+=(avx512bw avx2 avx bmi2 pclmul sse2)
features+=(avx2 avx bmi2 pclmul sse2)
;;
arm*|aarch*)
features+=(sha3 crc32 pmull neon)
