lib/x86/crc32: more optimizations

- As was recently done in the Adler-32 code, take advantage of the fact that on recent x86 processors, vmovdqu with an aligned pointer is just as fast as vmovdqa. Don't waste time aligning the pointer unless the length is very large, and at the same time, handle all cases of len >= 8*VL using the main loop so that the 4*VL wide loop isn't needed. (Before, aligning the pointer was tied to whether the main loop was used or not, since the main loop used vmovdqa.)

- Handle short lengths more efficiently. Instead of falling back to crc32_slice1() for all len < VL, use AVX-512 masking (when available) to handle 4 <= len <= 15, and use 128-bit vector instructions to handle 16 <= len < VL.

- Document why the main loop uses a width of 8*VL instead of 4*VL.
ebiggers · Mar 12, 2024 · 5d15bce · 5d15bce
1 parent 8ae3a19
commit 5d15bce
Show file tree

Hide file tree

Showing 5 changed files with 148 additions and 137 deletions.
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
@@ -82,7 +82,6 @@ static const struct cpu_feature x86_cpu_feature_table[] = {
 	{X86_CPU_FEATURE_AVX2,		"avx2"},
 	{X86_CPU_FEATURE_BMI2,		"bmi2"},
 	{X86_CPU_FEATURE_ZMM,		"zmm"},
-	{X86_CPU_FEATURE_AVX512F,	"avx512f"},
 	{X86_CPU_FEATURE_AVX512BW,	"avx512bw"},
 	{X86_CPU_FEATURE_AVX512VL,	"avx512vl"},
 	{X86_CPU_FEATURE_VPCLMULQDQ,	"vpclmulqdq"},
@@ -163,8 +162,6 @@ void libdeflate_init_x86_cpu_features(void)
 	if (((xcr0 & 0xe6) == 0xe6) &&
 	    allow_512bit_vectors(manufacturer, family, model))
 		features |= X86_CPU_FEATURE_ZMM;
-	if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6))
-		features |= X86_CPU_FEATURE_AVX512F;
 	if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6))
 		features |= X86_CPU_FEATURE_AVX512BW;
 	if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6))

diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
@@ -39,17 +39,16 @@
 #define X86_CPU_FEATURE_BMI2		(1 << 4)
 /*
  * ZMM indicates whether 512-bit vectors (zmm registers) should be used.  On
- * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU
- * supports it, i.e. even if AVX512F is set.  On these CPUs, we may still use
- * AVX-512 instructions, but only with ymm and xmm registers.
+ * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU and
+ * operating system support AVX-512.  On these CPUs, we may still use AVX-512
+ * instructions, but only with xmm and ymm registers.
  */
 #define X86_CPU_FEATURE_ZMM		(1 << 5)
-#define X86_CPU_FEATURE_AVX512F		(1 << 6)
-#define X86_CPU_FEATURE_AVX512BW	(1 << 7)
-#define X86_CPU_FEATURE_AVX512VL	(1 << 8)
-#define X86_CPU_FEATURE_VPCLMULQDQ	(1 << 9)
-#define X86_CPU_FEATURE_AVX512VNNI	(1 << 10)
-#define X86_CPU_FEATURE_AVXVNNI		(1 << 11)
+#define X86_CPU_FEATURE_AVX512BW	(1 << 6)
+#define X86_CPU_FEATURE_AVX512VL	(1 << 7)
+#define X86_CPU_FEATURE_VPCLMULQDQ	(1 << 8)
+#define X86_CPU_FEATURE_AVX512VNNI	(1 << 9)
+#define X86_CPU_FEATURE_AVXVNNI		(1 << 10)
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 /* Runtime x86 CPU feature detection is supported. */
@@ -135,12 +134,6 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 #  define HAVE_BMI2_NATIVE		0
 #endif
 
-#ifdef __AVX512F__
-#  define HAVE_AVX512F(features)	1
-#else
-#  define HAVE_AVX512F(features)	((features) & X86_CPU_FEATURE_AVX512F)
-#endif
-
 #ifdef __AVX512BW__
 #  define HAVE_AVX512BW(features)	1
 #else

diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
@@ -30,6 +30,19 @@
 
 #include "cpu_features.h"
 
+/*
+ * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes.
+ * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes.
+ */
+static const u8 MAYBE_UNUSED shift_tab[48] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+};
+
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
 /* PCLMULQDQ implementation */
 #  define crc32_x86_pclmulqdq	crc32_x86_pclmulqdq
@@ -88,7 +101,7 @@
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl256  crc32_x86_vpclmulqdq_avx512_vl256
 #  define SUFFIX				      _vpclmulqdq_avx512_vl256
-#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512vl")
+#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 #  define VL			32
 #  define USE_SSE4_1		1
 #  define USE_AVX512		1
@@ -101,7 +114,7 @@
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl512  crc32_x86_vpclmulqdq_avx512_vl512
 #  define SUFFIX				      _vpclmulqdq_avx512_vl512
-#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512vl")
+#  define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl")
 #  define VL			64
 #  define USE_SSE4_1		1
 #  define USE_AVX512		1
@@ -116,12 +129,12 @@ arch_select_crc32_func(void)
 #ifdef crc32_x86_vpclmulqdq_avx512_vl512
 	if ((features & X86_CPU_FEATURE_ZMM) &&
 	    HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
-	    HAVE_AVX512F(features) && HAVE_AVX512VL(features))
+	    HAVE_AVX512BW(features) && HAVE_AVX512VL(features))
 		return crc32_x86_vpclmulqdq_avx512_vl512;
 #endif
 #ifdef crc32_x86_vpclmulqdq_avx512_vl256
 	if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) &&
-	    HAVE_AVX512F(features) && HAVE_AVX512VL(features))
+	    HAVE_AVX512BW(features) && HAVE_AVX512VL(features))
 		return crc32_x86_vpclmulqdq_avx512_vl256;
 #endif
 #ifdef crc32_x86_vpclmulqdq_avx2