From 22ac42320f16a173a90b4714041387209350bb08 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sat, 26 Oct 2024 12:01:07 -0700
Subject: [PATCH] lib/x86/crc32: target pclmul,sse4.1 instead of pclmul

In practice, all CPUs that support PCLMULQDQ also support SSE4.1:

    Intel: Westmere and later + Silvermont and later
    AMD: Bulldozer and later

Therefore, make crc32_x86_pclmulqdq() use SSE4.1 instructions.

To be safe, add an explicit check for SSE4.1 support.  Though as per
the above, this is unnecessary in practice (as far as I can tell).
---
 lib/x86/cpu_features.c          |  7 ++++++-
 lib/x86/cpu_features.h          |  3 ++-
 lib/x86/crc32_impl.h            | 24 ++++++++----------------
 lib/x86/crc32_pclmul_template.h | 32 +++++---------------------------
 4 files changed, 21 insertions(+), 45 deletions(-)
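[Editor's note, not part of the original patch: a minimal standalone sketch
of the CPUID test the patch now performs.  It assumes a GCC/Clang toolchain
(for <cpuid.h>); the program is illustrative and is not libdeflate code.]

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int a, b, c, d;

	if (!__get_cpuid(1, &a, &b, &c, &d))
		return 1;
	/* CPUID leaf 1, ECX bit 1 is pclmulqdq; ECX bit 19 is sse4.1. */
	int pclmulqdq = (c >> 1) & 1;
	int sse4_1 = (c >> 19) & 1;

	printf("pclmulqdq=%d sse4.1=%d\n", pclmulqdq, sse4_1);
	/* The patched code sets X86_CPU_FEATURE_PCLMULQDQ only if both are 1. */
	printf("X86_CPU_FEATURE_PCLMULQDQ: %s\n",
	       (pclmulqdq && sse4_1) ? "set" : "clear");
	return 0;
}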
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index d1c648b4..1f956999 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -140,7 +140,12 @@ void libdeflate_init_x86_cpu_features(void)
 	family += (a >> 20) & 0xff;
 	if (d & (1 << 26))
 		features |= X86_CPU_FEATURE_SSE2;
-	if (c & (1 << 1))
+	/*
+	 * No known CPUs have pclmulqdq without sse4.1, so in practice code
+	 * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
+	 * explicitly check for both the pclmulqdq and sse4.1 bits.
+	 */
+	if ((c & (1 << 1)) && (c & (1 << 19)))
 		features |= X86_CPU_FEATURE_PCLMULQDQ;
 	if (c & (1 << 27))
 		xcr0 = read_xcr(0);
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index e70bc80f..e1fd4a03 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_SSE2_NATIVE	0
 #endif
 
-#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
+	(defined(_MSC_VER) && defined(__AVX2__))
 # define HAVE_PCLMULQDQ(features)	1
 #else
 # define HAVE_PCLMULQDQ(features)	((features) & X86_CPU_FEATURE_PCLMULQDQ)
diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
index 1747aa85..eda7b4cb 100644
--- a/lib/x86/crc32_impl.h
+++ b/lib/x86/crc32_impl.h
@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 };
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-/* PCLMULQDQ implementation */
+/*
+ * PCLMULQDQ implementation.  This targets PCLMULQDQ+SSE4.1, since in practice
+ * all CPUs that support PCLMULQDQ also support SSE4.1.
+ */
 # define crc32_x86_pclmulqdq	crc32_x86_pclmulqdq
 # define SUFFIX			_pclmulqdq
-# define ATTRIBUTES		_target_attribute("pclmul")
+# define ATTRIBUTES		_target_attribute("pclmul,sse4.1")
 # define VL			16
-# define USE_SSE4_1		0
 # define USE_AVX512		0
 # include "crc32_pclmul_template.h"
 
 /*
- * PCLMULQDQ/AVX implementation.  Compared to the regular PCLMULQDQ
- * implementation, this still uses 128-bit vectors, but it has two potential
- * benefits.  First, simply compiling against the AVX target can improve
- * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
- * actually using any AVX intrinsics, probably due to the availability of
- * non-destructive VEX-encoded instructions.  Second, AVX support implies SSSE3
- * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
- * handling of partial blocks.  (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
+ * PCLMULQDQ/AVX implementation.  Same as above, but this is compiled with AVX
+ * enabled so that the compiler can generate VEX-coded instructions which can be
+ * slightly more efficient.  It still uses 128-bit vectors.
  */
 # define crc32_x86_pclmulqdq_avx	crc32_x86_pclmulqdq_avx
 # define SUFFIX			_pclmulqdq_avx
 # define ATTRIBUTES		_target_attribute("pclmul,avx")
 # define VL			16
-# define USE_SSE4_1		1
 # define USE_AVX512		0
 # include "crc32_pclmul_template.h"
 #endif
@@ -90,7 +85,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX			_vpclmulqdq_avx2
 # define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx2")
 # define VL			32
-# define USE_SSE4_1		1
 # define USE_AVX512		0
 # include "crc32_pclmul_template.h"
 #endif
@@ -108,7 +102,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX			_vpclmulqdq_avx512_vl256
 # define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" NO_EVEX512)
 # define VL			32
-# define USE_SSE4_1		1
 # define USE_AVX512		1
 # include "crc32_pclmul_template.h"
 
@@ -121,7 +114,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX			_vpclmulqdq_avx512_vl512
 # define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" EVEX512)
 # define VL			64
-# define USE_SSE4_1		1
 # define USE_AVX512		1
 # include "crc32_pclmul_template.h"
 #endif
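[Editor's note, not part of the original patch: a sketch of why the
ATTRIBUTES string above matters.  With _target_attribute("pclmul") alone,
the SSE4.1 intrinsic below would not compile on GCC/Clang; adding sse4.1 to
the target makes both instruction sets usable in one function.  Assumes
x86-64 and a CPU with PCLMULQDQ+SSE4.1 at runtime; the program is
illustrative, not libdeflate code.]

#include <stdio.h>
#include <immintrin.h>

__attribute__((target("pclmul,sse4.1")))
static unsigned long long clmul_low(unsigned long long x, unsigned long long y)
{
	__m128i a = _mm_cvtsi64_si128((long long)x);
	__m128i b = _mm_cvtsi64_si128((long long)y);
	/* pclmul: carryless multiply of the low 64-bit halves */
	__m128i prod = _mm_clmulepi64_si128(a, b, 0x00);
	/* sse4.1: pextrq of the low 64 bits */
	return (unsigned long long)_mm_extract_epi64(prod, 0);
}

int main(void)
{
	/* Over GF(2), (x+1)*(x+1) = x^2+1, so 0x3 carryless-times 0x3 = 0x5. */
	printf("0x%llx\n", clmul_low(3, 3));
	return 0;
}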
diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
index 0fe38f8a..34099f84 100644
--- a/lib/x86/crc32_pclmul_template.h
+++ b/lib/x86/crc32_pclmul_template.h
@@ -34,17 +34,13 @@
  * ATTRIBUTES:
  *	Target function attributes to use.  Must satisfy the dependencies of the
  *	other parameters as follows:
- *	VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
- *	VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
- *	VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- *	VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
- *	VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+ *	VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ *	VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
  *	(Other combinations are not useful and have not been tested.)
  * VL:
  *	Vector length in bytes.  Must be 16, 32, or 64.
- * USE_SSE4_1:
- *	If 1, take advantage of SSE4.1 instructions such as pblendvb.
- *	If 0, assume that the CPU might not support SSE4.1.
  * USE_AVX512:
  *	If 1, take advantage of AVX-512 features such as masking and the
  *	vpternlog instruction.  This doesn't enable the use of 512-bit vectors;
@@ -149,7 +145,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
 #define fold_vec512	ADD_SUFFIX(fold_vec512)
 #endif /* VL >= 64 */
 
-#if USE_SSE4_1
 /*
  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
@@ -181,7 +176,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
 	return fold_vec128(x0, x1, mults_128b);
 }
 #define fold_lessthan16bytes	ADD_SUFFIX(fold_lessthan16bytes)
-#endif /* USE_SSE4_1 */
 
 static ATTRIBUTES u32
 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
@@ -273,7 +267,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 		size_t align = -(uintptr_t)p & (VL-1);
 
 		len -= align;
-	#if USE_SSE4_1
 		x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
 		p += 16;
 		if (align & 15) {
@@ -296,11 +289,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 			v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
 # endif
 		p -= 16;
-	#else
-		crc = crc32_slice1(crc, p, align);
-		p += align;
-		v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-	#endif
 	} else {
 		v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
 	}
@@ -399,10 +387,8 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	 * If fold_lessthan16bytes() is available, handle any remainder
 	 * of 1 to 15 bytes now, before reducing to 32 bits.
 	 */
-#if USE_SSE4_1
 	if (len)
 		x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
-#endif
 #if USE_AVX512
 reduce_x0:
 #endif
@@ -467,14 +453,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
 				  barrett_reduction_constants, 0x10);
 	x0 = _mm_xor_si128(x0, x1);
-#if USE_SSE4_1
-	crc = _mm_extract_epi32(x0, 1);
-#else
-	crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
-	/* Process up to 15 bytes left over at the end. */
-	crc = crc32_slice1(crc, p, len);
-#endif
-	return crc;
+	return _mm_extract_epi32(x0, 1);
 }
 
 #undef vec_t
@@ -491,5 +470,4 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
-#undef USE_SSE4_1
 #undef USE_AVX512
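[Editor's note, not part of the original patch: the last hunk removes the
SSE2-only tail, which extracted the CRC with pshufd+movd and then ran
crc32_slice1() over any leftover bytes.  With SSE4.1 guaranteed, the
remainder is always folded by fold_lessthan16bytes() and the CRC is taken
with a single pextrd.  The sketch below (illustrative, not libdeflate code)
just checks that the two extraction sequences read the same 32-bit lane.]

#include <assert.h>
#include <stdio.h>
#include <immintrin.h>

__attribute__((target("sse4.1")))
static unsigned int extract_lane1_both_ways(void)
{
	__m128i x0 = _mm_setr_epi32(0x11111111, 0x22222222,
				    0x33333333, 0x44444444);

	/* Path kept by the patch: SSE4.1 pextrd of 32-bit lane 1. */
	unsigned int crc_new = (unsigned int)_mm_extract_epi32(x0, 1);

	/* Path removed by the patch: pshufd lane 1 into lane 0, then movd. */
	unsigned int crc_old =
		(unsigned int)_mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));

	assert(crc_new == crc_old);
	return crc_new;
}

int main(void)
{
	printf("0x%08x\n", extract_lane1_both_ways());	/* prints 0x22222222 */
	return 0;
}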