From 22ac42320f16a173a90b4714041387209350bb08 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sat, 26 Oct 2024 12:01:07 -0700
Subject: [PATCH] lib/x86/crc32: target pclmul,sse4.1 instead of pclmul

In practice, all CPUs that support PCLMULQDQ also support SSE4.1:

    Intel: Westmere and later + Silvermont and later
    AMD: Bulldozer and later

Therefore, make crc32_x86_pclmulqdq() use SSE4.1 instructions.

To be safe, add an explicit check for SSE4.1 support.  Though as per
the above, this is unnecessary in practice (as far as I can tell).
---
 lib/x86/cpu_features.c          |  7 ++++++-
 lib/x86/cpu_features.h          |  3 ++-
 lib/x86/crc32_impl.h            | 24 ++++++++----------------
 lib/x86/crc32_pclmul_template.h | 32 +++++---------------------------
 4 files changed, 21 insertions(+), 45 deletions(-)
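[Editor's note, not part of the original patch: a minimal standalone sketch
of the CPUID test the patch now performs.  It assumes a GCC/Clang toolchain
(for <cpuid.h>); the program is illustrative and is not libdeflate code.]

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int a, b, c, d;

	if (!__get_cpuid(1, &a, &b, &c, &d))
		return 1;
	/* CPUID leaf 1, ECX bit 1 is pclmulqdq; ECX bit 19 is sse4.1. */
	int pclmulqdq = (c >> 1) & 1;
	int sse4_1 = (c >> 19) & 1;

	printf("pclmulqdq=%d sse4.1=%d\n", pclmulqdq, sse4_1);
	/* The patched code sets X86_CPU_FEATURE_PCLMULQDQ only if both are 1. */
	printf("X86_CPU_FEATURE_PCLMULQDQ: %s\n",
	       (pclmulqdq && sse4_1) ? "set" : "clear");
	return 0;
}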
diff --git a/lib/x86/cpu_features.c b/lib/x86/cpu_features.c
index d1c648b4..1f956999 100644
--- a/lib/x86/cpu_features.c
+++ b/lib/x86/cpu_features.c
@@ -140,7 +140,12 @@ void libdeflate_init_x86_cpu_features(void)
 	family += (a >> 20) & 0xff;
 	if (d & (1 << 26))
 		features |= X86_CPU_FEATURE_SSE2;
-	if (c & (1 << 1))
+	/*
+	 * No known CPUs have pclmulqdq without sse4.1, so in practice code
+	 * targeting pclmulqdq can use sse4.1 instructions.  But to be safe,
+	 * explicitly check for both the pclmulqdq and sse4.1 bits.
+	 */
+	if ((c & (1 << 1)) && (c & (1 << 19)))
 		features |= X86_CPU_FEATURE_PCLMULQDQ;
 	if (c & (1 << 27))
 		xcr0 = read_xcr(0);
diff --git a/lib/x86/cpu_features.h b/lib/x86/cpu_features.h
index e70bc80f..e1fd4a03 100644
--- a/lib/x86/cpu_features.h
+++ b/lib/x86/cpu_features.h
@@ -108,7 +108,8 @@ static inline u32 get_x86_cpu_features(void) { return 0; }
 # define HAVE_SSE2_NATIVE	0
 #endif
 
-#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+#if (defined(__PCLMUL__) && defined(__SSE4_1__)) || \
+	(defined(_MSC_VER) && defined(__AVX2__))
 # define HAVE_PCLMULQDQ(features)	1
 #else
 # define HAVE_PCLMULQDQ(features)	((features) & X86_CPU_FEATURE_PCLMULQDQ)
diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
index 1747aa85..eda7b4cb 100644
--- a/lib/x86/crc32_impl.h
+++ b/lib/x86/crc32_impl.h
@@ -44,31 +44,26 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 };
 
 #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-/* PCLMULQDQ implementation */
+/*
+ * PCLMULQDQ implementation.  This targets PCLMULQDQ+SSE4.1, since in practice
+ * all CPUs that support PCLMULQDQ also support SSE4.1.
+ */
 # define crc32_x86_pclmulqdq	crc32_x86_pclmulqdq
 # define SUFFIX			_pclmulqdq
-# define ATTRIBUTES		_target_attribute("pclmul")
+# define ATTRIBUTES		_target_attribute("pclmul,sse4.1")
 # define VL			16
-# define USE_SSE4_1		0
 # define USE_AVX512		0
 # include "crc32_pclmul_template.h"
 
 /*
- * PCLMULQDQ/AVX implementation.  Compared to the regular PCLMULQDQ
- * implementation, this still uses 128-bit vectors, but it has two potential
- * benefits.  First, simply compiling against the AVX target can improve
- * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without
- * actually using any AVX intrinsics, probably due to the availability of
- * non-destructive VEX-encoded instructions.  Second, AVX support implies SSSE3
- * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient
- * handling of partial blocks.  (We *could* compile a variant with
- * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we currently don't bother.)
+ * PCLMULQDQ/AVX implementation.  Same as above, but this is compiled with AVX
+ * enabled so that the compiler can generate VEX-coded instructions which can be
+ * slightly more efficient.  It still uses 128-bit vectors.
  */
 # define crc32_x86_pclmulqdq_avx	crc32_x86_pclmulqdq_avx
 # define SUFFIX			_pclmulqdq_avx
 # define ATTRIBUTES		_target_attribute("pclmul,avx")
 # define VL			16
-# define USE_SSE4_1		1
 # define USE_AVX512		0
 # include "crc32_pclmul_template.h"
 #endif
@@ -90,7 +85,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX			_vpclmulqdq_avx2
 # define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx2")
 # define VL			32
-# define USE_SSE4_1		1
 # define USE_AVX512		0
 # include "crc32_pclmul_template.h"
 #endif
@@ -108,7 +102,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX			_vpclmulqdq_avx512_vl256
 # define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" NO_EVEX512)
 # define VL			32
-# define USE_SSE4_1		1
 # define USE_AVX512		1
 # include "crc32_pclmul_template.h"
 
@@ -121,7 +114,6 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 # define SUFFIX			_vpclmulqdq_avx512_vl512
 # define ATTRIBUTES		_target_attribute("vpclmulqdq,pclmul,avx512bw,avx512vl" EVEX512)
 # define VL			64
-# define USE_SSE4_1		1
 # define USE_AVX512		1
 # include "crc32_pclmul_template.h"
 #endif
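[Editor's note, not part of the original patch: a sketch of why the
ATTRIBUTES string above matters.  With _target_attribute("pclmul") alone,
the SSE4.1 intrinsic below would not compile on GCC/Clang; adding sse4.1 to
the target makes both instruction sets usable in one function.  Assumes
x86-64 and a CPU with PCLMULQDQ+SSE4.1 at runtime; the program is
illustrative, not libdeflate code.]

#include <stdio.h>
#include <immintrin.h>

__attribute__((target("pclmul,sse4.1")))
static unsigned long long clmul_low(unsigned long long x, unsigned long long y)
{
	__m128i a = _mm_cvtsi64_si128((long long)x);
	__m128i b = _mm_cvtsi64_si128((long long)y);
	/* pclmul: carryless multiply of the low 64-bit halves */
	__m128i prod = _mm_clmulepi64_si128(a, b, 0x00);
	/* sse4.1: pextrq of the low 64 bits */
	return (unsigned long long)_mm_extract_epi64(prod, 0);
}

int main(void)
{
	/* Over GF(2), (x+1)*(x+1) = x^2+1, so 0x3 carryless-times 0x3 = 0x5. */
	printf("0x%llx\n", clmul_low(3, 3));
	return 0;
}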
diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
index 0fe38f8a..34099f84 100644
--- a/lib/x86/crc32_pclmul_template.h
+++ b/lib/x86/crc32_pclmul_template.h
@@ -34,17 +34,13 @@
  * ATTRIBUTES:
  *	Target function attributes to use.  Must satisfy the dependencies of the
  *	other parameters as follows:
- *	VL=16 && USE_SSE4_1=0 && USE_AVX512=0: at least pclmul
- *	VL=16 && USE_SSE4_1=1 && USE_AVX512=0: at least pclmul,sse4.1
- *	VL=32 && USE_SSE4_1=1 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
- *	VL=32 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
- *	VL=64 && USE_SSE4_1=1 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	VL=16 && USE_AVX512=0: at least pclmul,sse4.1
+ *	VL=32 && USE_AVX512=0: at least vpclmulqdq,pclmul,avx2
+ *	VL=32 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
+ *	VL=64 && USE_AVX512=1: at least vpclmulqdq,pclmul,avx512bw,avx512vl
  *	(Other combinations are not useful and have not been tested.)
  * VL:
  *	Vector length in bytes.  Must be 16, 32, or 64.
- * USE_SSE4_1:
- *	If 1, take advantage of SSE4.1 instructions such as pblendvb.
- *	If 0, assume that the CPU might not support SSE4.1.
  * USE_AVX512:
  *	If 1, take advantage of AVX-512 features such as masking and the
  *	vpternlog instruction.  This doesn't enable the use of 512-bit vectors;
@@ -149,7 +145,6 @@ ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i /* __v8du */ mults)
 #define fold_vec512	ADD_SUFFIX(fold_vec512)
 #endif /* VL >= 64 */
 
-#if USE_SSE4_1
 /*
  * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to
  * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and
@@ -181,7 +176,6 @@ ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len,
 	return fold_vec128(x0, x1, mults_128b);
 }
 #define fold_lessthan16bytes	ADD_SUFFIX(fold_lessthan16bytes)
-#endif /* USE_SSE4_1 */
 
 static ATTRIBUTES u32
 ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
@@ -273,7 +267,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 		size_t align = -(uintptr_t)p & (VL-1);
 
 		len -= align;
-	#if USE_SSE4_1
 		x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), x0);
 		p += 16;
 		if (align & 15) {
@@ -296,11 +289,6 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 			v0 = _mm512_inserti64x4(v0, *(const __m256i *)(p + 16), 1);
 # endif
 		p -= 16;
-	#else
-		crc = crc32_slice1(crc, p, align);
-		p += align;
-		v0 = VXOR(VLOADU(p), M128I_TO_VEC(_mm_cvtsi32_si128(crc)));
-	#endif
 	} else {
 		v0 = VXOR(VLOADU(p), M128I_TO_VEC(x0));
 	}
@@ -399,10 +387,8 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	 * If fold_lessthan16bytes() is available, handle any remainder
 	 * of 1 to 15 bytes now, before reducing to 32 bits.
 	 */
-#if USE_SSE4_1
 	if (len)
 		x0 = fold_lessthan16bytes(x0, p, len, mults_128b);
-#endif
 #if USE_AVX512
 reduce_x0:
 #endif
@@ -467,14 +453,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32),
 				  barrett_reduction_constants, 0x10);
 	x0 = _mm_xor_si128(x0, x1);
-#if USE_SSE4_1
-	crc = _mm_extract_epi32(x0, 1);
-#else
-	crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));
-	/* Process up to 15 bytes left over at the end. */
-	crc = crc32_slice1(crc, p, len);
-#endif
-	return crc;
+	return _mm_extract_epi32(x0, 1);
 }
 
 #undef vec_t
@@ -491,5 +470,4 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 #undef SUFFIX
 #undef ATTRIBUTES
 #undef VL
-#undef USE_SSE4_1
 #undef USE_AVX512
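[Editor's note, not part of the original patch: the last hunk removes the
SSE2-only tail, which extracted the CRC with pshufd+movd and then ran
crc32_slice1() over any leftover bytes.  With SSE4.1 guaranteed, the
remainder is always folded by fold_lessthan16bytes() and the CRC is taken
with a single pextrd.  The sketch below (illustrative, not libdeflate code)
just checks that the two extraction sequences read the same 32-bit lane.]

#include <assert.h>
#include <stdio.h>
#include <immintrin.h>

__attribute__((target("sse4.1")))
static unsigned int extract_lane1_both_ways(void)
{
	__m128i x0 = _mm_setr_epi32(0x11111111, 0x22222222,
				    0x33333333, 0x44444444);

	/* Path kept by the patch: SSE4.1 pextrd of 32-bit lane 1. */
	unsigned int crc_new = (unsigned int)_mm_extract_epi32(x0, 1);

	/* Path removed by the patch: pshufd lane 1 into lane 0, then movd. */
	unsigned int crc_old =
		(unsigned int)_mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01));

	assert(crc_new == crc_old);
	return crc_new;
}

int main(void)
{
	printf("0x%08x\n", extract_lane1_both_ways());	/* prints 0x22222222 */
	return 0;
}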