From 1a6f436edd37998e1452341dffd03a8cfd7ab3b0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 26 Nov 2024 20:52:53 -0800 Subject: [PATCH] lib/x86/crc32: fix undefined behavior in VPCLMULQDQ optimized functions The specifications for _mm256_castsi128_si256() and _mm512_castsi128_si512() are bugged, as they leave the high bits undefined instead of zeroed as would be expected. Separate intrinsics _mm256_zextsi128_si256() and _mm512_zextsi128_si512() were later added to allow working around this defect. Use them. This fixes incorrect CRC checksums produced by crc32_x86_vpclmulqdq_avx512_vl512() when built with 'clang -O0'. Other cases are not known to have been affected. Resolves https://github.com/ebiggers/libdeflate/issues/403 Fixes: 5f2a0b4beca9 ("lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32") --- lib/x86/crc32_impl.h | 10 ++++++++-- lib/x86/crc32_pclmul_template.h | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h index eda7b4cb..c8909cde 100644 --- a/lib/x86/crc32_impl.h +++ b/lib/x86/crc32_impl.h @@ -78,8 +78,10 @@ static const u8 MAYBE_UNUSED shift_tab[48] = { * * gcc 8.1 and 8.2 had a similar bug where they assumed that * _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3. + * + * _mm256_zextsi128_si256() requires gcc 10. */ -#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \ +#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \ !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) # define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 # define SUFFIX _vpclmulqdq_avx2 @@ -89,7 +91,7 @@ static const u8 MAYBE_UNUSED shift_tab[48] = { # include "crc32_pclmul_template.h" #endif -#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \ +#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \ !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ) /* * VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar @@ -97,6 +99,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = { * instruction and more registers. This is used on CPUs that support AVX-512 * but where using 512-bit vectors causes downclocking. This should also be the * optimal implementation on CPUs that support AVX10/256 but not AVX10/512. + * + * _mm256_zextsi128_si256() requires gcc 10. */ # define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 # define SUFFIX _vpclmulqdq_avx512_vl256 @@ -109,6 +113,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = { * VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs * that have a good AVX-512 implementation including VPCLMULQDQ. This should * also be the optimal implementation on CPUs that support AVX10/512. + * + * _mm512_zextsi128_si512() requires gcc 10. */ # define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 # define SUFFIX _vpclmulqdq_avx512_vl512 diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index 09abb515..df804a29 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -80,7 +80,7 @@ # define fold_vec fold_vec256 # define VLOADU(p) _mm256_loadu_si256((const void *)(p)) # define VXOR(a, b) _mm256_xor_si256((a), (b)) -# define M128I_TO_VEC(a) _mm256_castsi128_si256(a) +# define M128I_TO_VEC(a) _mm256_zextsi128_si256(a) # define MULTS(a, b) _mm256_set_epi64x(a, b, a, b) # define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) # define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) @@ -91,7 +91,7 @@ # define fold_vec fold_vec512 # define VLOADU(p) _mm512_loadu_si512((const void *)(p)) # define VXOR(a, b) _mm512_xor_si512((a), (b)) -# define M128I_TO_VEC(a) _mm512_castsi128_si512(a) +# define M128I_TO_VEC(a) _mm512_zextsi128_si512(a) # define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b) # define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG) # define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)