lib/x86/crc32: fix undefined behavior in VPCLMULQDQ optimized functions

The specifications for _mm256_castsi128_si256() and _mm512_castsi128_si512() are bugged, as they leave the high bits undefined instead of zeroed as would be expected. Separate intrinsics _mm256_zextsi128_si256() and _mm512_zextsi128_si512() were later added to allow working around this defect. Use them. This fixes incorrect CRC checksums produced by crc32_x86_vpclmulqdq_avx512_vl512() when built with 'clang -O0'. Other cases are not known to have been affected. Resolves #403 Fixes: 5f2a0b4 ("lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32")
ebiggers · Nov 27, 2024 · 614e9bf · 614e9bf
1 parent c7b053e
commit 614e9bf
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 4 deletions.
diff --git a/lib/x86/crc32_impl.h b/lib/x86/crc32_impl.h
@@ -78,8 +78,10 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  *
  * gcc 8.1 and 8.2 had a similar bug where they assumed that
  * _mm256_clmulepi64_epi128() always needed AVX512.  It's fixed in gcc 8.3.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
-#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
 	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 #  define crc32_x86_vpclmulqdq_avx2	crc32_x86_vpclmulqdq_avx2
 #  define SUFFIX				 _vpclmulqdq_avx2
@@ -89,14 +91,16 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
 #  include "crc32_pclmul_template.h"
 #endif
 
-#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
+#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
 	!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
 /*
  * VPCLMULQDQ/AVX512 implementation using 256-bit vectors.  This is very similar
  * to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
  * instruction and more registers.  This is used on CPUs that support AVX-512
  * but where using 512-bit vectors causes downclocking.  This should also be the
  * optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
+ *
+ * _mm256_zextsi128_si256() requires gcc 10.
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl256  crc32_x86_vpclmulqdq_avx512_vl256
 #  define SUFFIX				      _vpclmulqdq_avx512_vl256
@@ -109,6 +113,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
  * VPCLMULQDQ/AVX512 implementation using 512-bit vectors.  This is used on CPUs
  * that have a good AVX-512 implementation including VPCLMULQDQ.  This should
  * also be the optimal implementation on CPUs that support AVX10/512.
+ *
+ * _mm512_zextsi128_si512() requires gcc 10.
  */
 #  define crc32_x86_vpclmulqdq_avx512_vl512  crc32_x86_vpclmulqdq_avx512_vl512
 #  define SUFFIX				      _vpclmulqdq_avx512_vl512

diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
@@ -80,7 +80,7 @@
 #  define fold_vec		fold_vec256
 #  define VLOADU(p)		_mm256_loadu_si256((const void *)(p))
 #  define VXOR(a, b)		_mm256_xor_si256((a), (b))
-#  define M128I_TO_VEC(a)	_mm256_castsi128_si256(a)
+#  define M128I_TO_VEC(a)	_mm256_zextsi128_si256(a)
 #  define MULTS(a, b)		_mm256_set_epi64x(a, b, a, b)
 #  define MULTS_8V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
 #  define MULTS_4V		MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
@@ -91,7 +91,7 @@
 #  define fold_vec		fold_vec512
 #  define VLOADU(p)		_mm512_loadu_si512((const void *)(p))
 #  define VXOR(a, b)		_mm512_xor_si512((a), (b))
-#  define M128I_TO_VEC(a)	_mm512_castsi128_si512(a)
+#  define M128I_TO_VEC(a)	_mm512_zextsi128_si512(a)
 #  define MULTS(a, b)		_mm512_set_epi64(a, b, a, b, a, b, a, b)
 #  define MULTS_8V		MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
 #  define MULTS_4V		MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)