Skip to content

Commit

Permalink
lib/x86/crc32: fix undefined behavior in VPCLMULQDQ optimized functions
Browse files Browse the repository at this point in the history
The specifications for _mm256_castsi128_si256() and
_mm512_castsi128_si512() are bugged, as they leave the high bits
undefined instead of zeroed as would be expected.  Separate intrinsics
_mm256_zextsi128_si256() and _mm512_zextsi128_si512() were later added
to allow working around this defect.  Use them.

This fixes incorrect CRC checksums produced by
crc32_x86_vpclmulqdq_avx512_vl512() when built with 'clang -O0'.
Other cases are not known to have been affected.

Resolves #403
Fixes: 5f2a0b4 ("lib/x86/crc32: add VPCLMULQDQ implementations of CRC-32")
  • Loading branch information
ebiggers committed Nov 27, 2024
1 parent c7b053e commit 614e9bf
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 4 deletions.
10 changes: 8 additions & 2 deletions lib/x86/crc32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,10 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
*
* gcc 8.1 and 8.2 had a similar bug where they assumed that
* _mm256_clmulepi64_epi128() always needed AVX512. It's fixed in gcc 8.3.
*
* _mm256_zextsi128_si256() requires gcc 10.
*/
#if (GCC_PREREQ(8, 3) || CLANG_PREREQ(6, 0, 10000000)) && \
#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2
# define SUFFIX _vpclmulqdq_avx2
Expand All @@ -89,14 +91,16 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
# include "crc32_pclmul_template.h"
#endif

#if (GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
#if (GCC_PREREQ(10, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920)) && \
!defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_VPCLMULQDQ)
/*
* VPCLMULQDQ/AVX512 implementation using 256-bit vectors. This is very similar
* to the VPCLMULQDQ/AVX2 implementation but takes advantage of the vpternlog
* instruction and more registers. This is used on CPUs that support AVX-512
* but where using 512-bit vectors causes downclocking. This should also be the
* optimal implementation on CPUs that support AVX10/256 but not AVX10/512.
*
* _mm256_zextsi128_si256() requires gcc 10.
*/
# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256
# define SUFFIX _vpclmulqdq_avx512_vl256
Expand All @@ -109,6 +113,8 @@ static const u8 MAYBE_UNUSED shift_tab[48] = {
* VPCLMULQDQ/AVX512 implementation using 512-bit vectors. This is used on CPUs
* that have a good AVX-512 implementation including VPCLMULQDQ. This should
* also be the optimal implementation on CPUs that support AVX10/512.
*
* _mm512_zextsi128_si512() requires gcc 10.
*/
# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512
# define SUFFIX _vpclmulqdq_avx512_vl512
Expand Down
4 changes: 2 additions & 2 deletions lib/x86/crc32_pclmul_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
# define fold_vec fold_vec256
# define VLOADU(p) _mm256_loadu_si256((const void *)(p))
# define VXOR(a, b) _mm256_xor_si256((a), (b))
# define M128I_TO_VEC(a) _mm256_castsi128_si256(a)
# define M128I_TO_VEC(a) _mm256_zextsi128_si256(a)
# define MULTS(a, b) _mm256_set_epi64x(a, b, a, b)
# define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
# define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG)
Expand All @@ -91,7 +91,7 @@
# define fold_vec fold_vec512
# define VLOADU(p) _mm512_loadu_si512((const void *)(p))
# define VXOR(a, b) _mm512_xor_si512((a), (b))
# define M128I_TO_VEC(a) _mm512_castsi128_si512(a)
# define M128I_TO_VEC(a) _mm512_zextsi128_si512(a)
# define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b)
# define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG)
# define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG)
Expand Down

0 comments on commit 614e9bf

Please sign in to comment.