From 4ebc7bdc23b99cc72aca5e968aac5806c54b7d74 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 8 Mar 2024 21:22:49 -0800 Subject: [PATCH 1/3] android_build.sh: make the intended CFLAGS actually be used Also update the default NDK version --- scripts/android_build.sh | 5 +++-- scripts/android_tests.sh | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/android_build.sh b/scripts/android_build.sh index ae0b4bc0..061a8517 100755 --- a/scripts/android_build.sh +++ b/scripts/android_build.sh @@ -6,10 +6,10 @@ SCRIPTDIR="$(dirname "$0")" BUILDDIR="$SCRIPTDIR/../build" API_LEVEL=28 ARCH=arm64 -export CFLAGS=${CFLAGS:-} +CFLAGS=${CFLAGS:-} ENABLE_CRC=false ENABLE_CRYPTO=false -NDKDIR=$HOME/android-ndk-r23b +NDKDIR=$HOME/android-ndk-r25b usage() { cat << EOF @@ -111,6 +111,7 @@ esac "$SCRIPTDIR"/cmake-helper.sh -G Ninja \ -DCMAKE_TOOLCHAIN_FILE="$NDKDIR"/build/cmake/android.toolchain.cmake \ + -DCMAKE_C_FLAGS="$CFLAGS" \ -DANDROID_ABI="$ANDROID_ABI" \ -DANDROID_PLATFORM="$API_LEVEL" \ -DLIBDEFLATE_BUILD_TESTS=1 diff --git a/scripts/android_tests.sh b/scripts/android_tests.sh index 3ec1007b..0daeaa28 100755 --- a/scripts/android_tests.sh +++ b/scripts/android_tests.sh @@ -12,7 +12,7 @@ if [ $# -ne 0 ]; then fi # Use NDKDIR if specified in environment, else use default value. -: "${NDKDIR:=$HOME/android-ndk-r23b}" +: "${NDKDIR:=$HOME/android-ndk-r25b}" if [ ! -e "$NDKDIR" ]; then cat 1>&2 << EOF Android NDK was not found in NDKDIR=$NDKDIR! Set the From 5e6197b191012b7acbd1e606e74f819c0f0ee054 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 8 Mar 2024 21:22:49 -0800 Subject: [PATCH 2/3] lib/x86/adler32: comment and style fixes --- lib/x86/adler32_template.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h index 1cd6aaed..125e4b92 100644 --- a/lib/x86/adler32_template.h +++ b/lib/x86/adler32_template.h @@ -137,16 +137,16 @@ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) #else { __m256i v_s1_256, v_s2_256; -# if VL == 32 + #if VL == 32 v_s1_256 = v_s1; v_s2_256 = v_s2; -# else + #else /* Reduce 512 bits to 256 bits. */ v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0), _mm512_extracti64x4_epi64(v_s1, 1)); v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0), _mm512_extracti64x4_epi64(v_s2, 1)); -# endif + #endif /* Reduce 256 bits to 128 bits. */ v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0), _mm256_extracti128_si256(v_s1_256, 1)); @@ -394,8 +394,8 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw * to sum, for each i across iterations, byte i into a corresponding - * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddw to - * multiply each counter i by (2*VL - i), then add the products to s2. + * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd + * to multiply each counter by (2*VL - i), then add the products to s2. * * An alternative implementation would use pmaddubsw and pmaddwd in the * inner loop to do (2*VL - i)*data[i] directly and add the products in @@ -413,7 +413,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) * s2 are guaranteed to not exceed UINT32_MAX, and every * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX. * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are - * used with pmaddw which does signed multiplication. In the + * used with pmaddwd which does signed multiplication. In the * SSE2 case this limits chunks to 4096 bytes instead of 5504. */ size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), From ca8607ee0b6fee591cd0fd73006c8ac1807debe9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 8 Mar 2024 21:22:49 -0800 Subject: [PATCH 3/3] lib/arm/adler32: refactor and improve implementations Make the same improvements to lib/arm/adler32 as were made to lib/x86/adler32, e.g. changes to handle short inputs more efficiently. This makes adler32_vec_template.h no longer used, so delete it. --- CMakeLists.txt | 1 - lib/adler32_vec_template.h | 123 ----------- lib/arm/adler32_impl.h | 419 +++++++++++++++++++++++-------------- 3 files changed, 266 insertions(+), 277 deletions(-) delete mode 100644 lib/adler32_vec_template.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f902c675..0ce71c01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,6 @@ endif() if(LIBDEFLATE_ZLIB_SUPPORT) list(APPEND LIB_SOURCES lib/adler32.c - lib/adler32_vec_template.h lib/arm/adler32_impl.h lib/x86/adler32_impl.h lib/x86/adler32_template.h diff --git a/lib/adler32_vec_template.h b/lib/adler32_vec_template.h deleted file mode 100644 index 98c086bb..00000000 --- a/lib/adler32_vec_template.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * adler32_vec_template.h - template for vectorized Adler-32 implementations - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This file contains a template for vectorized Adler-32 implementations. - * - * The inner loop between reductions modulo 65521 of an unvectorized Adler-32 - * implementation looks something like this: - * - * do { - * s1 += *p; - * s2 += s1; - * } while (++p != chunk_end); - * - * For vectorized calculation of s1, we only need to sum the input bytes. They - * can be accumulated into multiple counters which are eventually summed - * together. - * - * For vectorized calculation of s2, the basic idea is that for each iteration - * that processes N bytes, we can perform the following vectorizable - * calculation: - * - * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N - * - * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N - * separate counters, then do the multiplications by N...1 just once at the end - * rather than once per iteration. - * - * Also, we must account for how previous bytes will affect s2 by doing the - * following at beginning of each iteration: - * - * s2 += s1 * N - * - * Furthermore, like s1, "s2" can actually be multiple counters which are - * eventually summed together. - */ - -static u32 ATTRIBUTES MAYBE_UNUSED -FUNCNAME(u32 adler, const u8 *p, size_t len) -{ - const size_t max_chunk_len = - MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) - - (MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) % IMPL_SEGMENT_LEN); - u32 s1 = adler & 0xFFFF; - u32 s2 = adler >> 16; - const u8 * const end = p + len; - const u8 *vend; - - /* Process a byte at a time until the needed alignment is reached. */ - if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) { - do { - s1 += *p++; - s2 += s1; - } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* - * Process "chunks" of bytes using vector instructions. Chunk lengths - * are limited to MAX_CHUNK_LEN, which guarantees that s1 and s2 never - * overflow before being reduced modulo DIVISOR. For vector processing, - * chunk lengths are also made evenly divisible by IMPL_SEGMENT_LEN and - * may be further limited to IMPL_MAX_CHUNK_LEN. - */ - STATIC_ASSERT(IMPL_SEGMENT_LEN % IMPL_ALIGNMENT == 0); - vend = end - ((size_t)(end - p) % IMPL_SEGMENT_LEN); - while (p != vend) { - size_t chunk_len = MIN((size_t)(vend - p), max_chunk_len); - - s2 += s1 * chunk_len; - - FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_len), - &s1, &s2); - - p += chunk_len; - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* Process any remaining bytes. */ - if (p != end) { - do { - s1 += *p++; - s2 += s1; - } while (p != end); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - return (s2 << 16) | s1; -} - -#undef FUNCNAME -#undef FUNCNAME_CHUNK -#undef ATTRIBUTES -#undef IMPL_ALIGNMENT -#undef IMPL_SEGMENT_LEN -#undef IMPL_MAX_CHUNK_LEN diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index 4083b2ef..c715b65d 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -32,13 +32,7 @@ /* Regular NEON implementation */ #if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() -# define adler32_neon adler32_neon -# define FUNCNAME adler32_neon -# define FUNCNAME_CHUNK adler32_neon_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_LEN 64 -/* Prevent unsigned overflow of the 16-bit precision byte counters */ -# define IMPL_MAX_CHUNK_LEN (64 * (0xFFFF / 0xFF)) +# define adler32_arm_neon adler32_arm_neon # if HAVE_NEON_NATIVE # define ATTRIBUTES # else @@ -49,9 +43,8 @@ # endif # endif # include -static forceinline ATTRIBUTES void -adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, - u32 *s1, u32 *s2) +static u32 ATTRIBUTES MAYBE_UNUSED +adler32_arm_neon(u32 adler, const u8 *p, size_t len) { static const u16 _aligned_attribute(16) mults[64] = { 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, @@ -67,104 +60,153 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, const uint16x8_t mults_f = vld1q_u16(&mults[40]); const uint16x8_t mults_g = vld1q_u16(&mults[48]); const uint16x8_t mults_h = vld1q_u16(&mults[56]); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; - uint32x4_t v_s1 = vdupq_n_u32(0); - uint32x4_t v_s2 = vdupq_n_u32(0); /* - * v_byte_sums_* contain the sum of the bytes at index i across all - * 64-byte segments, for each index 0..63. + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the unaligned load penalty. */ - uint16x8_t v_byte_sums_a = vdupq_n_u16(0); - uint16x8_t v_byte_sums_b = vdupq_n_u16(0); - uint16x8_t v_byte_sums_c = vdupq_n_u16(0); - uint16x8_t v_byte_sums_d = vdupq_n_u16(0); - uint16x8_t v_byte_sums_e = vdupq_n_u16(0); - uint16x8_t v_byte_sums_f = vdupq_n_u16(0); - uint16x8_t v_byte_sums_g = vdupq_n_u16(0); - uint16x8_t v_byte_sums_h = vdupq_n_u16(0); - - do { - /* Load the next 64 bytes. */ - const uint8x16_t bytes1 = *p++; - const uint8x16_t bytes2 = *p++; - const uint8x16_t bytes3 = *p++; - const uint8x16_t bytes4 = *p++; - uint16x8_t tmp; + if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & 15); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + while (len) { /* - * Accumulate the previous s1 counters into the s2 counters. - * The needed multiplication by 64 is delayed to later. + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. */ - v_s2 = vaddq_u32(v_s2, v_s1); + size_t n = MIN(len, MAX_CHUNK_LEN & ~63); - /* - * Add the 64 bytes to their corresponding v_byte_sums counters, - * while also accumulating the sums of each adjacent set of 4 - * bytes into v_s1. - */ - tmp = vpaddlq_u8(bytes1); - v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1)); - v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1)); - tmp = vpadalq_u8(tmp, bytes2); - v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2)); - v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2)); - tmp = vpadalq_u8(tmp, bytes3); - v_byte_sums_e = vaddw_u8(v_byte_sums_e, vget_low_u8(bytes3)); - v_byte_sums_f = vaddw_u8(v_byte_sums_f, vget_high_u8(bytes3)); - tmp = vpadalq_u8(tmp, bytes4); - v_byte_sums_g = vaddw_u8(v_byte_sums_g, vget_low_u8(bytes4)); - v_byte_sums_h = vaddw_u8(v_byte_sums_h, vget_high_u8(bytes4)); - v_s1 = vpadalq_u16(v_s1, tmp); - - } while (p != end); - - /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */ -#ifdef ARCH_ARM32 -# define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) -#else -# define umlal2 vmlal_high_u16 -#endif - v_s2 = vqshlq_n_u32(v_s2, 6); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), vget_low_u16(mults_a)); - v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), vget_low_u16(mults_b)); - v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), vget_low_u16(mults_c)); - v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), vget_low_u16(mults_d)); - v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), vget_low_u16(mults_e)); - v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), vget_low_u16(mults_f)); - v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), vget_low_u16(mults_g)); - v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), vget_low_u16(mults_h)); - v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h); -#undef umlal2 - - /* Horizontal sum to finish up */ -#ifdef ARCH_ARM32 - *s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + - vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); - *s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + - vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); -#else - *s1 += vaddvq_u32(v_s1); - *s2 += vaddvq_u32(v_s2); -#endif + len -= n; + + if (n >= 64) { + uint32x4_t v_s1 = vdupq_n_u32(0); + uint32x4_t v_s2 = vdupq_n_u32(0); + /* + * v_byte_sums_* contain the sum of the bytes at index i + * across all 64-byte segments, for each index 0..63. + */ + uint16x8_t v_byte_sums_a = vdupq_n_u16(0); + uint16x8_t v_byte_sums_b = vdupq_n_u16(0); + uint16x8_t v_byte_sums_c = vdupq_n_u16(0); + uint16x8_t v_byte_sums_d = vdupq_n_u16(0); + uint16x8_t v_byte_sums_e = vdupq_n_u16(0); + uint16x8_t v_byte_sums_f = vdupq_n_u16(0); + uint16x8_t v_byte_sums_g = vdupq_n_u16(0); + uint16x8_t v_byte_sums_h = vdupq_n_u16(0); + + s2 += s1 * (n & ~63); + + do { + /* Load the next 64 data bytes. */ + const uint8x16_t data_a = vld1q_u8(p + 0); + const uint8x16_t data_b = vld1q_u8(p + 16); + const uint8x16_t data_c = vld1q_u8(p + 32); + const uint8x16_t data_d = vld1q_u8(p + 48); + uint16x8_t tmp; + + /* + * Accumulate the previous s1 counters into the + * s2 counters. The needed multiplication by 64 + * is delayed to later. + */ + v_s2 = vaddq_u32(v_s2, v_s1); + + /* + * Add the 64 data bytes to their v_byte_sums + * counters, while also accumulating the sums of + * each adjacent set of 4 bytes into v_s1. + */ + tmp = vpaddlq_u8(data_a); + v_byte_sums_a = vaddw_u8(v_byte_sums_a, + vget_low_u8(data_a)); + v_byte_sums_b = vaddw_u8(v_byte_sums_b, + vget_high_u8(data_a)); + tmp = vpadalq_u8(tmp, data_b); + v_byte_sums_c = vaddw_u8(v_byte_sums_c, + vget_low_u8(data_b)); + v_byte_sums_d = vaddw_u8(v_byte_sums_d, + vget_high_u8(data_b)); + tmp = vpadalq_u8(tmp, data_c); + v_byte_sums_e = vaddw_u8(v_byte_sums_e, + vget_low_u8(data_c)); + v_byte_sums_f = vaddw_u8(v_byte_sums_f, + vget_high_u8(data_c)); + tmp = vpadalq_u8(tmp, data_d); + v_byte_sums_g = vaddw_u8(v_byte_sums_g, + vget_low_u8(data_d)); + v_byte_sums_h = vaddw_u8(v_byte_sums_h, + vget_high_u8(data_d)); + v_s1 = vpadalq_u16(v_s1, tmp); + + p += 64; + n -= 64; + } while (n >= 64); + + /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */ + #ifdef ARCH_ARM32 + # define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) + #else + # define umlal2 vmlal_high_u16 + #endif + v_s2 = vqshlq_n_u32(v_s2, 6); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), + vget_low_u16(mults_a)); + v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), + vget_low_u16(mults_b)); + v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), + vget_low_u16(mults_c)); + v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), + vget_low_u16(mults_d)); + v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), + vget_low_u16(mults_e)); + v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), + vget_low_u16(mults_f)); + v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), + vget_low_u16(mults_g)); + v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), + vget_low_u16(mults_h)); + v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h); + #undef umlal2 + + /* Horizontal sum to finish up */ + #ifdef ARCH_ARM32 + s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + + vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); + s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + + vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); + #else + s1 += vaddvq_u32(v_s1); + s2 += vaddvq_u32(v_s2); + #endif + } + adler32_generic_noreduce(&s1, &s2, p, n); + p += n; + s1 %= DIVISOR; + s2 %= DIVISOR; + } + return (s2 << 16) | s1; } -# include "../adler32_vec_template.h" +#undef ATTRIBUTES #endif /* Regular NEON implementation */ /* NEON+dotprod implementation */ #if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() -# define adler32_neon_dotprod adler32_neon_dotprod -# define FUNCNAME adler32_neon_dotprod -# define FUNCNAME_CHUNK adler32_neon_dotprod_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_LEN 64 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN +# define adler32_arm_neon_dotprod adler32_arm_neon_dotprod # if HAVE_DOTPROD_NATIVE # define ATTRIBUTES # else @@ -182,9 +224,8 @@ adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, # endif # endif # include -static forceinline ATTRIBUTES void -adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, - u32 *s1, u32 *s2) +static u32 ATTRIBUTES +adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) { static const u8 _aligned_attribute(16) mults[64] = { 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, @@ -197,72 +238,144 @@ adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, const uint8x16_t mults_c = vld1q_u8(&mults[32]); const uint8x16_t mults_d = vld1q_u8(&mults[48]); const uint8x16_t ones = vdupq_n_u8(1); - uint32x4_t v_s1_a = vdupq_n_u32(0); - uint32x4_t v_s1_b = vdupq_n_u32(0); - uint32x4_t v_s1_c = vdupq_n_u32(0); - uint32x4_t v_s1_d = vdupq_n_u32(0); - uint32x4_t v_s2_a = vdupq_n_u32(0); - uint32x4_t v_s2_b = vdupq_n_u32(0); - uint32x4_t v_s2_c = vdupq_n_u32(0); - uint32x4_t v_s2_d = vdupq_n_u32(0); - uint32x4_t v_s1_sums_a = vdupq_n_u32(0); - uint32x4_t v_s1_sums_b = vdupq_n_u32(0); - uint32x4_t v_s1_sums_c = vdupq_n_u32(0); - uint32x4_t v_s1_sums_d = vdupq_n_u32(0); - uint32x4_t v_s1; - uint32x4_t v_s2; - uint32x4_t v_s1_sums; - - do { - uint8x16_t bytes_a = *p++; - uint8x16_t bytes_b = *p++; - uint8x16_t bytes_c = *p++; - uint8x16_t bytes_d = *p++; - - v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); - v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones); - v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a); - - v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); - v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones); - v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b); - - v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); - v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones); - v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c); - - v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); - v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones); - v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d); - } while (p != end); - - v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), vaddq_u32(v_s1_c, v_s1_d)); - v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), vaddq_u32(v_s2_c, v_s2_d)); - v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, v_s1_sums_b), - vaddq_u32(v_s1_sums_c, v_s1_sums_d)); - v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); - - *s1 += vaddvq_u32(v_s1); - *s2 += vaddvq_u32(v_s2); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the unaligned load penalty. + */ + if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & 15); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~63); + + len -= n; + + if (n >= 64) { + uint32x4_t v_s1_a = vdupq_n_u32(0); + uint32x4_t v_s1_b = vdupq_n_u32(0); + uint32x4_t v_s1_c = vdupq_n_u32(0); + uint32x4_t v_s1_d = vdupq_n_u32(0); + uint32x4_t v_s2_a = vdupq_n_u32(0); + uint32x4_t v_s2_b = vdupq_n_u32(0); + uint32x4_t v_s2_c = vdupq_n_u32(0); + uint32x4_t v_s2_d = vdupq_n_u32(0); + uint32x4_t v_s1_sums_a = vdupq_n_u32(0); + uint32x4_t v_s1_sums_b = vdupq_n_u32(0); + uint32x4_t v_s1_sums_c = vdupq_n_u32(0); + uint32x4_t v_s1_sums_d = vdupq_n_u32(0); + uint32x4_t v_s1; + uint32x4_t v_s2; + uint32x4_t v_s1_sums; + + s2 += s1 * (n & ~63); + + do { + uint8x16_t data_a = vld1q_u8(p + 0); + uint8x16_t data_b = vld1q_u8(p + 16); + uint8x16_t data_c = vld1q_u8(p + 32); + uint8x16_t data_d = vld1q_u8(p + 48); + + v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); + v_s1_a = vdotq_u32(v_s1_a, data_a, ones); + v_s2_a = vdotq_u32(v_s2_a, data_a, mults_a); + + v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); + v_s1_b = vdotq_u32(v_s1_b, data_b, ones); + v_s2_b = vdotq_u32(v_s2_b, data_b, mults_b); + + v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); + v_s1_c = vdotq_u32(v_s1_c, data_c, ones); + v_s2_c = vdotq_u32(v_s2_c, data_c, mults_c); + + v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); + v_s1_d = vdotq_u32(v_s1_d, data_d, ones); + v_s2_d = vdotq_u32(v_s2_d, data_d, mults_d); + + p += 64; + n -= 64; + } while (n >= 64); + + v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), + vaddq_u32(v_s1_c, v_s1_d)); + v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), + vaddq_u32(v_s2_c, v_s2_d)); + v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, + v_s1_sums_b), + vaddq_u32(v_s1_sums_c, + v_s1_sums_d)); + v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); + + s1 += vaddvq_u32(v_s1); + s2 += vaddvq_u32(v_s2); + } + /* + * Process the last 0 <= n < 64 bytes of the chunk. This is a + * copy of adler32_generic_noreduce(). We can't just call it + * directly here because in some cases the compiler errors out + * when inlining it due to a target specific option mismatch due + * to the use of arch=armv8.2 above. + */ + if (n >= 4) { + u32 s1_sum = 0; + u32 byte_0_sum = 0; + u32 byte_1_sum = 0; + u32 byte_2_sum = 0; + u32 byte_3_sum = 0; + + do { + s1_sum += s1; + s1 += p[0] + p[1] + p[2] + p[3]; + byte_0_sum += p[0]; + byte_1_sum += p[1]; + byte_2_sum += p[2]; + byte_3_sum += p[3]; + p += 4; + n -= 4; + } while (n >= 4); + s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + + (2 * byte_2_sum) + byte_3_sum; + } + for (; n; n--, p++) { + s1 += *p; + s2 += s1; + } + s1 %= DIVISOR; + s2 %= DIVISOR; + } + return (s2 << 16) | s1; } -# include "../adler32_vec_template.h" +#undef ATTRIBUTES #endif /* NEON+dotprod implementation */ -#if defined(adler32_neon_dotprod) && HAVE_DOTPROD_NATIVE -#define DEFAULT_IMPL adler32_neon_dotprod +#if defined(adler32_arm_neon_dotprod) && HAVE_DOTPROD_NATIVE +#define DEFAULT_IMPL adler32_arm_neon_dotprod #else static inline adler32_func_t arch_select_adler32_func(void) { const u32 features MAYBE_UNUSED = get_arm_cpu_features(); -#ifdef adler32_neon_dotprod +#ifdef adler32_arm_neon_dotprod if (HAVE_NEON(features) && HAVE_DOTPROD(features)) - return adler32_neon_dotprod; + return adler32_arm_neon_dotprod; #endif -#ifdef adler32_neon +#ifdef adler32_arm_neon if (HAVE_NEON(features)) - return adler32_neon; + return adler32_arm_neon; #endif return NULL; }