diff --git a/lib/adler32.c b/lib/adler32.c index 3de595aa..d5f39d8f 100644 --- a/lib/adler32.c +++ b/lib/adler32.c @@ -53,47 +53,54 @@ */ #define MAX_CHUNK_LEN 5552 -static forceinline void MAYBE_UNUSED -adler32_generic_noreduce(u32 *s1_p, u32 *s2_p, const u8 *p, size_t len) -{ - u32 s1 = *s1_p; - u32 s2 = *s2_p; - - /* - * This loop processes four bytes at a time with increased instruction- - * level parallelism when compared to the traditional approach of - * repeatedly doing 's1 += *p++; s2 += s1'. It is very similar to how - * vectorized implementations (e.g. AVX2) of Adler-32 commonly work. - */ - if (len >= 4) { - u32 s1_sum = 0; - u32 byte_0_sum = 0; - u32 byte_1_sum = 0; - u32 byte_2_sum = 0; - u32 byte_3_sum = 0; - - do { - s1_sum += s1; - s1 += p[0] + p[1] + p[2] + p[3]; - byte_0_sum += p[0]; - byte_1_sum += p[1]; - byte_2_sum += p[2]; - byte_3_sum += p[3]; - p += 4; - len -= 4; - } while (len >= 4); - s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + - (2 * byte_2_sum) + byte_3_sum; - } - - /* Process any remainder. */ - for (; len; len--, p++) { - s1 += *p; - s2 += s1; - } - *s1_p = s1; - *s2_p = s2; -} +/* + * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n, + * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither + * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes + * already processed after the last reduction must not exceed MAX_CHUNK_LEN. + * + * This uses only portable C code. This is used as a fallback when a vectorized + * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform. + * + * Some of the vectorized implementations also use this to handle the end of the + * data when the data isn't evenly divisible by the length the vectorized code + * works on. To avoid compiler errors about target-specific option mismatches + * when this is used in that way, this is a macro rather than a function. + * + * Although this is unvectorized, this does include an optimization where the + * main loop processes four bytes at a time using a strategy similar to that + * used by vectorized implementations. This provides increased instruction- + * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'. + */ +#define ADLER32_CHUNK(s1, s2, p, n) \ +do { \ + if (n >= 4) { \ + u32 s1_sum = 0; \ + u32 byte_0_sum = 0; \ + u32 byte_1_sum = 0; \ + u32 byte_2_sum = 0; \ + u32 byte_3_sum = 0; \ + \ + do { \ + s1_sum += s1; \ + s1 += p[0] + p[1] + p[2] + p[3]; \ + byte_0_sum += p[0]; \ + byte_1_sum += p[1]; \ + byte_2_sum += p[2]; \ + byte_3_sum += p[3]; \ + p += 4; \ + n -= 4; \ + } while (n >= 4); \ + s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \ + (2 * byte_2_sum) + byte_3_sum; \ + } \ + for (; n; n--, p++) { \ + s1 += *p; \ + s2 += s1; \ + } \ + s1 %= DIVISOR; \ + s2 %= DIVISOR; \ +} while (0) static u32 MAYBE_UNUSED adler32_generic(u32 adler, const u8 *p, size_t len) @@ -102,14 +109,10 @@ adler32_generic(u32 adler, const u8 *p, size_t len) u32 s2 = adler >> 16; while (len) { - size_t chunk_len = MIN(len, MAX_CHUNK_LEN); - - adler32_generic_noreduce(&s1, &s2, p, chunk_len); - p += chunk_len; - len -= chunk_len; + size_t n = MIN(len, MAX_CHUNK_LEN & ~3); - s1 %= DIVISOR; - s2 %= DIVISOR; + len -= n; + ADLER32_CHUNK(s1, s2, p, n); } return (s2 << 16) | s1; diff --git a/lib/arm/adler32_impl.h b/lib/arm/adler32_impl.h index c715b65d..99a5f3f9 100644 --- a/lib/arm/adler32_impl.h +++ b/lib/arm/adler32_impl.h @@ -65,7 +65,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len) /* * If the length is large and the pointer is misaligned, align it. - * For smaller lengths, just take the unaligned load penalty. + * For smaller lengths, just take the misaligned load penalty. */ if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { do { @@ -194,10 +194,11 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len) s2 += vaddvq_u32(v_s2); #endif } - adler32_generic_noreduce(&s1, &s2, p, n); - p += n; - s1 %= DIVISOR; - s2 %= DIVISOR; + /* + * Process the last 0 <= n < 64 bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. + */ + ADLER32_CHUNK(s1, s2, p, n); } return (s2 << 16) | s1; } @@ -243,7 +244,7 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) /* * If the length is large and the pointer is misaligned, align it. - * For smaller lengths, just take the unaligned load penalty. + * For smaller lengths, just take the misaligned load penalty. */ if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { do { @@ -323,38 +324,10 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) s2 += vaddvq_u32(v_s2); } /* - * Process the last 0 <= n < 64 bytes of the chunk. This is a - * copy of adler32_generic_noreduce(). We can't just call it - * directly here because in some cases the compiler errors out - * when inlining it due to a target specific option mismatch due - * to the use of arch=armv8.2 above. + * Process the last 0 <= n < 64 bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. */ - if (n >= 4) { - u32 s1_sum = 0; - u32 byte_0_sum = 0; - u32 byte_1_sum = 0; - u32 byte_2_sum = 0; - u32 byte_3_sum = 0; - - do { - s1_sum += s1; - s1 += p[0] + p[1] + p[2] + p[3]; - byte_0_sum += p[0]; - byte_1_sum += p[1]; - byte_2_sum += p[2]; - byte_3_sum += p[3]; - p += 4; - n -= 4; - } while (n >= 4); - s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + - (2 * byte_2_sum) + byte_3_sum; - } - for (; n; n--, p++) { - s1 += *p; - s2 += s1; - } - s1 %= DIVISOR; - s2 %= DIVISOR; + ADLER32_CHUNK(s1, s2, p, n); } return (s2 << 16) | s1; } diff --git a/lib/x86/adler32_template.h b/lib/x86/adler32_template.h index 125e4b92..c788acc5 100644 --- a/lib/x86/adler32_template.h +++ b/lib/x86/adler32_template.h @@ -221,7 +221,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) /* * If the length is large and the pointer is misaligned, align it. - * For smaller lengths, just take the unaligned load penalty. + * For smaller lengths, just take the misaligned load penalty. */ if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) { do { @@ -477,12 +477,9 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) } /* * Process the last 0 <= n < 2*VL bytes of the chunk using - * scalar instructions, then reduce s1 and s2 mod DIVISOR. + * scalar instructions and reduce s1 and s2 mod DIVISOR. */ - adler32_generic_noreduce(&s1, &s2, p, n); - p += n; - s1 %= DIVISOR; - s2 %= DIVISOR; + ADLER32_CHUNK(s1, s2, p, n); } #endif /* !USE_VNNI */ return (s2 << 16) | s1;