Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lib/adler32: replace adler32_generic_noreduce() with ADLER32_CHUNK() #351

Merged
merged 1 commit into from
Mar 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 51 additions & 48 deletions lib/adler32.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,47 +53,54 @@
*/
#define MAX_CHUNK_LEN 5552

/*
 * Accumulate len bytes from p into the running Adler-32 state
 * (*s1_p, *s2_p) WITHOUT reducing mod DIVISOR.  The caller must ensure
 * that the total byte count since the last reduction cannot overflow
 * s1 or s2 (i.e. it must not exceed MAX_CHUNK_LEN).
 */
static forceinline void MAYBE_UNUSED
adler32_generic_noreduce(u32 *s1_p, u32 *s2_p, const u8 *p, size_t len)
{
	u32 a = *s1_p;
	u32 b = *s2_p;

	/*
	 * Consume four bytes per iteration.  Keeping a separate running
	 * sum per byte position and folding everything into s2 just once
	 * after the loop exposes more instruction-level parallelism than
	 * the serial 's1 += *p++; s2 += s1;' recurrence; this mirrors the
	 * structure of the vectorized (e.g. AVX2) implementations.
	 */
	if (len >= 4) {
		u32 a_total = 0;
		u32 lane0 = 0;
		u32 lane1 = 0;
		u32 lane2 = 0;
		u32 lane3 = 0;

		do {
			a_total += a;
			lane0 += p[0];
			lane1 += p[1];
			lane2 += p[2];
			lane3 += p[3];
			a += p[0] + p[1] + p[2] + p[3];
			p += 4;
			len -= 4;
		} while (len >= 4);
		b += (4 * (a_total + lane0)) + (3 * lane1) +
		     (2 * lane2) + lane3;
	}

	/* Fewer than four bytes remain; finish them serially. */
	while (len--) {
		a += *p++;
		b += a;
	}
	*s1_p = a;
	*s2_p = b;
}
/*
 * ADLER32_CHUNK(s1, s2, p, n) - consume n bytes from p into the Adler-32
 * state (s1, s2), advance p past those bytes, leave n equal to 0, and
 * finally reduce both s1 and s2 mod DIVISOR.  The caller must guarantee
 * that neither s1 nor s2 can overflow before that final reduction, i.e.
 * n plus any bytes already accumulated since the previous reduction must
 * not exceed MAX_CHUNK_LEN.
 *
 * Portable C only.  This serves as the fallback when no vectorized
 * Adler-32 implementation (e.g. AVX2) is available on the platform, and
 * the vectorized implementations also use it to finish the tail of the
 * data when the length isn't a multiple of their vector width.  It is a
 * macro rather than a function so that expanding it inside functions
 * built with target-specific options doesn't produce compiler errors
 * about target option mismatches.
 *
 * Although scalar, the main loop still consumes four bytes per
 * iteration, keeping a running sum per byte position and folding them
 * into s2 once afterward -- the same trick the vectorized versions use.
 * This yields more instruction-level parallelism than the traditional
 * 's1 += *p++; s2 += s1;' recurrence.
 */
#define ADLER32_CHUNK(s1, s2, p, n) \
do { \
	if (n >= 4) { \
		u32 sum_s1 = 0; \
		u32 lane0 = 0; \
		u32 lane1 = 0; \
		u32 lane2 = 0; \
		u32 lane3 = 0; \
 \
		do { \
			sum_s1 += s1; \
			lane0 += p[0]; \
			lane1 += p[1]; \
			lane2 += p[2]; \
			lane3 += p[3]; \
			s1 += p[0] + p[1] + p[2] + p[3]; \
			p += 4; \
			n -= 4; \
		} while (n >= 4); \
		s2 += (4 * (sum_s1 + lane0)) + (3 * lane1) + \
		      (2 * lane2) + lane3; \
	} \
	while (n) { \
		s1 += *p; \
		s2 += s1; \
		p++; \
		n--; \
	} \
	s1 %= DIVISOR; \
	s2 %= DIVISOR; \
} while (0)

static u32 MAYBE_UNUSED
adler32_generic(u32 adler, const u8 *p, size_t len)
Expand All @@ -102,14 +109,10 @@ adler32_generic(u32 adler, const u8 *p, size_t len)
u32 s2 = adler >> 16;

while (len) {
size_t chunk_len = MIN(len, MAX_CHUNK_LEN);

adler32_generic_noreduce(&s1, &s2, p, chunk_len);
p += chunk_len;
len -= chunk_len;
size_t n = MIN(len, MAX_CHUNK_LEN & ~3);

s1 %= DIVISOR;
s2 %= DIVISOR;
len -= n;
ADLER32_CHUNK(s1, s2, p, n);
}

return (s2 << 16) | s1;
Expand Down
47 changes: 10 additions & 37 deletions lib/arm/adler32_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)

/*
* If the length is large and the pointer is misaligned, align it.
* For smaller lengths, just take the unaligned load penalty.
* For smaller lengths, just take the misaligned load penalty.
*/
if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
do {
Expand Down Expand Up @@ -194,10 +194,11 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
s2 += vaddvq_u32(v_s2);
#endif
}
adler32_generic_noreduce(&s1, &s2, p, n);
p += n;
s1 %= DIVISOR;
s2 %= DIVISOR;
/*
* Process the last 0 <= n < 64 bytes of the chunk using
* scalar instructions and reduce s1 and s2 mod DIVISOR.
*/
ADLER32_CHUNK(s1, s2, p, n);
}
return (s2 << 16) | s1;
}
Expand Down Expand Up @@ -243,7 +244,7 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)

/*
* If the length is large and the pointer is misaligned, align it.
* For smaller lengths, just take the unaligned load penalty.
* For smaller lengths, just take the misaligned load penalty.
*/
if (unlikely(len > 32768 && ((uintptr_t)p & 15))) {
do {
Expand Down Expand Up @@ -323,38 +324,10 @@ adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len)
s2 += vaddvq_u32(v_s2);
}
/*
* Process the last 0 <= n < 64 bytes of the chunk. This is a
* copy of adler32_generic_noreduce(). We can't just call it
* directly here because in some cases the compiler errors out
* when inlining it due to a target specific option mismatch due
* to the use of arch=armv8.2 above.
* Process the last 0 <= n < 64 bytes of the chunk using
* scalar instructions and reduce s1 and s2 mod DIVISOR.
*/
if (n >= 4) {
u32 s1_sum = 0;
u32 byte_0_sum = 0;
u32 byte_1_sum = 0;
u32 byte_2_sum = 0;
u32 byte_3_sum = 0;

do {
s1_sum += s1;
s1 += p[0] + p[1] + p[2] + p[3];
byte_0_sum += p[0];
byte_1_sum += p[1];
byte_2_sum += p[2];
byte_3_sum += p[3];
p += 4;
n -= 4;
} while (n >= 4);
s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) +
(2 * byte_2_sum) + byte_3_sum;
}
for (; n; n--, p++) {
s1 += *p;
s2 += s1;
}
s1 %= DIVISOR;
s2 %= DIVISOR;
ADLER32_CHUNK(s1, s2, p, n);
}
return (s2 << 16) | s1;
}
Expand Down
9 changes: 3 additions & 6 deletions lib/x86/adler32_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)

/*
* If the length is large and the pointer is misaligned, align it.
* For smaller lengths, just take the unaligned load penalty.
* For smaller lengths, just take the misaligned load penalty.
*/
if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) {
do {
Expand Down Expand Up @@ -477,12 +477,9 @@ ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len)
}
/*
* Process the last 0 <= n < 2*VL bytes of the chunk using
* scalar instructions, then reduce s1 and s2 mod DIVISOR.
* scalar instructions and reduce s1 and s2 mod DIVISOR.
*/
adler32_generic_noreduce(&s1, &s2, p, n);
p += n;
s1 %= DIVISOR;
s2 %= DIVISOR;
ADLER32_CHUNK(s1, s2, p, n);
}
#endif /* !USE_VNNI */
return (s2 << 16) | s1;
Expand Down
Loading