-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
incorporate upstream fixes to crc32c code #52325
Comments
StackOverflow's diff is unreadable, unfortunately. I also tried downloading the SO source and then running |
Here is the --- oldcode.c 2023-11-27 18:12:19
+++ newcode.c 2023-11-27 18:12:35
@@ -1,40 +1,33 @@
-/* Compute CRC-32C using the Intel hardware instruction. */
-static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len)
-{
- const unsigned char *next = buf;
- const unsigned char *end;
- uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
+static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
+ if (buf == NULL)
+ return 0;
- /* populate shift tables the first time through */
- pthread_once(&crc32c_once_hw, crc32c_init_hw);
+ // Pre-process the crc.
+ uint64_t crc0 = crc ^ 0xffffffff;
- /* pre-process the crc */
- crc0 = crc ^ 0xffffffff;
-
- /* compute the crc for up to seven leading bytes to bring the data pointer
- to an eight-byte boundary */
+ // Compute the crc for up to seven leading bytes, bringing the data pointer
+ // to an eight-byte boundary.
+ unsigned char const *next = buf;
while (len && ((uintptr_t)next & 7) != 0) {
__asm__("crc32b\t" "(%1), %0"
- : "=r"(crc0)
- : "r"(next), "0"(crc0));
+ : "+r"(crc0)
+ : "r"(next), "m"(*next));
next++;
len--;
}
- /* compute the crc on sets of LONG*3 bytes, executing three independent crc
- instructions, each on LONG bytes -- this is optimized for the Nehalem,
- Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
- throughput of one crc per cycle, but a latency of three cycles */
+ // Compute the crc on sets of LONG*3 bytes, making use of three ALUs in
+ // parallel on a single core.
while (len >= LONG*3) {
- crc1 = 0;
- crc2 = 0;
- end = next + LONG;
+ uint64_t crc1 = 0;
+ uint64_t crc2 = 0;
+ unsigned char const *end = next + LONG;
do {
__asm__("crc32q\t" "(%3), %0\n\t"
"crc32q\t" LONGx1 "(%3), %1\n\t"
"crc32q\t" LONGx2 "(%3), %2"
- : "=r"(crc0), "=r"(crc1), "=r"(crc2)
- : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
+ : "+r"(crc0), "+r"(crc1), "+r"(crc2)
+ : "r"(next), "m"(*next));
next += 8;
} while (next < end);
crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
@@ -43,18 +36,18 @@
len -= LONG*3;
}
- /* do the same thing, but now on SHORT*3 blocks for the remaining data less
- than a LONG*3 block */
+ // Do the same thing, but now on SHORT*3 blocks for the remaining data less
+ // than a LONG*3 block.
while (len >= SHORT*3) {
- crc1 = 0;
- crc2 = 0;
- end = next + SHORT;
+ uint64_t crc1 = 0;
+ uint64_t crc2 = 0;
+ unsigned char const *end = next + SHORT;
do {
__asm__("crc32q\t" "(%3), %0\n\t"
"crc32q\t" SHORTx1 "(%3), %1\n\t"
"crc32q\t" SHORTx2 "(%3), %2"
- : "=r"(crc0), "=r"(crc1), "=r"(crc2)
- : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
+ : "+r"(crc0), "+r"(crc1), "+r"(crc2)
+ : "r"(next), "m"(*next));
next += 8;
} while (next < end);
crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
@@ -63,26 +56,26 @@
len -= SHORT*3;
}
- /* compute the crc on the remaining eight-byte units less than a SHORT*3
- block */
- end = next + (len - (len & 7));
+ // Compute the crc on the remaining eight-byte units less than a SHORT*3
+ // block.
+ unsigned char const *end = next + (len - (len & 7));
while (next < end) {
__asm__("crc32q\t" "(%1), %0"
- : "=r"(crc0)
- : "r"(next), "0"(crc0));
+ : "+r"(crc0)
+ : "r"(next), "m"(*next));
next += 8;
}
len &= 7;
- /* compute the crc for up to seven trailing bytes */
+ // Compute the crc for up to seven trailing bytes.
while (len) {
__asm__("crc32b\t" "(%1), %0"
- : "=r"(crc0)
- : "r"(next), "0"(crc0));
+ : "+r"(crc0)
+ : "r"(next), "m"(*next));
next++;
len--;
}
- /* return a post-processed crc */
- return (uint32_t)crc0 ^ 0xffffffff;
+ // Return the crc, post-processed.
+ return ~(uint32_t)crc0;
} |
(There were a bunch of other changes related to the generation of the tables, but we had already independently made our tables constant so I don't think we need those. It was updated to return |
These are a response to [this comment](https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software/17646775#comment88832559_17646775) on StackOverflow: > Your asm constraints aren't strictly safe. A pointer in a `"r"` constraint does not imply that the pointed-to memory is also an input. You could just use memory operands (or the `_mm_crc32_u64` intrinsic), but if you want to force the addressing mode you can use a dummy memory operand like `asm("crc32 (%1),%0 " : "+r"(crc0) : "r"(next), "m" (*(const char (*)[24]) pStr) )`;. Inlining or LTO could break this. See [at&t asm inline c++ problem](https://stackoverflow.com/posts/comments/81680441) (which needs an update: pointer-to-array is cleaner than a struct with a flexible array member). The existing test coverage wasn't enough to fully exercise Adler's code, so I added another test. Closes #52325
Our CRC32c implementation was adapted from @madler's (BSD-licensed) code in this stackoverflow question. However, I just noticed that he posted a 2021 comment that the code has been updated from the version we used:
I don't know if there is an easy way to get the diff, but we should probably go through it and incorporate any improvements as needed.
For reference, this is the version of
crc32c.c
from my initial import, which should be closest to Mark's original code.Update: the
diff
is available on stackoverflow here: https://stackoverflow.com/posts/17646775/revisions … note that there were 3 updates in 2021.The text was updated successfully, but these errors were encountered: