From fd67cb2ab0fe5c41d2e12d6aae0396d493a72b71 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 29 Nov 2023 16:13:04 -0500 Subject: [PATCH] incorporate upstream fixes to crc32c.c assembly (#52326) These are a response to [this comment](https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software/17646775#comment88832559_17646775) on StackOverflow: > Your asm constraints aren't strictly safe. A pointer in a `"r"` constraint does not imply that the pointed-to memory is also an input. You could just use memory operands (or the `_mm_crc32_u64` intrinsic), but if you want to force the addressing mode you can use a dummy memory operand like `asm("crc32 (%1),%0 " : "+r"(crc0) : "r"(next), "m" (*(const char (*)[24]) pStr) )`;. Inlining or LTO could break this. See [at&t asm inline c++ problem](https://stackoverflow.com/posts/comments/81680441) (which needs an update: pointer-to-array is cleaner than a struct with a flexible array member). The existing test coverage wasn't enough to fully exercise Adler's code, so I added another test. Closes #52325 --- src/crc32c.c | 39 ++++++++++++++++++++-------------- stdlib/CRC32c/test/runtests.jl | 9 ++++++++ 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/crc32c.c b/src/crc32c.c index 4ca8db06459a1..844a6e34156e6 100644 --- a/src/crc32c.c +++ b/src/crc32c.c @@ -1,6 +1,6 @@ /* crc32c.c -- compute CRC-32C using software table or available hardware instructions - * Copyright (C) 2013 Mark Adler - * Version 1.1 1 Aug 2013 Mark Adler + * Copyright (C) 2013, 2021 Mark Adler + * Version 1.1 1 Aug 2013 Mark Adler, updates from Version 1.2 5 June 2021 * * Code retrieved in August 2016 from August 2013 post by Mark Adler on * http://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software @@ -10,6 +10,7 @@ * - architecture and compiler detection * - precompute crc32c tables and store in a generated .c file * - ARMv8 support + * Updated to incorporate upstream 2021 patch by Mark Adler to register constraints. */ /* @@ -39,6 +40,8 @@ /* Version history: 1.0 10 Feb 2013 First version 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel + 1.2 5 Jun 2021 Correct register constraints on assembly instructions + (+ other changes that were superfluous for us) */ #include "julia.h" @@ -98,16 +101,14 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len) to an eight-byte boundary */ while (len && ((uintptr_t)buf & 7) != 0) { __asm__("crc32b\t" "(%1), %0" - : "=r"(crc0) - : "r"(buf), "0"(crc0)); + : "+r"(crc0) + : "r"(buf), "m"(*buf)); buf++; len--; } - /* compute the crc on sets of LONG*3 bytes, executing three independent crc - instructions, each on LONG bytes -- this is optimized for the Nehalem, - Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a - throughput of one crc per cycle, but a latency of three cycles */ + /* compute the crc on sets of LONG*3 bytes, + making use of three ALUs in parallel on a single core. */ while (len >= LONG * 3) { uintptr_t crc1 = 0; uintptr_t crc2 = 0; @@ -116,8 +117,11 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len) __asm__(CRC32_PTR "\t" "(%3), %0\n\t" CRC32_PTR "\t" LONGx1 "(%3), %1\n\t" CRC32_PTR "\t" LONGx2 "(%3), %2" - : "=r"(crc0), "=r"(crc1), "=r"(crc2) - : "r"(buf), "0"(crc0), "1"(crc1), "2"(crc2)); + : "+r"(crc0), "+r"(crc1), "+r"(crc2) + : "r"(buf), + "m"(* (const char (*)[sizeof(void*)]) &buf[0]), + "m"(* (const char (*)[sizeof(void*)]) &buf[LONG]), + "m"(* (const char (*)[sizeof(void*)]) &buf[LONG*2])); buf += sizeof(void*); } while (buf < end); crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1; @@ -136,8 +140,11 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len) __asm__(CRC32_PTR "\t" "(%3), %0\n\t" CRC32_PTR "\t" SHORTx1 "(%3), %1\n\t" CRC32_PTR "\t" SHORTx2 "(%3), %2" - : "=r"(crc0), "=r"(crc1), "=r"(crc2) - : "r"(buf), "0"(crc0), "1"(crc1), "2"(crc2)); + : "+r"(crc0), "+r"(crc1), "+r"(crc2) + : "r"(buf), + "m"(* (const char (*)[sizeof(void*)]) &buf[0]), + "m"(* (const char (*)[sizeof(void*)]) &buf[SHORT]), + "m"(* (const char (*)[sizeof(void*)]) &buf[SHORT*2])); buf += sizeof(void*); } while (buf < end); crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1; @@ -151,8 +158,8 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len) const char *end = buf + (len - (len & 7)); while (buf < end) { __asm__(CRC32_PTR "\t" "(%1), %0" - : "=r"(crc0) - : "r"(buf), "0"(crc0)); + : "+r"(crc0) + : "r"(buf), "m"(* (const char (*)[sizeof(void*)]) buf)); buf += sizeof(void*); } len &= 7; @@ -160,8 +167,8 @@ static uint32_t crc32c_sse42(uint32_t crc, const char *buf, size_t len) /* compute the crc for up to seven trailing bytes */ while (len) { __asm__("crc32b\t" "(%1), %0" - : "=r"(crc0) - : "r"(buf), "0"(crc0)); + : "+r"(crc0) + : "r"(buf), "m"(*buf)); buf++; len--; } diff --git a/stdlib/CRC32c/test/runtests.jl b/stdlib/CRC32c/test/runtests.jl index e9e933ee2451c..41a7ea2ab62fa 100644 --- a/stdlib/CRC32c/test/runtests.jl +++ b/stdlib/CRC32c/test/runtests.jl @@ -45,6 +45,15 @@ function test_crc32c(crc32c) rm(f, force=true) end end + + # test longer arrays to cover all the code paths in crc32c.c + LONG = 8192 # from crc32c.c + SHORT = 256 # from crc32c.c + n = LONG*3+SHORT*3+SHORT*2+64+7 + big = vcat(reinterpret(UInt8, hton.(0x74d7f887 .^ (1:nĂ·4))), UInt8[1:n%4;]) + for (offset,crc) in [(0, 0x13a5ecd5), (1, 0xecf34b7e), (2, 0xfa71b596), (3, 0xbfd24745), (4, 0xf0cb3370), (5, 0xb0ec88b5), (6, 0x258c20a8), (7, 0xa9bd638d)] + @test crc == crc32c(@view big[1+offset:end]) + end end unsafe_crc32c_sw(a, n, crc) = ccall(:jl_crc32c_sw, UInt32, (UInt32, Ptr{UInt8}, Csize_t), crc, a, n)