Skip to content

Commit

Permalink
Fix RocksDB SIGILL error on Raspberry PI 4 (#7233)
Browse files Browse the repository at this point in the history
Summary:
Issue:#7042

Without a runtime check for PMULL, RocksDB crashes with SIGILL on a Raspberry Pi 4.

Use 'getauxval' to read the hardware capabilities (HWCAP) and detect at
runtime whether the target platform supports PMULL.

Consider the case where the target platform supports crc32 but does not support PMULL.
In that case, the code should still use the crc32 instruction
rather than skipping all hardware crc32 instructions.

Pull Request resolved: #7233

Reviewed By: jay-zhuang

Differential Revision: D23790116

fbshipit-source-id: a3ebd821fbd4a38dd2f59064adbb7c3013ee8140
  • Loading branch information
guyuqi authored and facebook-github-bot committed Sep 22, 2020
1 parent 3591da3 commit 29f7bbe
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 44 deletions.
6 changes: 6 additions & 0 deletions util/crc32c.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@

#endif

#if defined(__linux__) && defined(HAVE_ARM64_CRC)
// Records the result of crc32c_pmull_runtime_check(). It starts out false and
// is assigned once the crc32 runtime check succeeds; crc32c_arm64() reads it
// to decide whether the PMULL-based parallel CRC32C loop may be executed.
bool pmull_runtime_flag = false;
#endif

namespace ROCKSDB_NAMESPACE {
namespace crc32c {

Expand Down Expand Up @@ -494,6 +498,7 @@ std::string IsFastCrc32Supported() {
if (crc32c_runtime_check()) {
has_fast_crc = true;
arch = "Arm64";
pmull_runtime_flag = crc32c_pmull_runtime_check();
} else {
has_fast_crc = false;
arch = "Arm64";
Expand Down Expand Up @@ -1224,6 +1229,7 @@ static inline Function Choose_Extend() {
return isAltiVec() ? ExtendPPCImpl : ExtendImpl<Slow_CRC32>;
#elif defined(__linux__) && defined(HAVE_ARM64_CRC)
if(crc32c_runtime_check()) {
pmull_runtime_flag = crc32c_pmull_runtime_check();
return ExtendARMImpl;
} else {
return ExtendImpl<Slow_CRC32>;
Expand Down
111 changes: 67 additions & 44 deletions util/crc32c_arm64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
/* Fallback definitions of the hardware-capability bit masks, used only when
 * no earlier header has already defined them. HWCAP_PMULL is tested against
 * the value returned by getauxval(AT_HWCAP); HWCAP_CRC32 is presumably used
 * the same way by crc32c_runtime_check() -- TODO confirm. */
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif
#ifndef HWCAP_PMULL
#define HWCAP_PMULL (1 << 4)
#endif

#ifdef HAVE_ARM64_CRYPTO
/* unfolding to compute 8 * 3 = 24 bytes parallelly */
Expand All @@ -35,6 +38,8 @@
} while (0)
#endif

extern bool pmull_runtime_flag;

uint32_t crc32c_runtime_check(void) {
#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
uint64_t auxv = getauxval(AT_HWCAP);
Expand All @@ -44,6 +49,15 @@ uint32_t crc32c_runtime_check(void) {
#endif
}

/*
 * Runtime probe for the PMULL (polynomial multiply) capability.
 *
 * Returns true only when getauxval() is available at build time
 * (ROCKSDB_AUXV_GETAUXVAL_PRESENT) and the AT_HWCAP word has the
 * HWCAP_PMULL bit set. Without getauxval() the capability cannot be
 * queried, so the function conservatively reports false.
 */
bool crc32c_pmull_runtime_check(void) {
#ifndef ROCKSDB_AUXV_GETAUXVAL_PRESENT
  /* No way to query the hardware capabilities: assume PMULL is absent. */
  return false;
#else
  const uint64_t hwcap_bits = getauxval(AT_HWCAP);
  const bool has_pmull = (hwcap_bits & HWCAP_PMULL) != 0;
  return has_pmull;
#endif
}

#ifdef ROCKSDB_UBSAN_RUN
#if defined(__clang__)
__attribute__((__no_sanitize__("alignment")))
Expand All @@ -58,6 +72,13 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
int length = (int)len;
crc ^= 0xffffffff;

/*
* Pmull runtime check here.
* Raspberry Pi supports crc32 but doesn't support pmull.
* Skip Crc32c Parallel computation if no crypto extension available.
*/
if (pmull_runtime_flag) {
/* Macro (HAVE_ARM64_CRYPTO) is used for compiling check */
#ifdef HAVE_ARM64_CRYPTO
/* Crc32c Parallel computation
* Algorithm comes from Intel whitepaper:
Expand All @@ -68,51 +89,53 @@ uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data,
* One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes
*/
#define BLK_LENGTH 42
while (length >= 1024) {
uint64_t t0, t1;
uint32_t crc0 = 0, crc1 = 0, crc2 = 0;

/* Parallel Param:
* k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
* k1 = CRC32(x ^ (42 * 8 * 8 - 1));
*/
uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;

/* Prefetch data for following block to avoid cache miss */
PREF1KL1((uint8_t *)buf64, 1024);

/* First 8 byte for better pipelining */
crc0 = crc32c_u64(crc, *buf64++);

/* 3 blocks crc32c parallel computation
* Macro unfolding to compute parallelly
* 168 * 6 = 1008 (bytes)
*/
CRC32C7X24BYTES(0);
CRC32C7X24BYTES(1);
CRC32C7X24BYTES(2);
CRC32C7X24BYTES(3);
CRC32C7X24BYTES(4);
CRC32C7X24BYTES(5);
buf64 += (BLK_LENGTH * 3);

/* Last 8 bytes */
crc = crc32c_u64(crc2, *buf64++);

t0 = (uint64_t)vmull_p64(crc0, k0);
t1 = (uint64_t)vmull_p64(crc1, k1);

/* Merge (crc0, crc1, crc2) -> crc */
crc1 = crc32c_u64(0, t1);
crc ^= crc1;
crc0 = crc32c_u64(0, t0);
crc ^= crc0;

length -= 1024;
}

if (length == 0) return crc ^ (0xffffffffU);
while (length >= 1024) {
uint64_t t0, t1;
uint32_t crc0 = 0, crc1 = 0, crc2 = 0;

/* Parallel Param:
* k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1));
* k1 = CRC32(x ^ (42 * 8 * 8 - 1));
*/
uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;

/* Prefetch data for following block to avoid cache miss */
PREF1KL1((uint8_t *)buf64, 1024);

/* First 8 byte for better pipelining */
crc0 = crc32c_u64(crc, *buf64++);

/* 3 blocks crc32c parallel computation
* Macro unfolding to compute parallelly
* 168 * 6 = 1008 (bytes)
*/
CRC32C7X24BYTES(0);
CRC32C7X24BYTES(1);
CRC32C7X24BYTES(2);
CRC32C7X24BYTES(3);
CRC32C7X24BYTES(4);
CRC32C7X24BYTES(5);
buf64 += (BLK_LENGTH * 3);

/* Last 8 bytes */
crc = crc32c_u64(crc2, *buf64++);

t0 = (uint64_t)vmull_p64(crc0, k0);
t1 = (uint64_t)vmull_p64(crc1, k1);

/* Merge (crc0, crc1, crc2) -> crc */
crc1 = crc32c_u64(0, t1);
crc ^= crc1;
crc0 = crc32c_u64(0, t0);
crc ^= crc0;

length -= 1024;
}

if (length == 0) return crc ^ (0xffffffffU);
#endif
} // if Pmull runtime check here

buf8 = (const uint8_t *)buf64;
while (length >= 8) {
crc = crc32c_u64(crc, *(const uint64_t *)buf8);
Expand Down
1 change: 1 addition & 0 deletions util/crc32c_arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, unsigned len);
extern uint32_t crc32c_runtime_check(void);
extern bool crc32c_pmull_runtime_check(void);

#ifdef __ARM_FEATURE_CRYPTO
#define HAVE_ARM64_CRYPTO
Expand Down

0 comments on commit 29f7bbe

Please sign in to comment.