Skip to content

Commit 8f9a8e2

Browse files
committed
Learnings from truffle.
1 parent d8880e4 commit 8f9a8e2

File tree

1 file changed

+23
-18
lines changed

1 file changed

+23
-18
lines changed

sqlite3/libc/string.h

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -242,17 +242,21 @@ char *strrchr(const char *s, int c) {
242242
// SIMDized check which bytes are in a set (Geoff Langdale)
243243
// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
244244

245+
// This is the same algorithm as truffle from Hyperscan:
246+
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/truffle.c#L64-L81
247+
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/trufflecompile.cpp
248+
245249
typedef struct {
246250
__u8x16 lo;
247251
__u8x16 hi;
248252
} __wasm_v128_bitmap256_t;
249253

250254
__attribute__((always_inline))
251-
static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
252-
uint8_t hi_nibble = (uint8_t)i >> 4;
253-
uint8_t lo_nibble = (uint8_t)i & 0xf;
254-
bitmap->lo[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 0));
255-
bitmap->hi[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 8));
255+
static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, uint8_t i) {
256+
uint8_t hi_nibble = i >> 4;
257+
uint8_t lo_nibble = i & 0xf;
258+
bitmap->lo[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 0));
259+
bitmap->hi[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 8));
256260
}
257261

258262
#ifndef __wasm_relaxed_simd__
@@ -264,18 +268,17 @@ static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
264268
__attribute__((always_inline))
265269
static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
266270
v128_t hi_nibbles = wasm_u8x16_shr(v, 4);
267-
v128_t bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, //
268-
1, 2, 4, 8, 16, 32, 64, 128);
271+
v128_t bitmask_lookup = wasm_u64x2_const_splat(0x8040201008040201);
269272
v128_t bitmask = wasm_i8x16_relaxed_swizzle(bitmask_lookup, hi_nibbles);
270273

271274
v128_t indices_0_7 = v & wasm_u8x16_const_splat(0x8f);
272275
v128_t indices_8_15 = indices_0_7 ^ wasm_u8x16_const_splat(0x80);
273276

274-
v128_t row_0_7 = wasm_i8x16_swizzle(bitmap.lo, indices_0_7);
275-
v128_t row_8_15 = wasm_i8x16_swizzle(bitmap.hi, indices_8_15);
277+
v128_t row_0_7 = wasm_i8x16_swizzle((v128_t)bitmap.lo, indices_0_7);
278+
v128_t row_8_15 = wasm_i8x16_swizzle((v128_t)bitmap.hi, indices_8_15);
276279

277280
v128_t bitsets = row_0_7 | row_8_15;
278-
return wasm_i8x16_eq(bitsets & bitmask, bitmask);
281+
return bitsets & bitmask;
279282
}
280283

281284
#undef wasm_i8x16_relaxed_swizzle
@@ -317,17 +320,18 @@ size_t strspn(const char *s, const char *c) {
317320

318321
for (; *c; c++) {
319322
// Terminator IS NOT on the bitmap.
320-
__wasm_v128_setbit(&bitmap, *c);
323+
__wasm_v128_setbit(&bitmap, (uint8_t)*c);
321324
}
322325

323326
for (;;) {
324327
v128_t v = *(v128_t *)addr;
325-
v128_t cmp = __wasm_v128_chkbits(bitmap, v);
328+
v128_t found = __wasm_v128_chkbits(bitmap, v);
326329
// Bitmask is slow on AArch64, all_true is much faster.
327-
if (!wasm_i8x16_all_true(cmp)) {
330+
if (!wasm_i8x16_all_true(found)) {
331+
v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
328332
// Clear the bits corresponding to align (little-endian)
329333
// so we can count trailing zeros.
330-
int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
334+
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
331335
// At least one bit will be set, unless align cleared them.
332336
// Knowing this helps the compiler if it unrolls the loop.
333337
__builtin_assume(mask || align);
@@ -356,17 +360,18 @@ size_t strcspn(const char *s, const char *c) {
356360

357361
do {
358362
// Terminator IS on the bitmap.
359-
__wasm_v128_setbit(&bitmap, *c);
363+
__wasm_v128_setbit(&bitmap, (uint8_t)*c);
360364
} while (*c++);
361365

362366
for (;;) {
363367
v128_t v = *(v128_t *)addr;
364-
v128_t cmp = __wasm_v128_chkbits(bitmap, v);
368+
v128_t found = __wasm_v128_chkbits(bitmap, v);
365369
// Bitmask is slow on AArch64, any_true is much faster.
366-
if (wasm_v128_any_true(cmp)) {
370+
if (wasm_v128_any_true(found)) {
371+
v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
367372
// Clear the bits corresponding to align (little-endian)
368373
// so we can count trailing zeros.
369-
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
374+
int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
370375
// At least one bit will be set, unless align cleared them.
371376
// Knowing this helps the compiler if it unrolls the loop.
372377
__builtin_assume(mask || align);

0 commit comments

Comments
 (0)