@@ -242,17 +242,21 @@ char *strrchr(const char *s, int c) {
 // SIMDized check which bytes are in a set (Geoff Langdale)
 // http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html

+// This is the same algorithm as truffle from Hyperscan:
+// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/truffle.c#L64-L81
+// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/trufflecompile.cpp
+
 typedef struct {
   __u8x16 lo;
   __u8x16 hi;
 } __wasm_v128_bitmap256_t;

 __attribute__((always_inline))
-static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
-  uint8_t hi_nibble = (uint8_t)i >> 4;
-  uint8_t lo_nibble = (uint8_t)i & 0xf;
-  bitmap->lo[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 0));
-  bitmap->hi[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 8));
+static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, uint8_t i) {
+  uint8_t hi_nibble = i >> 4;
+  uint8_t lo_nibble = i & 0xf;
+  bitmap->lo[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 0));
+  bitmap->hi[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 8));
 }

 #ifndef __wasm_relaxed_simd__
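For readers new to the technique: the two 16-byte tables form a 256-bit set-membership bitmap. A byte's low nibble picks a table entry and its high nibble picks a bit within it, with high nibbles 0-7 kept in lo and 8-15 in hi. The patched setter writes both tables unconditionally; it appears to rely on WebAssembly's mod-32 shift count plus the cast to uint8_t to zero out the bit that would land in the wrong table. A minimal scalar model, using an explicit branch to stay in portable C (all names here are illustrative, not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    // Scalar stand-in for __wasm_v128_bitmap256_t.
    typedef struct {
      uint8_t lo[16]; // rows for high nibbles 0-7
      uint8_t hi[16]; // rows for high nibbles 8-15
    } bitmap256;

    static void bitmap256_set(bitmap256 *b, uint8_t i) {
      uint8_t hi_nibble = i >> 4;  // selects the bit within a row
      uint8_t lo_nibble = i & 0xf; // selects the row
      if (hi_nibble < 8)
        b->lo[lo_nibble] |= (uint8_t)(1u << hi_nibble);
      else
        b->hi[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 8));
    }

    static bool bitmap256_test(const bitmap256 *b, uint8_t i) {
      uint8_t hi_nibble = i >> 4;
      uint8_t lo_nibble = i & 0xf;
      uint8_t row = hi_nibble < 8 ? b->lo[lo_nibble] : b->hi[lo_nibble];
      return row & (uint8_t)(1u << (hi_nibble & 7));
    }

The vector version in the next hunk performs this same lookup for 16 bytes at once, using two swizzles in place of the branch.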
@@ -264,18 +268,17 @@ static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
 __attribute__((always_inline))
 static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
   v128_t hi_nibbles = wasm_u8x16_shr(v, 4);
-  v128_t bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, //
-                                           1, 2, 4, 8, 16, 32, 64, 128);
+  v128_t bitmask_lookup = wasm_u64x2_const_splat(0x8040201008040201);
   v128_t bitmask = wasm_i8x16_relaxed_swizzle(bitmask_lookup, hi_nibbles);

   v128_t indices_0_7 = v & wasm_u8x16_const_splat(0x8f);
   v128_t indices_8_15 = indices_0_7 ^ wasm_u8x16_const_splat(0x80);

-  v128_t row_0_7 = wasm_i8x16_swizzle(bitmap.lo, indices_0_7);
-  v128_t row_8_15 = wasm_i8x16_swizzle(bitmap.hi, indices_8_15);
+  v128_t row_0_7 = wasm_i8x16_swizzle((v128_t)bitmap.lo, indices_0_7);
+  v128_t row_8_15 = wasm_i8x16_swizzle((v128_t)bitmap.hi, indices_8_15);

   v128_t bitsets = row_0_7 | row_8_15;
-  return wasm_i8x16_eq(bitsets & bitmask, bitmask);
+  return bitsets & bitmask;
 }

 #undef wasm_i8x16_relaxed_swizzle
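Two changes here. First, __wasm_v128_chkbits now returns the raw bitsets & bitmask, so a nonzero lane means the byte is in the set; the wasm_i8x16_eq moves into the callers, which only need a full comparison mask on the slow path. Second, the byte table 1, 2, 4, ..., 128 is re-expressed as a u64 splat: v128 lanes are little-endian, so 0x8040201008040201 lays down the bytes 01 02 04 08 10 20 40 80 twice, the same table in one constant. A sketch of that equivalence (the function name is mine, for illustration):

    #include <assert.h>
    #include <wasm_simd128.h>

    // Illustrative check, not part of the patch: the u64 splat and the
    // explicit byte table produce the same 128-bit constant on wasm's
    // little-endian lane order.
    void check_bitmask_lookup(void) {
      v128_t bytes = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128,
                                      1, 2, 4, 8, 16, 32, 64, 128);
      v128_t splat = wasm_u64x2_const_splat(0x8040201008040201);
      assert(wasm_i8x16_all_true(wasm_i8x16_eq(bytes, splat)));
    }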
@@ -317,17 +320,18 @@ size_t strspn(const char *s, const char *c) {

   for (; *c; c++) {
     // Terminator IS NOT on the bitmap.
-    __wasm_v128_setbit(&bitmap, *c);
+    __wasm_v128_setbit(&bitmap, (uint8_t)*c);
   }

   for (;;) {
     v128_t v = *(v128_t *)addr;
-    v128_t cmp = __wasm_v128_chkbits(bitmap, v);
+    v128_t found = __wasm_v128_chkbits(bitmap, v);
     // Bitmask is slow on AArch64, all_true is much faster.
-    if (!wasm_i8x16_all_true(cmp)) {
+    if (!wasm_i8x16_all_true(found)) {
+      v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
       // Clear the bits corresponding to align (little-endian)
       // so we can count trailing zeros.
-      int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
+      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
       // At least one bit will be set, unless align cleared them.
       // Knowing this helps the compiler if it unrolls the loop.
       __builtin_assume(mask || align);
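With chkbits returning raw bits, wasm_i8x16_eq(found, (v128_t){}) marks exactly the lanes whose byte is NOT in the set, which is what strspn stops on; the old (uint16_t)~ flip of the bitmask is therefore dropped here (it reappears in strcspn below), and the eq now sits on the cold path behind the cheap all_true test. The trailing-zeros idiom the comments describe, as a self-contained sketch (the helper name is mine, not from the patch):

    #include <wasm_simd128.h>

    // Illustrative helper: lane index of the first 0xff byte of a
    // comparison result, via the 16-bit lane bitmask.
    static int first_matching_lane(v128_t cmp) {
      int mask = wasm_i8x16_bitmask(cmp); // bit i set iff lane i is 0xff
      return mask ? __builtin_ctz(mask)   // index of the lowest set bit
                  : -1;                   // no lane matched
    }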
@@ -356,17 +360,18 @@ size_t strcspn(const char *s, const char *c) {

   do {
     // Terminator IS on the bitmap.
-    __wasm_v128_setbit(&bitmap, *c);
+    __wasm_v128_setbit(&bitmap, (uint8_t)*c);
   } while (*c++);

   for (;;) {
     v128_t v = *(v128_t *)addr;
-    v128_t cmp = __wasm_v128_chkbits(bitmap, v);
+    v128_t found = __wasm_v128_chkbits(bitmap, v);
     // Bitmask is slow on AArch64, any_true is much faster.
-    if (wasm_v128_any_true(cmp)) {
+    if (wasm_v128_any_true(found)) {
+      v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
       // Clear the bits corresponding to align (little-endian)
       // so we can count trailing zeros.
-      int mask = wasm_i8x16_bitmask(cmp) >> align << align;
+      int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
       // At least one bit will be set, unless align cleared them.
       // Knowing this helps the compiler if it unrolls the loop.
       __builtin_assume(mask || align);
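strcspn is the mirror image: wasm_v128_any_true fires when some byte IS in the set, but cmp marks the bytes that are NOT, so here the bitmask must be inverted, and the (uint16_t) cast keeps the inverted bits 16-31 from reaching the trailing-zeros count. Observable behavior is unchanged either way; a quick sanity check of the pair, for illustration:

    #include <assert.h>
    #include <string.h>

    // Illustrative only: both functions keep their ISO C semantics.
    int main(void) {
      const char *s = "abc123;def";
      assert(strspn(s, "abcdef") == 3);       // "abc" are all in the set
      assert(strcspn(s, ";") == 6);           // "abc123" precede the ';'
      assert(strcspn(s, "xyz") == strlen(s)); // no byte of the set occurs
      return 0;
    }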