From ac45c1420ce8800bf329cc4e274cd00367ff7eff Mon Sep 17 00:00:00 2001
From: Alfred Klomp
Date: Tue, 19 Jul 2022 01:20:02 +0200
Subject: [PATCH 1/2] NEON64: enc: split encoder into ASM and C implementations

Create a second source file for the inline assembly implementation of
the encoder.
---
 lib/arch/neon64/codec.c        |   9 ++-
 lib/arch/neon64/enc_loop.c     |  67 ---------------------
 lib/arch/neon64/enc_loop_asm.c | 103 +++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 69 deletions(-)
 create mode 100644 lib/arch/neon64/enc_loop_asm.c

diff --git a/lib/arch/neon64/codec.c b/lib/arch/neon64/codec.c
index fc953b23..79789bb7 100644
--- a/lib/arch/neon64/codec.c
+++ b/lib/arch/neon64/codec.c
@@ -58,8 +58,13 @@ load_64byte_table (const uint8_t *p)
 #include "../generic/32/dec_loop.c"
 #include "../generic/64/enc_loop.c"
 #include "dec_loop.c"
-#include "enc_reshuffle.c"
-#include "enc_loop.c"
+
+#ifdef BASE64_NEON64_USE_ASM
+# include "enc_loop_asm.c"
+#else
+# include "enc_reshuffle.c"
+# include "enc_loop.c"
+#endif
 
 #endif // BASE64_USE_NEON64
 
diff --git a/lib/arch/neon64/enc_loop.c b/lib/arch/neon64/enc_loop.c
index d1862f7a..59a1c597 100644
--- a/lib/arch/neon64/enc_loop.c
+++ b/lib/arch/neon64/enc_loop.c
@@ -1,72 +1,6 @@
-#ifdef BASE64_NEON64_USE_ASM
-static inline void
-enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
-{
-	// This function duplicates the functionality of enc_loop_neon64_inner,
-	// but entirely with inline assembly. This gives a significant speedup
-	// over using NEON intrinsics, which do not always generate very good
-	// code. The logic of the assembly is directly lifted from the
-	// intrinsics version, so it can be used as a guide to this code.
-
-	// Temporary registers, used as scratch space.
-	uint8x16_t tmp0, tmp1, tmp2, tmp3;
-
-	// Numeric constant.
-	const uint8x16_t n63 = vdupq_n_u8(63);
-
-	__asm__ (
-
-		// Load 48 bytes and deinterleave. The bytes are loaded to
-		// hard-coded registers v12, v13 and v14, to ensure that they
-		// are contiguous. Increment the source pointer.
-		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"
-
-		// Reshuffle the bytes using temporaries.
-		"ushr %[t0].16b, v12.16b, #2 \n\t"
-		"ushr %[t1].16b, v13.16b, #4 \n\t"
-		"ushr %[t2].16b, v14.16b, #6 \n\t"
-		"sli %[t1].16b, v12.16b, #4 \n\t"
-		"sli %[t2].16b, v13.16b, #2 \n\t"
-		"and %[t1].16b, %[t1].16b, %[n63].16b \n\t"
-		"and %[t2].16b, %[t2].16b, %[n63].16b \n\t"
-		"and %[t3].16b, v14.16b, %[n63].16b \n\t"
-
-		// Translate the values to the Base64 alphabet.
-		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
-		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
-		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
-		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"
-
-		// Store 64 bytes and interleave. Increment the dest pointer.
-		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"
-
-		// Outputs (modified).
-		: [src] "+r" (*s),
-		  [dst] "+r" (*o),
-		  [t0] "=&w" (tmp0),
-		  [t1] "=&w" (tmp1),
-		  [t2] "=&w" (tmp2),
-		  [t3] "=&w" (tmp3)
-
-		// Inputs (not modified).
-		: [n63] "w" (n63),
-		  [l0] "w" (tbl_enc.val[0]),
-		  [l1] "w" (tbl_enc.val[1]),
-		  [l2] "w" (tbl_enc.val[2]),
-		  [l3] "w" (tbl_enc.val[3])
-
-		// Clobbers.
-		: "v12", "v13", "v14", "v15"
-	);
-}
-#endif
-
 static inline void
 enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
 {
-#ifdef BASE64_NEON64_USE_ASM
-	enc_loop_neon64_inner_asm(s, o, tbl_enc);
-#else
 	// Load 48 bytes and deinterleave:
 	uint8x16x3_t src = vld3q_u8(*s);
 
@@ -86,7 +20,6 @@ enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_en
 
 	*s += 48;
 	*o += 64;
-#endif
 }
 
 static inline void
diff --git a/lib/arch/neon64/enc_loop_asm.c b/lib/arch/neon64/enc_loop_asm.c
new file mode 100644
index 00000000..6aa36893
--- /dev/null
+++ b/lib/arch/neon64/enc_loop_asm.c
@@ -0,0 +1,103 @@
+static inline void
+enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
+{
+	// This function duplicates the functionality of enc_loop_neon64_inner,
+	// but entirely with inline assembly. This gives a significant speedup
+	// over using NEON intrinsics, which do not always generate very good
+	// code. The logic of the assembly is directly lifted from the
+	// intrinsics version, so it can be used as a guide to this code.
+
+	// Temporary registers, used as scratch space.
+	uint8x16_t tmp0, tmp1, tmp2, tmp3;
+
+	// Numeric constant.
+	const uint8x16_t n63 = vdupq_n_u8(63);
+
+	__asm__ (
+
+		// Load 48 bytes and deinterleave. The bytes are loaded to
+		// hard-coded registers v12, v13 and v14, to ensure that they
+		// are contiguous. Increment the source pointer.
+		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"
+
+		// Reshuffle the bytes using temporaries.
+		"ushr %[t0].16b, v12.16b, #2 \n\t"
+		"ushr %[t1].16b, v13.16b, #4 \n\t"
+		"ushr %[t2].16b, v14.16b, #6 \n\t"
+		"sli %[t1].16b, v12.16b, #4 \n\t"
+		"sli %[t2].16b, v13.16b, #2 \n\t"
+		"and %[t1].16b, %[t1].16b, %[n63].16b \n\t"
+		"and %[t2].16b, %[t2].16b, %[n63].16b \n\t"
+		"and %[t3].16b, v14.16b, %[n63].16b \n\t"
+
+		// Translate the values to the Base64 alphabet.
+		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
+		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
+		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
+		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"
+
+		// Store 64 bytes and interleave. Increment the dest pointer.
+		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"
+
+		// Outputs (modified).
+		: [src] "+r" (*s),
+		  [dst] "+r" (*o),
+		  [t0] "=&w" (tmp0),
+		  [t1] "=&w" (tmp1),
+		  [t2] "=&w" (tmp2),
+		  [t3] "=&w" (tmp3)
+
+		// Inputs (not modified).
+		: [n63] "w" (n63),
+		  [l0] "w" (tbl_enc.val[0]),
+		  [l1] "w" (tbl_enc.val[1]),
+		  [l2] "w" (tbl_enc.val[2]),
+		  [l3] "w" (tbl_enc.val[3])
+
+		// Clobbers.
+		: "v12", "v13", "v14", "v15"
+	);
+}
+
+static inline void
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;
+
+	*slen -= rounds * 48; // 48 bytes consumed per round.
+	*olen += rounds * 64; // 64 bytes produced per round.
+
+	// Load the encoding table.
+	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
+
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			enc_loop_neon64_inner(s, o, tbl_enc);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		break;
+	}
+}

From 76dbd2d91ccbabfe6feaf99a0ec0261941808eac Mon Sep 17 00:00:00 2001
From: Alfred Klomp
Date: Tue, 19 Jul 2022 02:31:05 +0200
Subject: [PATCH 2/2] NEON64: enc: convert full encoding loop to inline assembly

Convert the full encoding loop to an inline assembly implementation for
compilers that support inline assembly.

The motivation for this change is issue #96: when optimization is turned
off on recent versions of clang, the encoding table is sometimes not
loaded into sequential registers. This happens despite taking pains to
ensure that the compiler uses an explicit set of registers for the load
(v8..v11). This leaves us with few options besides rewriting the full
encoding loop in inline assembly. Only then can we be absolutely certain
that the right registers are used. Thankfully, AArch64 assembly is not
very difficult to write by hand.

In making this change, we optimize the unrolled loops for rounds >= 8 by
interleaving memory operations (loads, stores) with data operations
(arithmetic, table lookups). Splitting these two classes of instructions
helps avoid pipeline stalls caused by data dependencies. The current loop
iteration also prefetches the data needed in the next iteration.

To allow that without duplicating massive amounts of code, we abstract
the various assembly blocks into preprocessor macros and instantiate them
as needed. This mixing of the preprocessor with inline assembly is
perhaps a bit gnarly, but I think the usage is simple enough that the
advantages (code reuse) outweigh the disadvantages.

Code was tested on a Debian VM running under QEMU. Unfortunately, testing
in a VM does not let us measure the actual performance impact.
---
 lib/arch/neon64/enc_loop_asm.c | 230 +++++++++++++++++++++------------
 1 file changed, 147 insertions(+), 83 deletions(-)

diff --git a/lib/arch/neon64/enc_loop_asm.c b/lib/arch/neon64/enc_loop_asm.c
index 6aa36893..a646527b 100644
--- a/lib/arch/neon64/enc_loop_asm.c
+++ b/lib/arch/neon64/enc_loop_asm.c
@@ -1,103 +1,167 @@
-static inline void
-enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
-{
-	// This function duplicates the functionality of enc_loop_neon64_inner,
-	// but entirely with inline assembly. This gives a significant speedup
-	// over using NEON intrinsics, which do not always generate very good
-	// code. The logic of the assembly is directly lifted from the
-	// intrinsics version, so it can be used as a guide to this code.
+// Apologies in advance for combining the preprocessor with inline assembly,
+// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
+// code repetition. The preprocessor is used to template large sections of
+// inline assembly that differ only in the registers used. If the code was
+// written out by hand, it would become very large and hard to audit.
 
-	// Temporary registers, used as scratch space.
-	uint8x16_t tmp0, tmp1, tmp2, tmp3;
+// Generate a block of inline assembly that loads three user-defined registers
+// P, Q, R from memory and deinterleaves them, post-incrementing the src
+// pointer. The register set should be sequential.
+#define LOAD(P, Q, R) \
+	"ld3 {"P".16b, "Q".16b, "R".16b}, [%[src]], #48 \n\t"
 
-	// Numeric constant.
-	const uint8x16_t n63 = vdupq_n_u8(63);
+// Generate a block of inline assembly that takes three deinterleaved registers
+// and shuffles the bytes. The output is in temporary registers t0..t3.
+#define SHUF(P, Q, R) \
+	"ushr %[t0].16b, "P".16b, #2 \n\t" \
+	"ushr %[t1].16b, "Q".16b, #4 \n\t" \
+	"ushr %[t2].16b, "R".16b, #6 \n\t" \
+	"sli %[t1].16b, "P".16b, #4 \n\t" \
+	"sli %[t2].16b, "Q".16b, #2 \n\t" \
+	"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
+	"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
+	"and %[t3].16b, "R".16b, %[n63].16b \n\t"
 
-	__asm__ (
+// Generate a block of inline assembly that takes temporary registers t0..t3
+// and translates them to the base64 alphabet, using a table loaded into
+// v8..v11. The output is in user-defined registers P..S.
+#define TRAN(P, Q, R, S) \
+	"tbl "P".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
+	"tbl "Q".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
+	"tbl "R".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
+	"tbl "S".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
 
-		// Load 48 bytes and deinterleave. The bytes are loaded to
-		// hard-coded registers v12, v13 and v14, to ensure that they
-		// are contiguous. Increment the source pointer.
-		"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"
+// Generate a block of inline assembly that interleaves four registers and
+// stores them, post-incrementing the destination pointer.
+#define STOR(P, Q, R, S) \
+	"st4 {"P".16b, "Q".16b, "R".16b, "S".16b}, [%[dst]], #64 \n\t"
 
-		// Reshuffle the bytes using temporaries.
-		"ushr %[t0].16b, v12.16b, #2 \n\t"
-		"ushr %[t1].16b, v13.16b, #4 \n\t"
-		"ushr %[t2].16b, v14.16b, #6 \n\t"
-		"sli %[t1].16b, v12.16b, #4 \n\t"
-		"sli %[t2].16b, v13.16b, #2 \n\t"
-		"and %[t1].16b, %[t1].16b, %[n63].16b \n\t"
-		"and %[t2].16b, %[t2].16b, %[n63].16b \n\t"
-		"and %[t3].16b, v14.16b, %[n63].16b \n\t"
+// Generate a block of inline assembly that generates a single self-contained
+// encoder round: fetch the data, process it, and store the result.
+#define ROUND() \
+	LOAD("v12", "v13", "v14") \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
 
-		// Translate the values to the Base64 alphabet.
-		"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
-		"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
-		"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
-		"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"
+// Generate a block of assembly that generates a type A interleaved encoder
+// round. It uses registers that were loaded by the previous type B round, and
+// in turn loads registers for the next type B round.
+#define ROUND_A() \
+	SHUF("v2", "v3", "v4") \
+	LOAD("v12", "v13", "v14") \
+	TRAN("v2", "v3", "v4", "v5") \
+	STOR("v2", "v3", "v4", "v5")
 
-		// Store 64 bytes and interleave. Increment the dest pointer.
-		"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"
+// Type B interleaved encoder round. Same as type A, but register sets swapped.
+#define ROUND_B() \
+	SHUF("v12", "v13", "v14") \
+	LOAD("v2", "v3", "v4") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
 
-		// Outputs (modified).
-		: [src] "+r" (*s),
-		  [dst] "+r" (*o),
-		  [t0] "=&w" (tmp0),
-		  [t1] "=&w" (tmp1),
-		  [t2] "=&w" (tmp2),
-		  [t3] "=&w" (tmp3)
+// The first type A round needs to load its own registers.
+#define ROUND_A_FIRST() \
+	LOAD("v2", "v3", "v4") \
+	ROUND_A()
 
-		// Inputs (not modified).
-		: [n63] "w" (n63),
-		  [l0] "w" (tbl_enc.val[0]),
-		  [l1] "w" (tbl_enc.val[1]),
-		  [l2] "w" (tbl_enc.val[2]),
-		  [l3] "w" (tbl_enc.val[3])
+// The last type B round omits the load for the next step.
+#define ROUND_B_LAST() \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
 
-		// Clobbers.
-		: "v12", "v13", "v14", "v15"
-	);
-}
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
 
 static inline void
 enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 {
 	size_t rounds = *slen / 48;
 
+	if (rounds == 0) {
+		return;
+	}
+
 	*slen -= rounds * 48; // 48 bytes consumed per round.
 	*olen += rounds * 64; // 64 bytes produced per round.
 
-	// Load the encoding table.
-	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
-
-	while (rounds > 0) {
-		if (rounds >= 8) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 8;
-			continue;
-		}
-		if (rounds >= 4) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 4;
-			continue;
-		}
-		if (rounds >= 2) {
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			enc_loop_neon64_inner(s, o, tbl_enc);
-			rounds -= 2;
-			continue;
-		}
-		enc_loop_neon64_inner(s, o, tbl_enc);
-		break;
-	}
+	// Number of times to go through the 8x loop.
+	size_t loops = rounds / 8;
+
+	// Number of rounds remaining after the 8x loop.
+	rounds %= 8;
+
+	// Temporary registers, used as scratch space.
+	uint8x16_t tmp0, tmp1, tmp2, tmp3;
+
+	__asm__ volatile (
+
+		// Load the encoding table into v8..v11.
+		" ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"
+
+		// If there are eight rounds or more, enter an 8x unrolled loop
+		// of interleaved encoding rounds. The rounds interleave memory
+		// operations (load/store) with data operations to maximize
+		// pipeline throughput.
+		" cbz %[loops], 4f \n\t"
+
+		// The SIMD instructions do not touch the flags.
+		"88: subs %[loops], %[loops], #1 \n\t"
+		" " ROUND_A_FIRST()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B_LAST()
+		" b.ne 88b \n\t"
+
+		// Enter a 4x unrolled loop for rounds of 4 or more.
+		"4: cmp %[rounds], #4 \n\t"
+		" b.lt 30f \n\t"
+		" " ROUND_A_FIRST()
+		" " ROUND_B()
+		" " ROUND_A()
+		" " ROUND_B_LAST()
+		" sub %[rounds], %[rounds], #4 \n\t"
+
+		// Dispatch the remaining rounds 0..3.
+		"30: cbz %[rounds], 0f \n\t"
+		" cmp %[rounds], #2 \n\t"
+		" b.eq 2f \n\t"
+		" b.lt 1f \n\t"
+
+		// Block of non-interlaced encoding rounds, which can each
+		// individually be jumped to. Rounds fall through to the next.
+		"3: " ROUND()
+		"2: " ROUND()
+		"1: " ROUND()
+		"0: \n\t"
+
+		// Outputs (modified).
+		: [loops] "+r" (loops),
+		  [src] "+r" (*s),
+		  [dst] "+r" (*o),
+		  [t0] "=&w" (tmp0),
+		  [t1] "=&w" (tmp1),
+		  [t2] "=&w" (tmp2),
+		  [t3] "=&w" (tmp3)
+
+		// Inputs (not modified).
+		: [rounds] "r" (rounds),
+		  [tbl] "r" (base64_table_enc_6bit),
+		  [n63] "w" (vdupq_n_u8(63))
+
+		// Clobbers.
+		: "v2", "v3", "v4", "v5",
+		  "v8", "v9", "v10", "v11",
+		  "v12", "v13", "v14", "v15"
+	);
 }
 
+#pragma GCC diagnostic pop
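
Reviewer note, not part of the patch series: for readers less familiar with NEON, the minimal scalar C sketch below shows what one lane of the LOAD/SHUF/TRAN/STOR sequence computes for a single 3-byte group. The helper name enc_group_scalar and the b64_alphabet literal are illustrative assumptions; the literal merely stands in for the library's base64_table_enc_6bit table, which the tbl instructions index via v8..v11.

#include <stdint.h>

// Standard Base64 alphabet; stands in for the 64-byte encoding table that
// the assembly preloads into v8..v11.
static const char b64_alphabet[65] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";

// Encode one 3-byte group into four Base64 characters. The NEON loop does
// the same for 16 groups at a time: ld3 deinterleaves bytes a/b/c into three
// vectors, the ushr/sli/and sequence builds the four 6-bit indices, tbl does
// the alphabet lookup, and st4 re-interleaves the results on store.
static void
enc_group_scalar (const uint8_t in[3], uint8_t out[4])
{
	const uint8_t a = in[0], b = in[1], c = in[2];

	out[0] = (uint8_t) b64_alphabet[a >> 2];                     // ushr #2
	out[1] = (uint8_t) b64_alphabet[((a << 4) | (b >> 4)) & 63]; // ushr #4, sli #4, and
	out[2] = (uint8_t) b64_alphabet[((b << 2) | (c >> 6)) & 63]; // ushr #6, sli #2, and
	out[3] = (uint8_t) b64_alphabet[c & 63];                     // and
}

Fed the bytes of "Man", this produces "TWFu", matching what a single lane of the vector path emits; each 48-byte round processes 16 such groups.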