Commit

deps: upgrade base64 to dd7a2b5f318

mscdex committed Jul 28, 2022
1 parent f561f31 commit f35cb3d
Showing 3 changed files with 174 additions and 69 deletions.
9 changes: 7 additions & 2 deletions deps/base64/base64/lib/arch/neon64/codec.c
@@ -58,8 +58,13 @@ load_64byte_table (const uint8_t *p)
#include "../generic/32/dec_loop.c"
#include "../generic/64/enc_loop.c"
#include "dec_loop.c"
#include "enc_reshuffle.c"
#include "enc_loop.c"

#ifdef BASE64_NEON64_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_reshuffle.c"
# include "enc_loop.c"
#endif

#endif // BASE64_USE_NEON64

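Note: this hunk does not define BASE64_NEON64_USE_ASM itself; the guard is
assumed to be set earlier in codec.c (or by the build system) only when the
compiler accepts GCC-style extended inline assembly. A minimal sketch of such
a guard, not part of this diff, might be:

#if defined(BASE64_USE_NEON64) && (defined(__GNUC__) || defined(__clang__))
# define BASE64_NEON64_USE_ASM
#endif
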
67 changes: 0 additions & 67 deletions deps/base64/base64/lib/arch/neon64/enc_loop.c
@@ -1,72 +1,6 @@
#ifdef BASE64_NEON64_USE_ASM
static inline void
enc_loop_neon64_inner_asm (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
// This function duplicates the functionality of enc_loop_neon64_inner,
// but entirely with inline assembly. This gives a significant speedup
// over using NEON intrinsics, which do not always generate very good
// code. The logic of the assembly is directly lifted from the
// intrinsics version, so it can be used as a guide to this code.

// Temporary registers, used as scratch space.
uint8x16_t tmp0, tmp1, tmp2, tmp3;

// Numeric constant.
const uint8x16_t n63 = vdupq_n_u8(63);

__asm__ (

// Load 48 bytes and deinterleave. The bytes are loaded to
// hard-coded registers v12, v13 and v14, to ensure that they
// are contiguous. Increment the source pointer.
"ld3 {v12.16b, v13.16b, v14.16b}, [%[src]], #48 \n\t"

// Reshuffle the bytes using temporaries.
"ushr %[t0].16b, v12.16b, #2 \n\t"
"ushr %[t1].16b, v13.16b, #4 \n\t"
"ushr %[t2].16b, v14.16b, #6 \n\t"
"sli %[t1].16b, v12.16b, #4 \n\t"
"sli %[t2].16b, v13.16b, #2 \n\t"
"and %[t1].16b, %[t1].16b, %[n63].16b \n\t"
"and %[t2].16b, %[t2].16b, %[n63].16b \n\t"
"and %[t3].16b, v14.16b, %[n63].16b \n\t"

// Translate the values to the Base64 alphabet.
"tbl v12.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t0].16b \n\t"
"tbl v13.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t1].16b \n\t"
"tbl v14.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t2].16b \n\t"
"tbl v15.16b, {%[l0].16b, %[l1].16b, %[l2].16b, %[l3].16b}, %[t3].16b \n\t"

// Store 64 bytes and interleave. Increment the dest pointer.
"st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [%[dst]], #64 \n\t"

// Outputs (modified).
: [src] "+r" (*s),
[dst] "+r" (*o),
[t0] "=&w" (tmp0),
[t1] "=&w" (tmp1),
[t2] "=&w" (tmp2),
[t3] "=&w" (tmp3)

// Inputs (not modified).
: [n63] "w" (n63),
[l0] "w" (tbl_enc.val[0]),
[l1] "w" (tbl_enc.val[1]),
[l2] "w" (tbl_enc.val[2]),
[l3] "w" (tbl_enc.val[3])

// Clobbers.
: "v12", "v13", "v14", "v15"
);
}
#endif

static inline void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
#ifdef BASE64_NEON64_USE_ASM
enc_loop_neon64_inner_asm(s, o, tbl_enc);
#else
// Load 48 bytes and deinterleave:
uint8x16x3_t src = vld3q_u8(*s);

@@ -86,7 +20,6 @@ enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)

*s += 48;
*o += 64;
#endif
}

static inline void
167 changes: 167 additions & 0 deletions deps/base64/base64/lib/arch/neon64/enc_loop_asm.c
@@ -0,0 +1,167 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.

// Generate a block of inline assembly that loads three user-defined registers
// P, Q, R from memory and deinterleaves them, post-incrementing the src
// pointer. The register set should be sequential.
#define LOAD(P, Q, R) \
"ld3 {"P".16b, "Q".16b, "R".16b}, [%[src]], #48 \n\t"

// Generate a block of inline assembly that takes three deinterleaved registers
// and shuffles the bytes. The output is in temporary registers t0..t3.
#define SHUF(P, Q, R) \
"ushr %[t0].16b, "P".16b, #2 \n\t" \
"ushr %[t1].16b, "Q".16b, #4 \n\t" \
"ushr %[t2].16b, "R".16b, #6 \n\t" \
"sli %[t1].16b, "P".16b, #4 \n\t" \
"sli %[t2].16b, "Q".16b, #2 \n\t" \
"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
"and %[t3].16b, "R".16b, %[n63].16b \n\t"

// Generate a block of inline assembly that takes temporary registers t0..t3
// and translates them to the base64 alphabet, using a table loaded into
// v8..v11. The output is in user-defined registers P..S.
#define TRAN(P, Q, R, S) \
"tbl "P".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
"tbl "Q".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
"tbl "R".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
"tbl "S".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"

// Generate a block of inline assembly that interleaves four registers and
// stores them, post-incrementing the destination pointer.
#define STOR(P, Q, R, S) \
"st4 {"P".16b, "Q".16b, "R".16b, "S".16b}, [%[dst]], #64 \n\t"

// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result.
#define ROUND() \
LOAD("v12", "v13", "v14") \
SHUF("v12", "v13", "v14") \
TRAN("v12", "v13", "v14", "v15") \
STOR("v12", "v13", "v14", "v15")

// Generate a block of assembly that generates a type A interleaved encoder
// round. It uses registers that were loaded by the previous type B round, and
// in turn loads registers for the next type B round.
#define ROUND_A() \
SHUF("v2", "v3", "v4") \
LOAD("v12", "v13", "v14") \
TRAN("v2", "v3", "v4", "v5") \
STOR("v2", "v3", "v4", "v5")

// Type B interleaved encoder round. Same as type A, but register sets swapped.
#define ROUND_B() \
SHUF("v12", "v13", "v14") \
LOAD("v2", "v3", "v4") \
TRAN("v12", "v13", "v14", "v15") \
STOR("v12", "v13", "v14", "v15")

// The first type A round needs to load its own registers.
#define ROUND_A_FIRST() \
LOAD("v2", "v3", "v4") \
ROUND_A()

// The last type B round omits the load for the next step.
#define ROUND_B_LAST() \
SHUF("v12", "v13", "v14") \
TRAN("v12", "v13", "v14", "v15") \
STOR("v12", "v13", "v14", "v15")

// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"

static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
size_t rounds = *slen / 48;

if (rounds == 0) {
return;
}

*slen -= rounds * 48; // 48 bytes consumed per round.
*olen += rounds * 64; // 64 bytes produced per round.

// Number of times to go through the 8x loop.
size_t loops = rounds / 8;

// Number of rounds remaining after the 8x loop.
rounds %= 8;
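
// Worked example (illustrative): for *slen == 1000, rounds == 20, so 960
// input bytes are encoded here and 40 remain for the caller's tail handling;
// loops == 2 passes through the 8x loop below, and rounds %= 8 leaves 4,
// which is picked up by the 4x block.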

// Temporary registers, used as scratch space.
uint8x16_t tmp0, tmp1, tmp2, tmp3;

__asm__ volatile (

// Load the encoding table into v8..v11.
" ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"

// If there are eight rounds or more, enter an 8x unrolled loop
// of interleaved encoding rounds. The rounds interleave memory
// operations (load/store) with data operations to maximize
// pipeline throughput.
" cbz %[loops], 4f \n\t"

// The SIMD instructions do not touch the flags.
"88: subs %[loops], %[loops], #1 \n\t"
" " ROUND_A_FIRST()
" " ROUND_B()
" " ROUND_A()
" " ROUND_B()
" " ROUND_A()
" " ROUND_B()
" " ROUND_A()
" " ROUND_B_LAST()
" b.ne 88b \n\t"
// Enter a 4x unrolled loop for rounds of 4 or more.
"4: cmp %[rounds], #4 \n\t"
" b.lt 30f \n\t"
" " ROUND_A_FIRST()
" " ROUND_B()
" " ROUND_A()
" " ROUND_B_LAST()
" sub %[rounds], %[rounds], #4 \n\t"
// Dispatch the remaining rounds 0..3.
"30: cbz %[rounds], 0f \n\t"
" cmp %[rounds], #2 \n\t"
" b.eq 2f \n\t"
" b.lt 1f \n\t"

// Block of non-interleaved encoding rounds, which can each
// individually be jumped to. Rounds fall through to the next.
"3: " ROUND()
"2: " ROUND()
"1: " ROUND()
"0: \n\t"

// Outputs (modified).
: [loops] "+r" (loops),
[src] "+r" (*s),
[dst] "+r" (*o),
[t0] "=&w" (tmp0),
[t1] "=&w" (tmp1),
[t2] "=&w" (tmp2),
[t3] "=&w" (tmp3)

// Inputs (not modified).
: [rounds] "r" (rounds),
[tbl] "r" (base64_table_enc_6bit),
[n63] "w" (vdupq_n_u8(63))

// Clobbers.
: "v2", "v3", "v4", "v5",
"v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15"
);
}

#pragma GCC diagnostic pop
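
For context, the surrounding NEON64 codec is expected to drive this loop
roughly as follows. The wrapper name and tail handling here are illustrative
assumptions for this sketch, not part of the commit:

static void
enc_neon64 (const uint8_t *src, size_t srclen, uint8_t *out, size_t *outlen)
{
	*outlen = 0;

	// Encode as many whole 48-byte input blocks as possible; the loop
	// advances src/out and updates srclen/outlen as it goes.
	enc_loop_neon64(&src, &srclen, &out, outlen);

	// On return, srclen holds the 0..47 leftover bytes, which a generic
	// scalar encoder is assumed to finish (including '=' padding).
}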
