diff --git a/lib/arch/avx/enc_loop_asm.c b/lib/arch/avx/enc_loop_asm.c index 27f1e88e..b24b4929 100644 --- a/lib/arch/avx/enc_loop_asm.c +++ b/lib/arch/avx/enc_loop_asm.c @@ -79,9 +79,9 @@ STOR(B, (ROUND + 1)) /* | - | | | */ \ TRAN(C, A, F) /* - | | | x */ \ LOAD(F, (ROUND + 5)) /* | | | + */ \ - SHUF(D, A, B) /* + + | | | | */ \ - STOR(C, (ROUND + 2)) /* | | - | | | */ \ - TRAN(D, A, B) /* - - V V V */ + SHUF(D, A, B) /* + x | | | | */ \ + STOR(C, (ROUND + 2)) /* | - | | | */ \ + TRAN(D, A, B) /* - x V V V */ // Define a macro that terminates a ROUND_3 macro by taking pre-loaded // registers D, E and F, and translating, shuffling and storing them. diff --git a/lib/arch/avx2/enc_loop_asm.c b/lib/arch/avx2/enc_loop_asm.c index 2338b1c1..87aba284 100644 --- a/lib/arch/avx2/enc_loop_asm.c +++ b/lib/arch/avx2/enc_loop_asm.c @@ -80,9 +80,9 @@ STOR(B, (ROUND + 1)) /* | - | | | */ \ TRAN(C, A, F) /* - | | | x */ \ LOAD(F, (ROUND + 5), -4) /* | | | + */ \ - SHUF(D, A, B) /* + + | | | | */ \ - STOR(C, (ROUND + 2)) /* | | - | | | */ \ - TRAN(D, A, B) /* - - V V V */ + SHUF(D, A, B) /* + x | | | | */ \ + STOR(C, (ROUND + 2)) /* | - | | | */ \ + TRAN(D, A, B) /* - x V V V */ // Define a macro that terminates a ROUND_3 macro by taking pre-loaded // registers D, E and F, and translating, shuffling and storing them. diff --git a/lib/arch/sse41/codec.c b/lib/arch/sse41/codec.c index 00645fed..6e5afe30 100644 --- a/lib/arch/sse41/codec.c +++ b/lib/arch/sse41/codec.c @@ -11,11 +11,25 @@ #if HAVE_SSE41 #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. +#ifndef BASE64_SSE41_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_SSE41_USE_ASM 1 +# else +# define BASE64_SSE41_USE_ASM 0 +# endif +#endif + #include "../ssse3/dec_reshuffle.c" #include "../ssse3/dec_loop.c" -#include "../ssse3/enc_translate.c" -#include "../ssse3/enc_reshuffle.c" -#include "../ssse3/enc_loop.c" + +#if BASE64_SSE41_USE_ASM +# include "../ssse3/enc_loop_asm.c" +#else +# include "../ssse3/enc_translate.c" +# include "../ssse3/enc_reshuffle.c" +# include "../ssse3/enc_loop.c" +#endif #endif // HAVE_SSE41 diff --git a/lib/arch/sse42/codec.c b/lib/arch/sse42/codec.c index cf5d97cf..dde823b7 100644 --- a/lib/arch/sse42/codec.c +++ b/lib/arch/sse42/codec.c @@ -11,11 +11,25 @@ #if HAVE_SSE42 #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. +#ifndef BASE64_SSE42_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_SSE42_USE_ASM 1 +# else +# define BASE64_SSE42_USE_ASM 0 +# endif +#endif + #include "../ssse3/dec_reshuffle.c" #include "../ssse3/dec_loop.c" -#include "../ssse3/enc_translate.c" -#include "../ssse3/enc_reshuffle.c" -#include "../ssse3/enc_loop.c" + +#if BASE64_SSE42_USE_ASM +# include "../ssse3/enc_loop_asm.c" +#else +# include "../ssse3/enc_translate.c" +# include "../ssse3/enc_reshuffle.c" +# include "../ssse3/enc_loop.c" +#endif #endif // HAVE_SSE42 diff --git a/lib/arch/ssse3/codec.c b/lib/arch/ssse3/codec.c index ad14a458..a812a290 100644 --- a/lib/arch/ssse3/codec.c +++ b/lib/arch/ssse3/codec.c @@ -11,11 +11,27 @@ #if HAVE_SSSE3 #include +// Only enable inline assembly on supported compilers and on 64-bit CPUs. +// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM +// registers, which is not enough to run the inline assembly. +#ifndef BASE64_SSSE3_USE_ASM +# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64 +# define BASE64_SSSE3_USE_ASM 1 +# else +# define BASE64_SSSE3_USE_ASM 0 +# endif +#endif + #include "dec_reshuffle.c" #include "dec_loop.c" -#include "enc_reshuffle.c" -#include "enc_translate.c" -#include "enc_loop.c" + +#if BASE64_SSSE3_USE_ASM +# include "enc_loop_asm.c" +#else +# include "enc_reshuffle.c" +# include "enc_translate.c" +# include "enc_loop.c" +#endif #endif // HAVE_SSSE3 diff --git a/lib/arch/ssse3/enc_loop_asm.c b/lib/arch/ssse3/enc_loop_asm.c new file mode 100644 index 00000000..60e22745 --- /dev/null +++ b/lib/arch/ssse3/enc_loop_asm.c @@ -0,0 +1,263 @@ +// Apologies in advance for combining the preprocessor with inline assembly, +// two notoriously gnarly parts of C, but it was necessary to avoid a lot of +// code repetition. The preprocessor is used to template large sections of +// inline assembly that differ only in the registers used. If the code was +// written out by hand, it would become very large and hard to audit. + +// Generate a block of inline assembly that loads register R0 from memory. The +// offset at which the register is loaded is set by the given round. +#define LOAD(R0, ROUND) \ + "lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t" + +// Generate a block of inline assembly that deinterleaves and shuffles register +// R0 using preloaded constants. Outputs in R0 and R1. +#define SHUF(R0, R1) \ + "pshufb %[lut0], %["R0"] \n\t" \ + "movdqa %["R0"], %["R1"] \n\t" \ + "pand %[msk0], %["R0"] \n\t" \ + "pand %[msk2], %["R1"] \n\t" \ + "pmulhuw %[msk1], %["R0"] \n\t" \ + "pmullw %[msk3], %["R1"] \n\t" \ + "por %["R1"], %["R0"] \n\t" + +// Generate a block of inline assembly that takes R0 and R1 and translates +// their contents to the base64 alphabet, using preloaded constants. +#define TRAN(R0, R1, R2) \ + "movdqa %["R0"], %["R1"] \n\t" \ + "movdqa %["R0"], %["R2"] \n\t" \ + "psubusb %[n51], %["R1"] \n\t" \ + "pcmpgtb %[n25], %["R2"] \n\t" \ + "psubb %["R2"], %["R1"] \n\t" \ + "movdqa %[lut1], %["R2"] \n\t" \ + "pshufb %["R1"], %["R2"] \n\t" \ + "paddb %["R2"], %["R0"] \n\t" + +// Generate a block of inline assembly that stores the given register R0 at an +// offset set by the given round. +#define STOR(R0, ROUND) \ + "movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t" + +// Generate a block of inline assembly that generates a single self-contained +// encoder round: fetch the data, process it, and store the result. Then update +// the source and destination pointers. +#define ROUND() \ + LOAD("a", 0) \ + SHUF("a", "b") \ + TRAN("a", "b", "c") \ + STOR("a", 0) \ + "add $12, %[src] \n\t" \ + "add $16, %[dst] \n\t" + +// Define a macro that initiates a three-way interleaved encoding round by +// preloading registers a, b and c from memory. +// The register graph shows which registers are in use during each step, and +// is a visual aid for choosing registers for that step. Symbol index: +// +// + indicates that a register is loaded by that step. +// | indicates that a register is in use and must not be touched. +// - indicates that a register is decommissioned by that step. +// x indicates that a register is used as a temporary by that step. +// V indicates that a register is an input or output to the macro. +// +#define ROUND_3_INIT() /* a b c d e f */ \ + LOAD("a", 0) /* + */ \ + SHUF("a", "d") /* | + */ \ + LOAD("b", 1) /* | + | */ \ + TRAN("a", "d", "e") /* | | - x */ \ + LOAD("c", 2) /* V V V */ + +// Define a macro that translates, shuffles and stores the input registers A, B +// and C, and preloads registers D, E and F for the next round. +// This macro can be arbitrarily daisy-chained by feeding output registers D, E +// and F back into the next round as input registers A, B and C. The macro +// carefully interleaves memory operations with data operations for optimal +// pipelined performance. + +#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + LOAD(D, (ROUND + 3)) /* V V V + */ \ + SHUF(B, E) /* | | | | + */ \ + STOR(A, (ROUND + 0)) /* - | | | | */ \ + TRAN(B, E, F) /* | | | - x */ \ + LOAD(E, (ROUND + 4)) /* | | | + */ \ + SHUF(C, A) /* + | | | | */ \ + STOR(B, (ROUND + 1)) /* | - | | | */ \ + TRAN(C, A, F) /* - | | | x */ \ + LOAD(F, (ROUND + 5)) /* | | | + */ \ + SHUF(D, A) /* + | | | | */ \ + STOR(C, (ROUND + 2)) /* | - | | | */ \ + TRAN(D, A, B) /* - x V V V */ + +// Define a macro that terminates a ROUND_3 macro by taking pre-loaded +// registers D, E and F, and translating, shuffling and storing them. +#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \ + SHUF(E, A) /* + V V V */ \ + STOR(D, (ROUND + 3)) /* | - | | */ \ + TRAN(E, A, B) /* - x | | */ \ + SHUF(F, C) /* + | | */ \ + STOR(E, (ROUND + 4)) /* | - | */ \ + TRAN(F, C, D) /* - x | */ \ + STOR(F, (ROUND + 5)) /* - */ + +// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f. +#define ROUND_3_A(ROUND) \ + ROUND_3(ROUND, "a", "b", "c", "d", "e", "f") + +// Define a type B round. Inputs and outputs are swapped with regard to type A. +#define ROUND_3_B(ROUND) \ + ROUND_3(ROUND, "d", "e", "f", "a", "b", "c") + +// Terminating macro for a type A round. +#define ROUND_3_A_LAST(ROUND) \ + ROUND_3_A(ROUND) \ + ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f") + +// Terminating macro for a type B round. +#define ROUND_3_B_LAST(ROUND) \ + ROUND_3_B(ROUND) \ + ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c") + +// Suppress clang's warning that the literal string in the asm statement is +// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99 +// compilers). It may be true, but the goal here is not C99 portability. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverlength-strings" + +static inline void +enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen) +{ + // For a clearer explanation of the algorithm used by this function, + // please refer to the plain (not inline assembly) implementation. This + // function follows the same basic logic. + + if (*slen < 16) { + return; + } + + // Process blocks of 12 bytes at a time. + size_t rounds = *slen / 12; + + *slen -= rounds * 12; // 12 bytes consumed per round + *olen += rounds * 16; // 16 bytes produced per round + + // Number of times to go through the 36x loop. + size_t loops = rounds / 36; + + // Number of rounds remaining after the 36x loop. + rounds %= 36; + + // Lookup tables. + const __m128i lut0 = _mm_set_epi8( + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + + const __m128i lut1 = _mm_setr_epi8( + 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); + + // Temporary registers. + __m128i a, b, c, d, e, f; + + __asm__ volatile ( + + // If there are 36 rounds or more, enter a 36x unrolled loop of + // interleaved encoding rounds. The rounds interleave memory + // operations (load/store) with data operations (table lookups, + // etc) to maximize pipeline throughput. + " test %[loops], %[loops] \n\t" + " jz 18f \n\t" + " jmp 36f \n\t" + " \n\t" + ".balign 64 \n\t" + "36: " ROUND_3_INIT() + " " ROUND_3_A( 0) + " " ROUND_3_B( 3) + " " ROUND_3_A( 6) + " " ROUND_3_B( 9) + " " ROUND_3_A(12) + " " ROUND_3_B(15) + " " ROUND_3_A(18) + " " ROUND_3_B(21) + " " ROUND_3_A(24) + " " ROUND_3_B(27) + " " ROUND_3_A_LAST(30) + " add $(12 * 36), %[src] \n\t" + " add $(16 * 36), %[dst] \n\t" + " dec %[loops] \n\t" + " jnz 36b \n\t" + + // Enter an 18x unrolled loop for rounds of 18 or more. + "18: cmp $18, %[rounds] \n\t" + " jl 9f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B(3) + " " ROUND_3_A(6) + " " ROUND_3_B(9) + " " ROUND_3_A_LAST(12) + " sub $18, %[rounds] \n\t" + " add $(12 * 18), %[src] \n\t" + " add $(16 * 18), %[dst] \n\t" + + // Enter a 9x unrolled loop for rounds of 9 or more. + "9: cmp $9, %[rounds] \n\t" + " jl 6f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A(0) + " " ROUND_3_B_LAST(3) + " sub $9, %[rounds] \n\t" + " add $(12 * 9), %[src] \n\t" + " add $(16 * 9), %[dst] \n\t" + + // Enter a 6x unrolled loop for rounds of 6 or more. + "6: cmp $6, %[rounds] \n\t" + " jl 55f \n\t" + " " ROUND_3_INIT() + " " ROUND_3_A_LAST(0) + " sub $6, %[rounds] \n\t" + " add $(12 * 6), %[src] \n\t" + " add $(16 * 6), %[dst] \n\t" + + // Dispatch the remaining rounds 0..5. + "55: cmp $3, %[rounds] \n\t" + " jg 45f \n\t" + " je 3f \n\t" + " cmp $1, %[rounds] \n\t" + " jg 2f \n\t" + " je 1f \n\t" + " jmp 0f \n\t" + + "45: cmp $4, %[rounds] \n\t" + " je 4f \n\t" + + // Block of non-interlaced encoding rounds, which can each + // individually be jumped to. Rounds fall through to the next. + "5: " ROUND() + "4: " ROUND() + "3: " ROUND() + "2: " ROUND() + "1: " ROUND() + "0: \n\t" + + // Outputs (modified). + : [rounds] "+r" (rounds), + [loops] "+r" (loops), + [src] "+r" (*s), + [dst] "+r" (*o), + [a] "=&x" (a), + [b] "=&x" (b), + [c] "=&x" (c), + [d] "=&x" (d), + [e] "=&x" (e), + [f] "=&x" (f) + + // Inputs (not modified). + : [lut0] "x" (lut0), + [lut1] "x" (lut1), + [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)), + [msk1] "x" (_mm_set1_epi32(0x04000040)), + [msk2] "x" (_mm_set1_epi32(0x003F03F0)), + [msk3] "x" (_mm_set1_epi32(0x01000010)), + [n51] "x" (_mm_set1_epi8(51)), + [n25] "x" (_mm_set1_epi8(25)) + ); +} + +#pragma GCC diagnostic pop