Skip to content

Commit

Permalink
#102: AVX512: add codec
Browse files Browse the repository at this point in the history
Add an AVX512 codec, based on the code in Wojciech Muła's `base64simd'
project. Only the encoder is currently implemented with AVX512
instructions, because it is relatively simple. The decoder is stubbed by
using the AVX2 decoder. A native AVX512 decoder is quite complex, and
might be integrated at some later stage.

Tested with the Intel SDE instruction set emulator running in Future
mode.

Resolves #102.
  • Loading branch information
aklomp committed Oct 20, 2022
2 parents 2e8ad2a + d98d195 commit 6b1a8b8
Show file tree
Hide file tree
Showing 15 changed files with 234 additions and 13 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ jobs:
-DBASE64_BUILD_TESTS=ON
${{ runner.os != 'Windows' && '-DCMAKE_BUILD_TYPE=Release' || '' }}
${{ runner.os == 'macOS' && '-DBASE64_WITH_AVX2=OFF' || '' }}
-DBASE64_WITH_AVX512=OFF
- name: CMake Build
run: cmake --build out --config Release --verbose
- name: CTest
Expand Down Expand Up @@ -76,7 +77,13 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
- name: CMake Configure
run: cmake -B out -Werror=dev -DBASE64_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release
run: >
cmake
-B out
-Werror=dev
-DBASE64_BUILD_TESTS=ON
-DBASE64_WITH_AVX512=OFF
-DCMAKE_BUILD_TYPE=Release
- name: CMake Build
run: cmake --build out --config Release --verbose
- name: CTest
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
add_feature_info(AVX2 BASE64_WITH_AVX512 "add AVX512 codepath")

cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
Expand Down Expand Up @@ -118,6 +120,7 @@ add_library(base64
lib/arch/sse42/codec.c
lib/arch/avx/codec.c
lib/arch/avx2/codec.c
lib/arch/avx512/codec.c

lib/arch/neon32/codec.c
lib/arch/neon64/codec.c
Expand Down Expand Up @@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
configure_codec(SSE42 __SSSE4_2__)
configure_codec(AVX)
configure_codec(AVX2)
configure_codec(AVX512)

elseif (_TARGET_ARCH STREQUAL "arm")
set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Copyright (c) 2005-2007, Nick Galbreath
Copyright (c) 2015-2017, Wojciech Mula
Copyright (c) 2015-2018, Wojciech Muła
Copyright (c) 2016-2017, Matthieu Darbois
Copyright (c) 2013-2022, Alfred Klomp
All rights reserved.
Expand Down
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
OBJCOPY ?= objcopy

OBJS = \
lib/arch/avx512/codec.o \
lib/arch/avx2/codec.o \
lib/arch/generic/codec.o \
lib/arch/neon32/codec.o \
Expand All @@ -16,6 +17,7 @@ OBJS = \
lib/codec_choose.o \
lib/tables/tables.o

HAVE_AVX512 = 0
HAVE_AVX2 = 0
HAVE_NEON32 = 0
HAVE_NEON64 = 0
Expand All @@ -26,6 +28,9 @@ HAVE_AVX = 0

# The user should supply compiler flags for the codecs they want to build.
# Check which codecs we're going to include:
ifdef AVX512_CFLAGS
HAVE_AVX512 = 1
endif
ifdef AVX2_CFLAGS
HAVE_AVX2 = 1
endif
Expand Down Expand Up @@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@

lib/config.h:
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" > $@
@echo "#define HAVE_AVX512 $(HAVE_AVX512)" > $@
@echo "#define HAVE_AVX2 $(HAVE_AVX2)" >> $@
@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
@echo "#define HAVE_SSSE3 $(HAVE_SSSE3)" >> $@
Expand All @@ -75,6 +81,7 @@ lib/config.h:
$(OBJS): lib/config.h
$(OBJS): CFLAGS += -Ilib

lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
lib/arch/avx2/codec.o: CFLAGS += $(AVX2_CFLAGS)
lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)
Expand Down
23 changes: 20 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)

This is an implementation of a base64 stream encoding/decoding library in C99
with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
[OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
to encode/decode simple length-delimited strings. This library aims to be:

Expand All @@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
time, which gives a speedup of four or more times compared to the "plain"
bytewise codec.

AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI
instructions. Decoding part reused AVX2 implementations. For CPUs later than
Cannonlake (manufactured in 2018) supports these instructions.

NEON support is hardcoded to on or off at compile time, because portable
runtime feature detection is unavailable on ARM.

Expand Down Expand Up @@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
[articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).

The AVX512 encoder is based on code from Wojciech Muła's
[base64simd](https://github.com/WojciechMula/base64simd) library.

The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).

## Building
Expand All @@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
make lib/libbase64.o
```

Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`,
`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
A typical build invocation on x86 looks like this:

```sh
Expand All @@ -93,6 +100,15 @@ Example:
AVX2_CFLAGS=-mavx2 make
```

### AVX512

To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
Example:

```sh
AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
```

The codec will only be used if runtime feature detection shows that the target machine supports AVX2.

### SSSE3
Expand Down Expand Up @@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
The following constants can be used:

- `BASE64_FORCE_AVX2`
- `BASE64_FORCE_AVX512`
- `BASE64_FORCE_NEON32`
- `BASE64_FORCE_NEON64`
- `BASE64_FORCE_PLAIN`
Expand Down
2 changes: 2 additions & 0 deletions cmake/Modules/TargetSIMDInstructionSet.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
set(COMPILE_FLAGS_SSE42 "-msse4.2")
set(COMPILE_FLAGS_AVX "-mavx")
set(COMPILE_FLAGS_AVX2 "-mavx2")
set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")

#arm
set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
Expand All @@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
set(COMPILE_FLAGS_SSE42 " ")
set(COMPILE_FLAGS_AVX "/arch:AVX")
set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
endif()
endmacro(define_SIMD_compile_flags)
3 changes: 3 additions & 0 deletions cmake/config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#cmakedefine01 BASE64_WITH_AVX2
#define HAVE_AVX2 BASE64_WITH_AVX2

#cmakedefine01 BASE64_WITH_AVX512
#define HAVE_AVX512 BASE64_WITH_AVX512

#cmakedefine01 BASE64_WITH_NEON32
#define HAVE_NEON32 BASE64_WITH_NEON32

Expand Down
1 change: 1 addition & 0 deletions include/libbase64.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ extern "C" {
#define BASE64_FORCE_SSE41 (1 << 5)
#define BASE64_FORCE_SSE42 (1 << 6)
#define BASE64_FORCE_AVX (1 << 7)
#define BASE64_FORCE_AVX512 (1 << 8)

struct base64_state {
int eof;
Expand Down
42 changes: 42 additions & 0 deletions lib/arch/avx512/codec.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"

#if HAVE_AVX512
#include <immintrin.h>

#include "../avx2/dec_reshuffle.c"
#include "../avx2/dec_loop.c"
#include "enc_reshuffle_translate.c"
#include "enc_loop.c"

#endif // HAVE_AVX512

BASE64_ENC_FUNCTION(avx512)
{
#if HAVE_AVX512
#include "../generic/enc_head.c"
enc_loop_avx512(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
BASE64_ENC_STUB
#endif
}

// Reuse AVX2 decoding. Not supporting AVX512 at present
BASE64_DEC_FUNCTION(avx512)
{
#if HAVE_AVX512
#include "../generic/dec_head.c"
dec_loop_avx2(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
BASE64_DEC_STUB
#endif
}
61 changes: 61 additions & 0 deletions lib/arch/avx512/enc_loop.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
static inline void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
// Load input.
__m512i src = _mm512_loadu_si512((__m512i *) *s);

// Reshuffle, translate, store.
src = enc_reshuffle_translate(src);
_mm512_storeu_si512((__m512i *) *o, src);

*s += 48;
*o += 64;
}

static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
if (*slen < 64) {
return;
}

// Process blocks of 48 bytes at a time. Because blocks are loaded 64
// bytes at a time, ensure that there will be at least 24 remaining
// bytes after the last round, so that the final read will not pass
// beyond the bounds of the input buffer.
size_t rounds = (*slen - 24) / 48;

*slen -= rounds * 48; // 48 bytes consumed per round
*olen += rounds * 64; // 64 bytes produced per round

while (rounds > 0) {
if (rounds >= 8) {
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
rounds -= 8;
continue;
}
if (rounds >= 4) {
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
rounds -= 4;
continue;
}
if (rounds >= 2) {
enc_loop_avx512_inner(s, o);
enc_loop_avx512_inner(s, o);
rounds -= 2;
continue;
}
enc_loop_avx512_inner(s, o);
break;
}
}
50 changes: 50 additions & 0 deletions lib/arch/avx512/enc_reshuffle_translate.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.

static inline __m512i
enc_reshuffle_translate (const __m512i input)
{
// 32-bit input
// [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0|
// b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
// output order [1, 2, 0, 1]
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]

const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
0x04050304,
0x07080607,
0x0a0b090a,
0x0d0e0c0d,
0x10110f10,
0x13141213,
0x16171516,
0x191a1819,
0x1c1d1b1c,
0x1f201e1f,
0x22232122,
0x25262425,
0x28292728,
0x2b2c2a2b,
0x2e2f2d2e);

// Reorder bytes
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);

// After multishift a single 32-bit lane has following layout
// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
// a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])

// 48, 54, 36, 42, 16, 22, 4, 10
const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);

// Translate immediatedly after reshuffled.
const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);

// Translation 6-bit values to ASCII.
return _mm512_permutexvar_epi8(shuffled_in, lookup);
}
Loading

0 comments on commit 6b1a8b8

Please sign in to comment.