#102: AVX512: add codec

Add an AVX512 codec, based on the code in Wojciech Muła's `base64simd' project. Only the encoder is currently implemented with AVX512 instructions, because it is relatively simple. The decoder is stubbed by using the AVX2 decoder. A native AVX512 decoder is quite complex, and might be integrated at some later stage. Tested with the Intel SDE instruction set emulator running in Future mode. Resolves #102.
aklomp · Oct 20, 2022 · 6b1a8b8 · 6b1a8b8
2 parents 2e8ad2a + d98d195
commit 6b1a8b8
Show file tree

Hide file tree

Showing 15 changed files with 234 additions and 13 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -45,6 +45,7 @@ jobs:
           -DBASE64_BUILD_TESTS=ON
           ${{ runner.os != 'Windows' && '-DCMAKE_BUILD_TYPE=Release' || '' }}
           ${{ runner.os == 'macOS' && '-DBASE64_WITH_AVX2=OFF' || '' }}
+          -DBASE64_WITH_AVX512=OFF
       - name: CMake Build
         run: cmake --build out --config Release --verbose
       - name: CTest
@@ -76,7 +77,13 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
       - name: CMake Configure
-        run: cmake -B out -Werror=dev -DBASE64_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release
+        run: >
+          cmake
+          -B out
+          -Werror=dev
+          -DBASE64_BUILD_TESTS=ON
+          -DBASE64_WITH_AVX512=OFF
+          -DCMAKE_BUILD_TYPE=Release
       - name: CMake Build
         run: cmake --build out --config Release --verbose
       - name: CTest

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -62,6 +62,8 @@ cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
 cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
 add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
+cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
+add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
 
 cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
 add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
@@ -118,6 +120,7 @@ add_library(base64
     lib/arch/sse42/codec.c
     lib/arch/avx/codec.c
     lib/arch/avx2/codec.c
+    lib/arch/avx512/codec.c
 
     lib/arch/neon32/codec.c
     lib/arch/neon64/codec.c
@@ -206,6 +209,7 @@ if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
     configure_codec(SSE42 __SSSE4_2__)
     configure_codec(AVX)
     configure_codec(AVX2)
+    configure_codec(AVX512)
 
 elseif (_TARGET_ARCH STREQUAL "arm")
     set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")

diff --git a/LICENSE b/LICENSE
@@ -1,5 +1,5 @@
 Copyright (c) 2005-2007, Nick Galbreath
-Copyright (c) 2015-2017, Wojciech Mula
+Copyright (c) 2015-2018, Wojciech Muła
 Copyright (c) 2016-2017, Matthieu Darbois
 Copyright (c) 2013-2022, Alfred Klomp
 All rights reserved.

diff --git a/Makefile b/Makefile
@@ -4,6 +4,7 @@ CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic
 OBJCOPY ?= objcopy
 
 OBJS = \
+  lib/arch/avx512/codec.o \
   lib/arch/avx2/codec.o \
   lib/arch/generic/codec.o \
   lib/arch/neon32/codec.o \
@@ -16,6 +17,7 @@ OBJS = \
   lib/codec_choose.o \
   lib/tables/tables.o
 
+HAVE_AVX512 = 0
 HAVE_AVX2   = 0
 HAVE_NEON32 = 0
 HAVE_NEON64 = 0
@@ -26,6 +28,9 @@ HAVE_AVX    = 0
 
 # The user should supply compiler flags for the codecs they want to build.
 # Check which codecs we're going to include:
+ifdef AVX512_CFLAGS
+  HAVE_AVX512 = 1
+endif
 ifdef AVX2_CFLAGS
   HAVE_AVX2 = 1
 endif
@@ -64,7 +69,8 @@ lib/libbase64.o: $(OBJS)
 	$(OBJCOPY) --keep-global-symbols=lib/exports.txt $@
 
 lib/config.h:
-	@echo "#define HAVE_AVX2   $(HAVE_AVX2)"    > $@
+	@echo "#define HAVE_AVX512 $(HAVE_AVX512)"  > $@
+	@echo "#define HAVE_AVX2   $(HAVE_AVX2)"   >> $@
 	@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
 	@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
 	@echo "#define HAVE_SSSE3  $(HAVE_SSSE3)"  >> $@
@@ -75,6 +81,7 @@ lib/config.h:
 $(OBJS): lib/config.h
 $(OBJS): CFLAGS += -Ilib
 
+lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
 lib/arch/avx2/codec.o:   CFLAGS += $(AVX2_CFLAGS)
 lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
 lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)

diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://github.com/aklomp/base64/actions/workflows/test.yml/badge.svg)](https://github.com/aklomp/base64/actions/workflows/test.yml)
 
 This is an implementation of a base64 stream encoding/decoding library in C99
-with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
+with SIMD (AVX2, AVX512, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
 [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
 to encode/decode simple length-delimited strings. This library aims to be:
 
@@ -19,6 +19,10 @@ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
 time, which gives a speedup of four or more times compared to the "plain"
 bytewise codec.
 
+AVX512 support is only for encoding at present, utilizing the AVX512 VL and VBMI
+instructions. Decoding reuses the AVX2 implementation. Cannonlake (manufactured
+in 2018) and later CPUs support these instructions.
+
 NEON support is hardcoded to on or off at compile time, because portable
 runtime feature detection is unavailable on ARM.
 
@@ -59,6 +63,9 @@ optimizations described by Wojciech Muła in a
 [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
 His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
 
+The AVX512 encoder is based on code from Wojciech Muła's
+[base64simd](https://github.com/WojciechMula/base64simd) library.
+
 The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
 
 ## Building
@@ -76,8 +83,8 @@ To compile just the "plain" library without SIMD codecs, type:
 make lib/libbase64.o
 ```
 
-Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
-`SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
+Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `AVX512_CFLAGS`,
+`NEON32_CFLAGS`, `NEON64_CFLAGS`, `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
 A typical build invocation on x86 looks like this:
 
 ```sh
@@ -93,6 +100,15 @@ Example:
 AVX2_CFLAGS=-mavx2 make
 ```
 
+### AVX512
+
+To build and include the AVX512 codec, set the `AVX512_CFLAGS` environment variable to a value that will turn on AVX512 support in your compiler, typically `-mavx512vl -mavx512vbmi`.
+Example:
+
+```sh
+AVX512_CFLAGS="-mavx512vl -mavx512vbmi" make
+```
+
 The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
 
 ### SSSE3
@@ -208,6 +224,7 @@ Mainly there for testing purposes, this is also useful on ARM where the only way
 The following constants can be used:
 
 - `BASE64_FORCE_AVX2`
+- `BASE64_FORCE_AVX512`
 - `BASE64_FORCE_NEON32`
 - `BASE64_FORCE_NEON64`
 - `BASE64_FORCE_PLAIN`

diff --git a/cmake/Modules/TargetSIMDInstructionSet.cmake b/cmake/Modules/TargetSIMDInstructionSet.cmake
@@ -21,6 +21,7 @@ macro(define_SIMD_compile_flags)
         set(COMPILE_FLAGS_SSE42 "-msse4.2")
         set(COMPILE_FLAGS_AVX "-mavx")
         set(COMPILE_FLAGS_AVX2 "-mavx2")
+        set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")
 
         #arm
         set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
@@ -30,5 +31,6 @@ macro(define_SIMD_compile_flags)
         set(COMPILE_FLAGS_SSE42 " ")
         set(COMPILE_FLAGS_AVX "/arch:AVX")
         set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
+        set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
     endif()
 endmacro(define_SIMD_compile_flags)
diff --git a/cmake/config.h.in b/cmake/config.h.in
@@ -16,6 +16,9 @@
 #cmakedefine01 BASE64_WITH_AVX2
 #define HAVE_AVX2 BASE64_WITH_AVX2
 
+#cmakedefine01 BASE64_WITH_AVX512
+#define HAVE_AVX512 BASE64_WITH_AVX512
+
 #cmakedefine01 BASE64_WITH_NEON32
 #define HAVE_NEON32 BASE64_WITH_NEON32
 

diff --git a/include/libbase64.h b/include/libbase64.h
@@ -53,6 +53,7 @@ extern "C" {
 #define BASE64_FORCE_SSE41	(1 << 5)
 #define BASE64_FORCE_SSE42	(1 << 6)
 #define BASE64_FORCE_AVX	(1 << 7)
+#define BASE64_FORCE_AVX512	(1 << 8)
 
 struct base64_state {
 	int eof;

diff --git a/lib/arch/avx512/codec.c b/lib/arch/avx512/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX512
+#include <immintrin.h>
+
+#include "../avx2/dec_reshuffle.c"
+#include "../avx2/dec_loop.c"
+#include "enc_reshuffle_translate.c"
+#include "enc_loop.c"
+
+#endif	// HAVE_AVX512
+
+BASE64_ENC_FUNCTION(avx512)	// AVX512 encoder entry point
+{
+#if HAVE_AVX512
+	#include "../generic/enc_head.c"
+	enc_loop_avx512(&s, &slen, &o, &olen);	// vectorized rounds: 48 bytes in, 64 bytes out
+	#include "../generic/enc_tail.c"
+#else	// built without AVX512 support
+	BASE64_ENC_STUB
+#endif	// HAVE_AVX512
+}
+
+// AVX512 decoder entry point; it reuses the AVX2 decoding loop, as no native AVX512 decoder exists yet.
+BASE64_DEC_FUNCTION(avx512)
+{
+#if HAVE_AVX512
+	#include "../generic/dec_head.c"
+	dec_loop_avx2(&s, &slen, &o, &olen);	// from ../avx2/dec_loop.c, included above
+	#include "../generic/dec_tail.c"
+#else	// built without AVX512 support
+	BASE64_DEC_STUB
+#endif	// HAVE_AVX512
+}
diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c
@@ -0,0 +1,61 @@
+static inline void
+enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
+{
+	// One encoder round: load 64 input bytes, of which the first 48 are consumed.
+	__m512i src = _mm512_loadu_si512((const __m512i *) *s);
+
+	// Reshuffle and translate into 64 output bytes, then store them.
+	src = enc_reshuffle_translate(src);
+	_mm512_storeu_si512((__m512i *) *o, src);
+
+	*s += 48;	// advance the input by the 48 bytes consumed
+	*o += 64;	// advance the output by the 64 bytes produced
+}
+
+static inline void
+enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// A round loads 64 bytes, so shorter inputs are left untouched.
+	if (*slen < 64) {
+		return;
+	}
+
+	// Every round consumes 48 input bytes but reads 64 of them. Hold back
+	// at least 24 bytes of input beyond the last round, so that the final
+	// 64-byte load cannot run past the end of the input buffer.
+	size_t todo = (*slen - 24) / 48;
+
+	// Account for all rounds up front; the pointers advance per round.
+	*slen -= todo * 48;   // 48 bytes consumed per round
+	*olen += todo * 64;   // 64 bytes produced per round
+
+	// Run groups of eight rounds for as long as possible...
+	for (; todo >= 8; todo -= 8) {
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+	}
+
+	// ...then at most one group of four, one pair and one single round.
+	if (todo >= 4) {
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		todo -= 4;
+	}
+
+	if (todo >= 2) {
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		todo -= 2;
+	}
+	if (todo >= 1) {
+		enc_loop_avx512_inner(s, o);
+	}
+}
diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c
@@ -0,0 +1,50 @@
+// AVX512 algorithm is based on permutevar and multishift. The code is based on
+// https://github.com/WojciechMula/base64simd which is under BSD-2 license.
+
+static inline __m512i
+enc_reshuffle_translate (const __m512i input)
+{
+	// Encodes input bytes 0..47 into 64 output bytes. Three input bytes as a 32-bit value:
+	// [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
+	// output byte order [1, 2, 0, 1] (most significant byte first)
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
+
+	const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
+	                                                0x04050304,
+	                                                0x07080607,
+	                                                0x0a0b090a,
+	                                                0x0d0e0c0d,
+	                                                0x10110f10,
+	                                                0x13141213,
+	                                                0x16171516,
+	                                                0x191a1819,
+	                                                0x1c1d1b1c,
+	                                                0x1f201e1f,
+	                                                0x22232122,
+	                                                0x25262425,
+	                                                0x28292728,
+	                                                0x2b2c2a2b,
+	                                                0x2e2f2d2e);
+
+	// Reorder bytes so that every 32-bit lane holds one three-byte group:
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
+	const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
+
+	// After multishift a single 32-bit lane has following layout
+	// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
+	//  a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
+	// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
+
+	// Bit offsets per output byte of a 64-bit word, high to low: 48, 54, 36, 42, 16, 22, 4, 10
+	const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
+	__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
+
+	// Translate immediately after reshuffling.
+	const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
+
+	// Translate 6-bit values to ASCII by table lookup (the two stray upper bits of each index are presumably ignored).
+	return _mm512_permutexvar_epi8(shuffled_in, lookup);
+}