#55: Cleanup: round of cleanups with no functional changes

Do a cleanup round before tagging the next version. To prevent accidental regressions, the cleanups in this commit set should introduce no externally visible functional changes.
aklomp · Nov 19, 2019 · 5d06820 · 5d06820
2 parents 2bc8314 + 6315000
commit 5d06820
Show file tree

Hide file tree

Showing 28 changed files with 371 additions and 409 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,5 +1,5 @@
 Copyright (c) 2005-2007, Nick Galbreath
-Copyright (c) 2013-2017, Alfred Klomp
+Copyright (c) 2013-2019, Alfred Klomp
 Copyright (c) 2015-2017, Wojciech Mula
 Copyright (c) 2016-2017, Matthieu Darbois
 All rights reserved.

diff --git a/lib/arch/avx/codec.c b/lib/arch/avx/codec.c
@@ -8,8 +8,6 @@
 #if HAVE_AVX
 #include <immintrin.h>
 
-#include "../sse2/compare_macros.h"
-
 #include "../ssse3/dec_reshuffle.c"
 #include "../ssse3/enc_translate.c"
 #include "../ssse3/enc_reshuffle.c"

diff --git a/lib/arch/avx2/codec.c b/lib/arch/avx2/codec.c
@@ -8,155 +8,9 @@
 #if HAVE_AVX2
 #include <immintrin.h>
 
-#define CMPGT(s,n)	_mm256_cmpgt_epi8((s), _mm256_set1_epi8(n))
-#define CMPEQ(s,n)	_mm256_cmpeq_epi8((s), _mm256_set1_epi8(n))
-#define REPLACE(s,n)	_mm256_and_si256((s), _mm256_set1_epi8(n))
-#define RANGE(s,a,b)	_mm256_andnot_si256(CMPGT((s), (b)), CMPGT((s), (a) - 1))
-
-static inline __m256i enc_reshuffle(const __m256i input) {
-	// translation from SSSE3 into AVX2 of procedure
-	// This one works with shifted (4 bytes) input in order to
-	// be able to work efficiently in the 2 128-bit lanes
-
-	// input, bytes MSB to LSB:
-	// 0 0 0 0 x w v u t s r q p o n m
-	// l k j i h g f e d c b a 0 0 0 0
-
-	const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
-		10, 11,  9, 10,
-		 7,  8,  6,  7,
-		 4,  5,  3,  4,
-		 1,  2,  0,  1,
-
-		14, 15, 13, 14,
-		11, 12, 10, 11,
-		 8,  9,  7,  8,
-		 5,  6,  4,  5));
-	// in, bytes MSB to LSB:
-	// w x v w
-	// t u s t
-	// q r p q
-	// n o m n
-	// k l j k
-	// h i g h
-	// e f d e
-	// b c a b
-
-	const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
-	// bits, upper case are most significant bits, lower case are least significant bits.
-	// 0000wwww XX000000 VVVVVV00 00000000
-	// 0000tttt UU000000 SSSSSS00 00000000
-	// 0000qqqq RR000000 PPPPPP00 00000000
-	// 0000nnnn OO000000 MMMMMM00 00000000
-	// 0000kkkk LL000000 JJJJJJ00 00000000
-	// 0000hhhh II000000 GGGGGG00 00000000
-	// 0000eeee FF000000 DDDDDD00 00000000
-	// 0000bbbb CC000000 AAAAAA00 00000000
-
-	const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
-	// 00000000 00wwwwXX 00000000 00VVVVVV
-	// 00000000 00ttttUU 00000000 00SSSSSS
-	// 00000000 00qqqqRR 00000000 00PPPPPP
-	// 00000000 00nnnnOO 00000000 00MMMMMM
-	// 00000000 00kkkkLL 00000000 00JJJJJJ
-	// 00000000 00hhhhII 00000000 00GGGGGG
-	// 00000000 00eeeeFF 00000000 00DDDDDD
-	// 00000000 00bbbbCC 00000000 00AAAAAA
-
-	const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
-	// 00000000 00xxxxxx 000000vv WWWW0000
-	// 00000000 00uuuuuu 000000ss TTTT0000
-	// 00000000 00rrrrrr 000000pp QQQQ0000
-	// 00000000 00oooooo 000000mm NNNN0000
-	// 00000000 00llllll 000000jj KKKK0000
-	// 00000000 00iiiiii 000000gg HHHH0000
-	// 00000000 00ffffff 000000dd EEEE0000
-	// 00000000 00cccccc 000000aa BBBB0000
-
-	const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
-	// 00xxxxxx 00000000 00vvWWWW 00000000
-	// 00uuuuuu 00000000 00ssTTTT 00000000
-	// 00rrrrrr 00000000 00ppQQQQ 00000000
-	// 00oooooo 00000000 00mmNNNN 00000000
-	// 00llllll 00000000 00jjKKKK 00000000
-	// 00iiiiii 00000000 00ggHHHH 00000000
-	// 00ffffff 00000000 00ddEEEE 00000000
-	// 00cccccc 00000000 00aaBBBB 00000000
-
-	return _mm256_or_si256(t1, t3);
-	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
-	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
-	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
-	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
-	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
-	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
-	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
-	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
-}
-
-static inline __m256i
-enc_translate (const __m256i in)
-{
-	// LUT contains Absolute offset for all ranges:
-	const __m256i lut = _mm256_setr_epi8(65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
-	                                     65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
-	// Translate values 0..63 to the Base64 alphabet. There are five sets:
-	// #  From      To         Abs    Index  Characters
-	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
-	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
-	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
-	// 3  [62]      [43]       -19       12  +
-	// 4  [63]      [47]       -16       13  /
-
-	// Create LUT indices from input:
-	// the index for range #0 is right, others are 1 less than expected:
-	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
-
-	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
-	__m256i mask = CMPGT(in, 25);
-
-	// substract -1, so add 1 to indices for range #[1..4], All indices are now correct:
-	indices = _mm256_sub_epi8(indices, mask);
-
-	// Add offsets to input values:
-	__m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
-
-	return out;
-}
-
-static inline __m256i
-dec_reshuffle (__m256i in)
-{
-	// in, lower lane, bits, upper case are most significant bits, lower case are least significant bits:
-	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
-	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
-	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
-	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
-
-	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
-	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
-	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
-	// 0000eeee FFffffff 0000DDDD DDddEEEE
-	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
-
-	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
-	// 00000000 JJJJJJjj KKKKkkkk LLllllll
-	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
-	// 00000000 DDDDDDdd EEEEeeee FFffffff
-	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
-
-	// Pack bytes together in each lane:
-	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
-		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
-		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
-	// 00000000 00000000 00000000 00000000
-	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
-	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
-	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
-
-	// Pack lanes
-	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
-}
+#include "dec_reshuffle.c"
+#include "enc_translate.c"
+#include "enc_reshuffle.c"
 
 #endif	// HAVE_AVX2
 

diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c
@@ -8,7 +8,7 @@ while (srclen >= 45)
 	// Load string:
 	__m256i str = _mm256_loadu_si256((__m256i *)c);
 
-	// see ssse3/dec_loop.c for an explanation of how the code works.
+	// See ssse3/dec_loop.c for an explanation of how the code works.
 
 	const __m256i lut_lo = _mm256_setr_epi8(
 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
@@ -28,15 +28,15 @@ while (srclen >= 45)
 		0,  16,  19,   4, -65, -65, -71, -71,
 		0,   0,   0,   0,   0,   0,   0,   0);
 
-	const __m256i mask_2F = _mm256_set1_epi8(0x2f);
+	const __m256i mask_2F = _mm256_set1_epi8(0x2F);
 
 	// lookup
-	const __m256i hi_nibbles  = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
-	const __m256i lo_nibbles  = _mm256_and_si256(str, mask_2F);
-	const __m256i hi          = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
-	const __m256i lo          = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
-	const __m256i eq_2F       = _mm256_cmpeq_epi8(str, mask_2F);
-	const __m256i roll        = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+	const __m256i eq_2F      = _mm256_cmpeq_epi8(str, mask_2F);
+	const __m256i roll       = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
 
 	if (!_mm256_testz_si256(lo, hi)) {
 		break;

diff --git a/lib/arch/avx2/dec_reshuffle.c b/lib/arch/avx2/dec_reshuffle.c
@@ -0,0 +1,34 @@
+static inline __m256i
+dec_reshuffle (const __m256i in)
+{
+	// Packs the 6-bit value of every input byte into contiguous bytes.
+	// Input, lower lane, as bits (upper case = MSBs, lower case = LSBs):
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
+	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+	// 0000eeee FFffffff 0000DDDD DDddEEEE
+	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+	// 00000000 JJJJJJjj KKKKkkkk LLllllll
+	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+	// 00000000 DDDDDDdd EEEEeeee FFffffff
+	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+	// Pack bytes together in each lane: take bytes 2, 1, 0 of each dword:
+	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
+	// 00000000 00000000 00000000 00000000
+	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+	// Pack lanes: move dwords 0-2 and 4-6 to the bottom of the vector:
+	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+}
diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c
@@ -3,25 +3,29 @@
 // full 32-byte read without segfaulting:
 
 if (srclen >= 32) {
-	const uint8_t* const o_orig = o;
+	const uint8_t *const o_orig = o;
 
-	// first load is done at c-0 not to get a segfault
+	// First load is done at c-0 not to get a segfault:
 	__m256i inputvector = _mm256_loadu_si256((__m256i *)(c - 0));
 
-	// shift by 4 bytes, as required by enc_reshuffle
+	// Shift by 4 bytes, as required by enc_reshuffle:
 	inputvector = _mm256_permutevar8x32_epi32(inputvector, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
 
 	for (;;) {
+
+		// Reshuffle, translate, store:
 		inputvector = enc_reshuffle(inputvector);
 		inputvector = enc_translate(inputvector);
 		_mm256_storeu_si256((__m256i *)o, inputvector);
+
 		c += 24;
 		o += 32;
 		srclen -= 24;
-		if(srclen < 28) {
+		if (srclen < 28) {
 			break;
 		}
-		// Load at c-4, as required by enc_reshuffle
+
+		// Load at c-4, as required by enc_reshuffle:
 		inputvector = _mm256_loadu_si256((__m256i *)(c - 4));
 	}
 	outl += (size_t)(o - o_orig);

diff --git a/lib/arch/avx2/enc_reshuffle.c b/lib/arch/avx2/enc_reshuffle.c
@@ -0,0 +1,83 @@
+static inline __m256i
+enc_reshuffle (const __m256i input)
+{
+	// Spreads each group of three input bytes over four output bytes of six
+	// bits each. AVX2 translation of the SSSE3 algorithm; expects the input
+	// shifted by 4 bytes so that both 128-bit lanes can work efficiently.
+
+	// Input, bytes MSB to LSB:
+	// 0 0 0 0 x w v u t s r q p o n m
+	// l k j i h g f e d c b a 0 0 0 0
+
+	const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
+		10, 11,  9, 10,
+		 7,  8,  6,  7,
+		 4,  5,  3,  4,
+		 1,  2,  0,  1,
+
+		14, 15, 13, 14,
+		11, 12, 10, 11,
+		 8,  9,  7,  8,
+		 5,  6,  4,  5));
+	// in, bytes MSB to LSB (four bytes per row):
+	// w x v w
+	// t u s t
+	// q r p q
+	// n o m n
+	// k l j k
+	// h i g h
+	// e f d e
+	// b c a b
+
+	const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
+	// t0, as bits. Upper case are most significant bits, lower case are
+	// least significant bits:
+	// 0000wwww XX000000 VVVVVV00 00000000
+	// 0000tttt UU000000 SSSSSS00 00000000
+	// 0000qqqq RR000000 PPPPPP00 00000000
+	// 0000nnnn OO000000 MMMMMM00 00000000
+	// 0000kkkk LL000000 JJJJJJ00 00000000
+	// 0000hhhh II000000 GGGGGG00 00000000
+	// 0000eeee FF000000 DDDDDD00 00000000
+	// 0000bbbb CC000000 AAAAAA00 00000000
+
+	const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+	// 00000000 00wwwwXX 00000000 00VVVVVV
+	// 00000000 00ttttUU 00000000 00SSSSSS
+	// 00000000 00qqqqRR 00000000 00PPPPPP
+	// 00000000 00nnnnOO 00000000 00MMMMMM
+	// 00000000 00kkkkLL 00000000 00JJJJJJ
+	// 00000000 00hhhhII 00000000 00GGGGGG
+	// 00000000 00eeeeFF 00000000 00DDDDDD
+	// 00000000 00bbbbCC 00000000 00AAAAAA
+
+	const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
+	// 00000000 00xxxxxx 000000vv WWWW0000
+	// 00000000 00uuuuuu 000000ss TTTT0000
+	// 00000000 00rrrrrr 000000pp QQQQ0000
+	// 00000000 00oooooo 000000mm NNNN0000
+	// 00000000 00llllll 000000jj KKKK0000
+	// 00000000 00iiiiii 000000gg HHHH0000
+	// 00000000 00ffffff 000000dd EEEE0000
+	// 00000000 00cccccc 000000aa BBBB0000
+
+	const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+	// 00xxxxxx 00000000 00vvWWWW 00000000
+	// 00uuuuuu 00000000 00ssTTTT 00000000
+	// 00rrrrrr 00000000 00ppQQQQ 00000000
+	// 00oooooo 00000000 00mmNNNN 00000000
+	// 00llllll 00000000 00jjKKKK 00000000
+	// 00iiiiii 00000000 00ggHHHH 00000000
+	// 00ffffff 00000000 00ddEEEE 00000000
+	// 00cccccc 00000000 00aaBBBB 00000000
+
+	return _mm256_or_si256(t1, t3);
+	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
+	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
+	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
+	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+}
diff --git a/lib/arch/avx2/enc_translate.c b/lib/arch/avx2/enc_translate.c
@@ -0,0 +1,30 @@
+static inline __m256i
+enc_translate (const __m256i in)
+{
+	// Per-range offsets to add to the input, one copy per 128-bit lane:
+	const __m256i lut = _mm256_setr_epi8(
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+	// Translate values 0..63 to the Base64 alphabet. There are five sets:
+	// #  From      To         Abs    Index  Characters
+	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
+	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
+	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+	// 3  [62]      [43]       -19       12  +
+	// 4  [63]      [47]       -16       13  /
+
+	// Create LUT indices with a saturating subtraction. The index for range
+	// #0 is right, the others are 1 less than expected:
+	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
+
+	// The mask is 0xFF (-1) for ranges #[1..4] and 0x00 for range #0:
+	const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
+
+	// Subtract the mask: subtracting -1 adds 1 to the indices for ranges
+	// #[1..4], so all indices are now correct:
+	indices = _mm256_sub_epi8(indices, mask);
+
+	// Add the looked-up offsets to the input values:
+	return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
+}