Skip to content

Commit

Permalink
Reapply: Implement optimized 64-bit varint decoding functions.
Browse files Browse the repository at this point in the history
Summary:
This fixes the issue introduced in the previous iteration of this diff
(_mm_extract_... functions live in intrin.h in MSVC, not immintrin.h).
Otherwise, it's unchanged.

Original commit message:
Implements optimized decoding of 64-bit varints using the BMI2 pext instruction
(when available) or a shift/shuffle/mask SSE strategy when not. Microbenchmarks
show that the new versions are substantially faster for small (1-2 byte
encodings) or large (6-10 byte encodings) varints, while being similar for the
middle range. This is the sort of performance profile we want given the
emprical distributions of varints; they tend to be heavily biased towards the
very small or very large parts of their range.

I think the microbenchmarks likely under-sell the improvement, though. The
current (unrolled) implementation relies on lots of branching, and benefits
from a microbenchmark setup where it can hog the entire branch predictor. The
new variants only branch on the "is small" and "is overflow" checks, and are
otherwise straightline code.

Reviewed By: vitaut

Differential Revision: D38510907

fbshipit-source-id: 451cbd2da6634fd826f9721444a2dd4141c2afac
  • Loading branch information
davidtgoldblatt authored and facebook-github-bot committed Aug 18, 2022
1 parent 77649c5 commit 110807c
Show file tree
Hide file tree
Showing 3 changed files with 442 additions and 81 deletions.
185 changes: 180 additions & 5 deletions thrift/lib/cpp/util/VarintUtils-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,31 @@
* limitations under the License.
*/

#ifdef __BMI2__
#include <immintrin.h>
#endif

#include <array>

#include <folly/Portability.h>
#include <folly/Utility.h>
#include <folly/io/Cursor.h>
#include <folly/lang/Bits.h>
#include <folly/portability/Builtins.h>

// We need 64-bit for __mm_extra_epi64 and _pext_u64. MSVC support seems to be
// difficult to detect, so disable the BMI2 and SIMD versions entirely there.
#if defined(__BMI2__) && FOLLY_X64 && !defined(_MSC_VER)
#define THRIFT_UTIL_VARINTUTILS_BMI2_DECODER 1
#else
#define THRIFT_UTIL_VARINTUTILS_BMI2_DECODER 0
#endif

#if FOLLY_SSE_PREREQ(4, 1) && FOLLY_X64 && !defined(_MSC_VER)
#define THRIFT_UTIL_VARINTUTILS_SIMD_DECODER 1
#else
#define THRIFT_UTIL_VARINTUTILS_SIMD_DECODER 0
#endif

#if THRIFT_UTIL_VARINTUTILS_BMI2_DECODER || THRIFT_UTIL_VARINTUTILS_SIMD_DECODER
#include <immintrin.h>
#endif

namespace apache {
namespace thrift {
Expand Down Expand Up @@ -60,7 +76,8 @@ void readVarintSlow(CursorT& c, T& value) {
[[noreturn]] void throwInvalidVarint();

template <class T, class CursorT>
void readVarintMediumSlow(CursorT& c, T& value, const uint8_t* p, size_t len) {
void readVarintMediumSlowUnrolled(
CursorT& c, T& value, const uint8_t* p, size_t len) {
enum { maxSize = (8 * sizeof(T) + 6) / 7 };

// check that the available data is more than the longest possible varint or
Expand Down Expand Up @@ -94,6 +111,164 @@ void readVarintMediumSlow(CursorT& c, T& value, const uint8_t* p, size_t len) {
}
}

// The fast-path of the optimized medium-slow paths. Decodes the first two bytes
// of the varint (into result).
// Returns true if parsing is finished after those two bytes, and false
// otherwise. Either way, the data bits of the first two bytes are stored into
// the low bits of result.
FOLLY_ALWAYS_INLINE bool tryReadFirstTwoBytesU64(
uint64_t& result, const uint8_t* p, size_t len) {
// This is only called from mediumSlow pathways after we've done size
// validation. In particular, we should know that it's not a
// single-byte-encoded varint, and that there's space in the buffer for the
// maximum size an encoded varint can be.
DCHECK((p[0] & 0x80) != 0);
DCHECK(len >= 2);
uint64_t hi = p[1];
if ((hi & 0x80) == 0) {
result = (hi << 7) | (p[0] & 0x7f);
return true;
}
result = ((hi & 0x7F) << 7) | (p[0] & 0x7f);
return false;
}

#if THRIFT_UTIL_VARINTUTILS_SIMD_DECODER
template <class CursorT>
FOLLY_ALWAYS_INLINE void readVarintMediumSlowU64SIMD(
CursorT& c, uint64_t& value, const uint8_t* p, size_t len) {
enum { maxSize = (8 * sizeof(uint64_t) + 6) / 7 };
if (LIKELY(len >= maxSize)) {
uint64_t result;
if (tryReadFirstTwoBytesU64(result, p, len)) {
c.skipNoAdvance(2);
value = result;
return;
}
p += 2;
// This has alternating data bits and continuation bits, then, after the
// first 0 continuation bit, junk (well, message data that we're not
// interested in for varint decoding purposes).
uint64_t bits = folly::loadUnaligned<uint64_t>(p);
// This has 1s in all bits in the encoded int except the last continuation
// bit, then, after that, junk.
uint64_t bitsOnlyContinations = bits | 0x7F7F7F7F7F7F7F7FULL;
// This has all zeros in all continuation and data bits in the int except
// for the last continuation bit, which is a 1. After that, junk.
uint64_t lastContinuationBitSet = bitsOnlyContinations + 1;
if (lastContinuationBitSet == 0) {
// Last continuation bit was the last bit in the uint64_t, and it was a 1.
throwInvalidVarint();
}
size_t intBytes = __builtin_ctzll(lastContinuationBitSet) / 8 + 1;
c.skipNoAdvance(2 + intBytes);
// "Extract lowest 1 bit" idiom. Use `~lastContinuationBitSet + 1` instead
// of `-lastContinuationBitSet` to avoid an MSVC warning.
uint64_t solelyLastContinuationBitSet =
lastContinuationBitSet & (~lastContinuationBitSet + 1);
// Mask out all the junk bits.
uint64_t dataAndContinuationBits =
bits & (solelyLastContinuationBitSet - 1);
uint64_t dataBits64 = dataAndContinuationBits & 0x7F7F7F7F7F7F7F7FULL;
// clang-format off
// dataBits will be:
// 15 14 13 12 11 10 9 8
// [00000000][00000000][00000000][00000000][00000000][00000000][00000000][00000000]
// 7 6 5 4 3 2 1 0
// [0AAAAAAa][0BBBBBBB][0CCCCCCC][0Ddddddd][0EEeeeee][0FFFffff][0GGGGggg][0HHHHHhh]
__m128i dataBits = _mm_set_epi64x(0, dataBits64);
// alternatingZeros will be:
// 15 14 13 12 11 10 9 8
// [00000000][0AAAAAAa][00000000][0BBBBBBB][00000000][0CCCCCCC][00000000][0Ddddddd]
// 7 6 5 4 3 2 1 0
// [00000000][0EEeeeee][00000000][0FFFffff][00000000][0GGGGggg][00000000][0HHHHHhh]
__m128i alternatingZeros = _mm_cvtepu8_epi16(dataBits);
// shifted will be:
// 15 14 13 12 11 10 9 8
// [00AAAAAA][a0000000][00000000][0BBBBBBB][00000000][CCCCCCC0][0000000D][dddddd00]
// 7 6 5 4 3 2 1 0
// [000000EE][eeeee000][00000FFF][ffff0000][0000GGGG][ggg00000][000HHHHH][hh000000]
// (We implement the shift as a multiply just because of ISA limitations).
__m128i shifted = _mm_mullo_epi16(alternatingZeros, _mm_set_epi16(
1 << 7, 1 << 0, 1 << 1, 1 << 2,
1 << 3, 1 << 4, 1 << 5, 1 << 6));
// shuffled will be:
// 15 14 13 12 11 10 9 8
// [0BBBBBBB][CCCCCCC0][dddddd00][eeeee000][ffff0000][ggg00000][hh000000][00000000]
// 7 6 5 4 3 2 1 0
// [a0000000][0000000D][000000EE][00000FFF][0000GGGG][000HHHHH][00000000][00000000]
__m128i shuffled = _mm_shuffle_epi8(shifted, _mm_set_epi8(
12, 10, 8, 6, 4, 2, 0, -1,
14, 9, 7, 5, 3, 1, -1, -1));
// clang-format on
uint64_t highData1 = _mm_extract_epi64(shuffled, 0);
uint64_t highData2 = _mm_extract_epi64(shuffled, 1);
result = result + highData1 + highData2;
value = result;
} else {
readVarintSlow(c, value);
}
}
#endif // THRIFT_UTIL_VARINTUTILS_SIMD_DECODER

#if THRIFT_UTIL_VARINTUTILS_BMI2_DECODER
template <class CursorT>
void readVarintMediumSlowU64BMI2(
CursorT& c, uint64_t& value, const uint8_t* p, size_t len) {
enum { maxSize = (8 * sizeof(uint64_t) + 6) / 7 };
if (LIKELY(len >= maxSize)) {
uint64_t result;
if (tryReadFirstTwoBytesU64(result, p, len)) {
c.skipNoAdvance(2);
value = result;
return;
}
p += 2;
// This has alternating data bits and continuation bits, then, after the
// first 0 continuation bit, junk (well, message data that we're not
// interested in for varint decoding purposes).
uint64_t bits = folly::loadUnaligned<uint64_t>(p);
uint64_t continuationBits = _pext_u64(bits, 0x8080808080808080ULL);
if (continuationBits == 0xFF) {
throwInvalidVarint();
}
size_t intBytes = __builtin_ctz(continuationBits + 1) + 1;
c.skipNoAdvance(2 + intBytes);

uint64_t mask = (1ULL << (8 * intBytes - 1)) - 1;
// You might think it would make more sense to to the pext first and mask
// afterwards (avoiding having two pexts in a single dependency chain at 3
// cycles / pop); this seems not to be borne out in microbenchmarks. The
// mask you need ends up being more complicated to compute.
uint64_t highBits = _pext_u64((bits & mask), 0x7F7F7F7F7F7F7F7FULL);
result |= (highBits << 14);

value = result;
} else {
readVarintSlow(c, value);
}
}
#endif // THRIFT_UTIL_VARINTUTILS_BMI2_DECODER

template <class T, class CursorT>
void readVarintMediumSlow(CursorT& c, T& value, const uint8_t* p, size_t len) {
static_assert(
sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8,
"Trying to deserialize into an unsupported type");
if (sizeof(T) <= 4) {
readVarintMediumSlowUnrolled(c, value, p, len);
} else {
uint64_t result;
#if THRIFT_UTIL_VARINTUTILS_BMI2_DECODER
readVarintMediumSlowU64BMI2(c, result, p, len);
#elif THRIFT_UTIL_VARINTUTILS_SIMD_DECODER
readVarintMediumSlowU64SIMD(c, result, p, len);
#else
readVarintMediumSlowUnrolled(c, result, p, len);
#endif
value = static_cast<T>(result);
}
}
} // namespace detail

template <class T, class CursorT>
Expand Down
Loading

0 comments on commit 110807c

Please sign in to comment.