Skip to content

Commit

Permalink
Add SSE41 path for streamvbyte_compressedbytes. (#57)
Browse files Browse the repository at this point in the history
* Move compressedbytes to _encode.c.

In preparation of SIMD implementations.

* Split compressedbytes scalar path

* Make data_bytes scalar path branchless

* Move x64 control byte calculation into a helper

This code path will be shared with compressedbytes calculation later.

* Add SSE41 path for streamvbyte_compressedbytes
  • Loading branch information
ishitatsuyuki authored Apr 12, 2023
1 parent f759071 commit 170fef1
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 39 deletions.
32 changes: 2 additions & 30 deletions include/streamvbyte.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,43 +43,15 @@ static inline size_t streamvbyte_max_compressedbytes(const uint32_t length) {
// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
// the compressed data: the user needs to ensure that this region is allocated, and it
// is not included by streamvbyte_compressedbytes.
// Return the exact number of bytes needed to encode `length` integers from
// `in`: one control byte per group of up to four values, plus 1 to 4 data
// bytes per value depending on its magnitude.
//   in      input integers; not dereferenced when length == 0
//   length  number of integers to account for
// Runs one pass over the input.
static inline size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
    // number of control bytes: ceil(length / 4). Written as quotient plus
    // remainder test because (length + 3) / 4 wraps around in 32-bit
    // arithmetic when length > UINT32_MAX - 3.
    size_t cb = (size_t)(length / 4) + (length % 4 != 0);
    // number of data bytes:
    size_t db = 0;
    for (uint32_t c = 0; c < length; c++) {
        uint32_t val = in[c];

        if (val < (1u << 8)) {
            db += 1;
        } else if (val < (1u << 16)) {
            db += 2;
        } else if (val < (1u << 24)) {
            db += 3;
        } else {
            db += 4;
        }
    }
    return cb + db;
}
size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length);

// return the exact number of compressed bytes given length input integers
// runtime in O(n) wrt. in; use streamvbyte_max_compressedbytes if you
// care about speed more than potentially over-allocating memory
// Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
// the compressed data: the user needs to ensure that this region is allocated, and it
// is not included by streamvbyte_compressedbytes.
// Return the exact number of bytes needed to encode `length` integers from
// `in` in the 0/1/2/4 variant: one control byte per group of up to four
// values, plus 0, 1, 2 or 4 data bytes per value (zero takes no data byte,
// anything above 16 bits takes four).
//   in      input integers; not dereferenced when length == 0
//   length  number of integers to account for
// Runs one pass over the input.
static inline size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
    // number of control bytes: ceil(length / 4). Written as quotient plus
    // remainder test because (length + 3) / 4 wraps around in 32-bit
    // arithmetic when length > UINT32_MAX - 3.
    size_t cb = (size_t)(length / 4) + (length % 4 != 0);
    // number of data bytes:
    size_t db = 0;
    for (uint32_t c = 0; c < length; c++) {
        uint32_t val = in[c];

        if (val == 0) {
            db += 0;
        } else if (val < (1u << 8)) {
            db += 1;
        } else if (val < (1u << 16)) {
            db += 2;
        } else {
            db += 4;
        }
    }
    return cb + db;
}
size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length);

// Read "length" 32-bit integers in varint format from in, storing the result in out.
// Returns the number of bytes read. We may read up to STREAMVBYTE_PADDING extra bytes
Expand Down
40 changes: 40 additions & 0 deletions src/streamvbyte_encode.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,46 @@ static uint8_t *svb_encode_scalar(const uint32_t *in,
#include "streamvbyte_arm_encode.c"
#endif

// Sum the data bytes (1 to 4 per value) needed for the first `length`
// integers of `in`. Control bytes are not counted here.
static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length) {
    size_t total = 0;
    uint32_t idx = 0;
    while (idx < length) {
        const uint32_t v = in[idx];
        idx++;

        // Every value costs one byte; each higher byte that is in use
        // (tested without branching) costs one more.
        total += 1;
        total += (v >> 8) != 0;
        total += (v >> 16) != 0;
        total += (v >> 24) != 0;
    }
    return total;
}

// Sum the data bytes needed for the first `length` integers of `in` in the
// 0/1/2/4 variant: 0 for zero, 1 up to 8 bits, 2 up to 16 bits, 4 otherwise.
// Control bytes are not counted here.
static size_t svb_data_bytes_0124_scalar(const uint32_t* in, uint32_t length) {
    size_t total = 0;
    uint32_t idx = 0;
    while (idx < length) {
        const uint32_t v = in[idx];
        idx++;

        // Branch-free: nonzero adds 1, more than 8 bits adds 1, more than
        // 16 bits adds 2 (jumping straight from two bytes to four).
        const uint32_t nonzero = (v != 0);
        const uint32_t over_8_bits = ((v >> 8) != 0);
        const uint32_t over_16_bits = ((v >> 16) != 0);
        total += nonzero + over_8_bits + (over_16_bits << 1);
    }
    return total;
}

// Return the exact number of bytes needed to encode `length` integers from
// `in`: the control bytes plus the per-value data bytes.
//   in      input integers
//   length  number of integers to account for
// When STREAMVBYTE_X64 is defined and streamvbyte_sse41() reports true, the
// data bytes are counted by svb_data_bytes_SSE41; otherwise by the scalar
// loop.
size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
    // number of control bytes: ceil(length / 4). Written as quotient plus
    // remainder test because (length + 3) / 4 wraps around in 32-bit
    // arithmetic when length > UINT32_MAX - 3.
    size_t cb = (size_t)(length / 4) + (length % 4 != 0);

#ifdef STREAMVBYTE_X64
    if (streamvbyte_sse41()) {
        return cb + svb_data_bytes_SSE41(in, length);
    }
#endif
    return cb + svb_data_bytes_scalar(in, length);
}

// Return the exact number of bytes needed to encode `length` integers from
// `in` in the 0/1/2/4 variant: the control bytes plus the per-value data
// bytes counted by svb_data_bytes_0124_scalar.
//   in      input integers
//   length  number of integers to account for
size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
    // number of control bytes: ceil(length / 4). Written as quotient plus
    // remainder test because (length + 3) / 4 wraps around in 32-bit
    // arithmetic when length > UINT32_MAX - 3.
    size_t cb = (size_t)(length / 4) + (length % 4 != 0);

    return cb + svb_data_bytes_0124_scalar(in, length);
}


// Encode an array of a given length read from in to bout in streamvbyte format.
Expand Down
53 changes: 44 additions & 9 deletions src/streamvbyte_x64_encode.c
Original file line number Diff line number Diff line change
@@ -1,15 +1,55 @@
#include "streamvbyte_isadetection.h"
#ifdef STREAMVBYTE_X64
// contributed by aqrit

static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length);

// Compute the 2-bit length codes for eight 32-bit integers held in two
// vectors (`lo` = integers 0..3, `hi` = integers 4..7).
// Returns a 16-bit mask in a size_t: bits 0..7 are the codes for `lo`,
// bits 8..15 the codes for `hi`, two bits per integer, where code k means
// the integer needs k + 1 bytes (00 -> 1, 01 -> 2, 10 -> 3, 11 -> 4).
STREAMVBYTE_TARGET_SSE41
static inline size_t svb_control_SSE41 (__m128i lo, __m128i hi) {
const __m128i mask_01 = _mm_set1_epi8(0x01);
const __m128i mask_7F00 = _mm_set1_epi16(0x7F00);

__m128i m0, m1;
size_t keys;

// Step 1: turn every input byte into a flag: 0x00 if zero, 0x01 otherwise.
m0 = _mm_min_epu8(mask_01, lo);
m1 = _mm_min_epu8(mask_01, hi);
// Step 2: collapse each 16-bit half of an integer into one byte with
// unsigned saturation: 0x0000 -> 0x00, 0x0001 -> 0x01, and any half whose
// upper byte is set (0x0100, 0x0101) -> 0xFF. Each 16-bit lane of the
// result now describes one input integer (low byte = low half, high byte =
// high half); `lo` fills lanes 0..3 and `hi` lanes 4..7.
m0 = _mm_packus_epi16(m0, m1);
// Step 3: signed 16-bit min against 0x0101. Only 0x01FF (third byte used,
// fourth byte zero, low half saturated) is clamped; lanes of the form
// 0xFFxx are negative and pass through unchanged.
m0 = _mm_min_epi16(m0, mask_01); // convert 0x01FF to 0x0101
// Step 4: unsigned saturating add of 0x7F00 moves the answer into the top
// bit of each byte: the high byte's top bit is set when the high half is
// nonzero, and lanes starting 0xFF saturate to 0xFFFF so both top bits are
// set for 4-byte values.
m0 = _mm_adds_epu16(m0, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF
// Step 5: gather the top bit of all 16 bytes into the 16-bit result.
keys = (size_t)_mm_movemask_epi8(m0);
return keys;
}
STREAMVBYTE_UNTARGET_REGION

// Sum the data bytes needed for the first `count` integers of `in`.
// Full groups of eight integers are processed with svb_control_SSE41 and
// the remaining (count & 7) integers with svb_data_bytes_scalar.
// NOTE(review): len_lut is defined elsewhere; presumably it maps one control
// byte to the total data length of its four values - confirm.
STREAMVBYTE_TARGET_SSE41
size_t svb_data_bytes_SSE41 (const uint32_t* in, uint32_t count) {
    size_t total = 0;
    // First element past the last complete group of eight.
    const uint32_t* const vector_end = &in[(count & ~7)];

    while (in != vector_end) {
        const __m128i first_four = _mm_loadu_si128((__m128i *) &in[0]);
        const __m128i next_four = _mm_loadu_si128((__m128i *) &in[4]);

        // 16 bits of length codes: low byte for first_four, high byte for next_four.
        const size_t codes = svb_control_SSE41(first_four, next_four);
        total += len_lut[codes & 0xFF];
        total += len_lut[codes >> 8];

        in += 8;
    }

    // `in` now points at the tail that did not fill a whole group.
    total += svb_data_bytes_scalar(in, count & 7);
    return total;
}
STREAMVBYTE_UNTARGET_REGION

STREAMVBYTE_TARGET_SSE41
size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* out) {
uint32_t keyLen = (count >> 2) + (((count & 3) + 3) >> 2); // 2-bits per each rounded up to byte boundry
uint8_t *restrict keyPtr = &out[0];
uint8_t *restrict dataPtr = &out[keyLen]; // variable length data after keys

const __m128i mask_01 = _mm_set1_epi8(0x01);
const __m128i mask_7F00 = _mm_set1_epi16(0x7F00);

for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8)
{
__m128i r0, r1, r2, r3;
Expand All @@ -18,12 +58,7 @@ size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* ou
r0 = _mm_loadu_si128((__m128i*)&in[0]);
r1 = _mm_loadu_si128((__m128i*)&in[4]);

r2 = _mm_min_epu8(mask_01, r0);
r3 = _mm_min_epu8(mask_01, r1);
r2 = _mm_packus_epi16(r2, r3);
r2 = _mm_min_epi16(r2, mask_01); // convert 0x01FF to 0x0101
r2 = _mm_adds_epu16(r2, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF
keys = (size_t)_mm_movemask_epi8(r2);
keys = svb_control_SSE41(r0, r1);

r2 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys << 4) & 0x03F0]);
r3 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys >> 4) & 0x03F0]);
Expand Down

0 comments on commit 170fef1

Please sign in to comment.