From 7ae8661062ba86b08075309db8867bee7b8f5a4a Mon Sep 17 00:00:00 2001 From: animetosho Date: Mon, 25 Sep 2017 14:11:32 +1000 Subject: [PATCH] Only prefer POPCNT for 8-bit bit-counting on newer AMD CPUs A lookup is likely faster everywhere else Ref #4 --- src/common.h | 2 -- src/decoder.cc | 4 ++-- src/encoder.cc | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/common.h b/src/common.h index e95b400..4a8566f 100644 --- a/src/common.h +++ b/src/common.h @@ -71,7 +71,6 @@ #endif -#ifndef __POPCNT__ // table from http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetTable static const unsigned char BitsSetTable256[256] = { @@ -83,7 +82,6 @@ static const unsigned char BitsSetTable256[256] = #undef B4 #undef B6 }; -#endif diff --git a/src/decoder.cc b/src/decoder.cc index 3afcbaa..88a33e3 100644 --- a/src/decoder.cc +++ b/src/decoder.cc @@ -281,7 +281,7 @@ size_t do_decode_sse(const unsigned char* src, unsigned char* dest, size_t len, // all that's left is to 'compress' the data (skip over masked chars) #ifdef __SSSE3__ if(use_ssse3) { -# ifdef __POPCNT__ +# if defined(__POPCNT__) && (defined(__tune_znver1__) || defined(__tune_btver2__)) unsigned char skipped = _mm_popcnt_u32(mask & 0xff); # else unsigned char skipped = BitsSetTable256[mask & 0xff]; @@ -300,7 +300,7 @@ size_t do_decode_sse(const unsigned char* src, unsigned char* dest, size_t len, STOREU_XMM(p, oData); // increment output position -# ifdef __POPCNT__ +# if defined(__POPCNT__) && !defined(__tune_btver1__) p += XMM_SIZE - _mm_popcnt_u32(mask); # else p += XMM_SIZE - skipped - BitsSetTable256[mask >> 8]; diff --git a/src/encoder.cc b/src/encoder.cc index f929cfc..e0fda4b 100644 --- a/src/encoder.cc +++ b/src/encoder.cc @@ -260,7 +260,7 @@ static size_t do_encode_fast(int line_size, int* colOffset, const unsigned char* data = _mm_add_epi8(data, shufMixMA); data2 = _mm_add_epi8(data2, shufMixMB); // store out -#ifdef __POPCNT__ +#if defined(__POPCNT__) && (defined(__tune_znver1__) || defined(__tune_btver2__)) unsigned char shufALen = _mm_popcnt_u32(m1) + 8; unsigned char shufBLen = _mm_popcnt_u32(m2) + 8; #else @@ -652,7 +652,7 @@ size_t do_encode_fast2(int line_size, int* colOffset, const unsigned char* src, data = _mm_add_epi8(data, shufMixMA); data2 = _mm_add_epi8(data2, shufMixMB); // store out -#ifdef __POPCNT__ +#if defined(__POPCNT__) && (defined(__tune_znver1__) || defined(__tune_btver2__)) unsigned char shufALen = _mm_popcnt_u32(m1) + 8; unsigned char shufBLen = _mm_popcnt_u32(m2) + 8; #else