Skip to content

Commit

Permalink
Full implementation of 16-bit vec128
Browse files Browse the repository at this point in the history
  • Loading branch information
adbancroft committed Feb 12, 2022
1 parent d290f5b commit a6090fa
Showing 1 changed file with 59 additions and 4 deletions.
63 changes: 59 additions & 4 deletions libdivide.h
Original file line number Diff line number Diff line change
Expand Up @@ -2723,11 +2723,24 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
////////// UINT16

// 16-bit unsigned vector division: applies the precomputed magic/shift pair
// in `denom` to each of the eight unsigned 16-bit lanes of `numers`.
//
// numers: eight packed uint16 numerators.
// denom:  precomputed divider; only `magic` and `more` are read.
// Returns the eight per-lane results.
__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        // No multiplier: the result is a plain logical right shift.
        // NOTE(review): `more` is used unmasked here, which presumes no flag
        // bits accompany a zero magic -- confirm against the divider generator.
        return _mm_srli_epi16(numers, more);
    } else {
        // q = high 16 bits of (numers * magic), per lane
        __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // t = ((numers - q) >> 1) + q
            // q <= numers and t <= numers, so the saturating forms never clamp.
            __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
            return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
        } else {
            // Without the add marker the shift count is taken from `more` as is.
            return _mm_srli_epi16(q, more);
        }
    }
}

// Branch-free 16-bit unsigned vector division: every lane goes through the
// same multiply / halve-and-add / shift sequence, with no test on `magic`
// or on flag bits.
//
// numers: eight packed uint16 numerators.
// denom:  precomputed branch-free divider; `magic` is the multiplier and
//         `more` is used directly as the final shift count.
// Returns the eight per-lane results.
__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
    // q = high 16 bits of (numers * magic), per lane
    __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
    // t = ((numers - q) >> 1) + q
    // q <= numers and t <= numers, so the saturating forms never clamp.
    __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
    return _mm_srli_epi16(t, denom->more);
}

////////// UINT32
Expand Down Expand Up @@ -2787,11 +2800,53 @@ __m128i libdivide_u64_branchfree_do_vec128(
////////// SINT16

// 16-bit signed vector division: applies the precomputed magic/shift pair in
// `denom` to each of the eight signed 16-bit lanes of `numers`.
//
// numers: eight packed int16 numerators.
// denom:  precomputed divider; only `magic` and `more` are read. The low bits
//         of `more` (LIBDIVIDE_16_SHIFT_MASK) hold the shift count and its top
//         bit is broadcast below as a negate-the-result mask.
// Returns the eight per-lane results.
__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        // Shift-only path.
        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
        uint16_t mask = ((uint16_t)1 << shift) - 1;
        __m128i roundToZeroTweak = _mm_set1_epi16(mask);
        // Bias negative lanes by (2**shift - 1) so the arithmetic shift
        // rounds toward zero instead of toward negative infinity.
        // q = numer + ((numer >> 15) & roundToZeroTweak);
        __m128i q = _mm_add_epi16(
            numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
        q = _mm_srai_epi16(q, shift);
        // sign is all-ones in every lane when the top bit of `more` is set,
        // else zero (relies on >> of a negative int8_t being arithmetic).
        __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
        // Conditional negate: q = (q ^ sign) - sign;
        q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
        return q;
    } else {
        // q = high 16 bits of the signed product (numers * magic), per lane
        __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
            // Add or subtract the numerator depending on sign:
            // q += ((numer ^ sign) - sign);
            q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
        }
        // q >>= shift
        q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
        // Logical shift extracts the sign bit as 0/1: q += (q < 0)
        q = _mm_add_epi16(q, _mm_srli_epi16(q, 15));
        return q;
    }
}

// Branch-free 16-bit signed vector division: every lane goes through the
// same multiply / add / round-fixup / shift / conditional-negate sequence.
//
// numers: eight packed int16 numerators.
// denom:  precomputed branch-free divider; only `magic` and `more` are read.
//         The low bits of `more` (LIBDIVIDE_16_SHIFT_MASK) hold the shift
//         count and its top bit is broadcast as a negate-the-result mask.
// Returns the eight per-lane results.
__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
    int16_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
    // must be arithmetic shift: sign is all-ones or zero in every lane
    __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
    // q = high 16 bits of the signed product (numers * magic), per lane
    __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
    q = _mm_add_epi16(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint16_t is_power_of_2 = (magic == 0);
    __m128i q_sign = _mm_srai_epi16(q, 15);  // q_sign = q >> 15
    __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
    q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    q = _mm_srai_epi16(q, shift);                       // q >>= shift
    q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

////////// SINT32
Expand Down

0 comments on commit a6090fa

Please sign in to comment.