Skip to content

Commit

Permalink
Improve gathers to optimize better
Browse files Browse the repository at this point in the history
If possible, apply scale as immediate to the gather instruction. Also,
take many more chances at using vector instructions for index scaling
and vector gather instructions (especially for SSE vectors).

Ensure that converting gathers are correct (and more efficient than
before).

Fixes: gh-214

Signed-off-by: Matthias Kretz <kretz@kde.org>
  • Loading branch information
mattkretz committed Oct 24, 2018
1 parent f183218 commit 17d7fee
Show file tree
Hide file tree
Showing 17 changed files with 554 additions and 412 deletions.
19 changes: 19 additions & 0 deletions Vc/avx/intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,25 @@ Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
}
#endif

#ifdef Vc_IMPL_AVX2
// Gather 8 single-precision floats from mem at the eight 32-bit indexes.
// Scale is the byte stride applied as the instruction's immediate operand.
template <int Scale> __m256 gather(const float *mem, __m256i indexes)
{
    return _mm256_i32gather_ps(mem, indexes, Scale);
}
// Gather 4 double-precision values from mem at the four 32-bit indexes.
// Scale is the byte stride applied as the instruction's immediate operand.
template <int Scale> __m256d gather(const double *mem, __m128i indexes)
{
    return _mm256_i32gather_pd(mem, indexes, Scale);
}
// Gather 8 signed 32-bit integers from mem at the eight 32-bit indexes.
// Scale is the byte stride applied as the instruction's immediate operand.
template <int Scale> __m256i gather(const int *mem, __m256i indexes)
{
    return _mm256_i32gather_epi32(mem, indexes, Scale);
}
// Gather 8 unsigned 32-bit integers. The intrinsic only accepts int const*,
// so reinterpret the pointer via aliasing_cast; the loaded bits are identical.
template <int Scale> __m256i gather(const unsigned *mem, __m256i indexes)
{
    return _mm256_i32gather_epi32(aliasing_cast<int>(mem), indexes, Scale);
}
#endif

} // namespace AvxIntrinsics
} // namespace Vc

Expand Down
6 changes: 4 additions & 2 deletions Vc/avx/math.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,10 @@ inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int,
const auto frexpMask =
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
fixed_size_simd<double, 8> ret = {
fixed_size_simd<double, 4>(_mm256_and_pd(exponentMaximized[0], frexpMask)),
fixed_size_simd<double, 4>(_mm256_and_pd(exponentMaximized[1], frexpMask))};
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
const auto zeroMask = v == v.Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
internal_data(*e) =
Expand Down
99 changes: 68 additions & 31 deletions Vc/avx/vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,47 +195,84 @@ template <typename T> class Vector<T, VectorAbi::Avx>
#include "../common/scatterinterface.h"
#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
// skip this code for MSVC because it fails to do overload resolution correctly
// Direct hardware gather: load Size elements of type T from args.address at
// the indexes in args.indexes with a single AVX2 gather instruction. The
// index-vector type is tied to the lane count (SSE::int_v for <= 4 lanes,
// AVX2::int_v for 8). Scale is the compile-time element-count multiplier;
// the instruction's byte scale is sizeof(T) * Scale.
// NOTE(review): the diff render interleaved the removed declaration and the
// removed SimdArray overload here; only the added definition is kept.
template <int Scale>
Vc_INTRINSIC void gatherImplementation(
    const Common::GatherArguments<
        T, typename std::conditional<(Size <= 4), SSE::int_v, AVX2::int_v>::type,
        Scale> &args)
{
    d.v() = AVX::gather<sizeof(T) * Scale>(args.address, args.indexes.data());
}

// Converting gather via SimdArray: gather values of a different arithmetic
// type MT through the fixed-size implementation (which applies Scale), then
// convert the result into this vector.
// Fix: the span contained a stale removed-diff line assigning from `mem` and
// `indexes`, neither of which exists in this signature; only the valid
// assignment from `args` remains.
template <class MT, class U, int Scale>
Vc_INTRINSIC enable_if<Traits::is_valid_vector_argument<MT>::value &&
                           std::is_integral<U>::value,
                       void>
gatherImplementation(
    const Common::GatherArguments<MT, SimdArray<U, Size>, Scale> &args)
{
    *this = simd_cast<Vector>(SimdArray<MT, Size>(args));
}

// Gather of T with 8 fixed-size integer indexes: widen the index vector to
// AVX2::int_v and forward to the direct hardware-gather overload, preserving
// the compile-time Scale.
// Fix: removed the interleaved old overload (signature plus explicit
// _mm256_i32gather_epi32 body) left behind by the diff render.
template <class U, int Scale>
Vc_INTRINSIC enable_if<std::is_integral<U>::value, void> gatherImplementation(
    const Common::GatherArguments<T, fixed_size_simd<U, 8>, Scale> &args)
{
    gatherImplementation(Common::make_gather<Scale>(
        args.address, simd_cast<AVX2::int_v>(args.indexes)));
}

// Gather of T with 4 fixed-size integer indexes: convert the index vector to
// SSE::int_v (4 lanes) and forward to the direct hardware-gather overload,
// preserving the compile-time Scale.
// Fix: removed the interleaved old Size==8 overload lines (signature and an
// AVX2::int_v body statement) left behind by the diff render.
template <class U, int Scale>
Vc_INTRINSIC enable_if<std::is_integral<U>::value, void> gatherImplementation(
    const Common::GatherArguments<T, fixed_size_simd<U, 4>, Scale> &args)
{
    gatherImplementation(Common::make_gather<Scale>(
        args.address, simd_cast<SSE::int_v>(args.indexes)));
}

// Converting gather with fixed-size indexes: the memory type MT differs from
// the vector's entry type T, so gather into a fixed_size_simd<MT> first
// (which performs the scaled loads) and then convert into this vector.
template <class MT, class U, int Scale>
Vc_INTRINSIC
enable_if<Traits::is_valid_vector_argument<MT>::value &&
// Excluded when MT == T: the same-type case is handled by other overloads.
!std::is_same<T, MT>::value && std::is_integral<U>::value,
void>
gatherImplementation(
const Common::GatherArguments<MT, fixed_size_simd<U, Size>, Scale> &args)
{
*this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
}

// Converting vector-index gather into a 16-bit-element vector (sizeof(T) == 2)
// from 8- or 16-bit memory: split the indexes into two int_v halves, perform
// two 32-bit hardware gathers, then narrow the results back to 16 bits.
// NOTE(review): each 32-bit gather lane reads 4 bytes starting at the scaled
// index, i.e. it over-reads past a 1- or 2-byte element — presumably the
// excess bytes are always within mapped memory; confirm against callers.
template <
class MT, class U, class A2, int Scale,
class = enable_if<sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A2>::size() == size() &&
std::is_integral<U>::value>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A2>, Scale> &args)
{
using AVX2::int_v;
// Lower and upper 8 indexes as separate 8-lane integer vectors.
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
// Two 32-bit gathers with byte scale sizeof(MT) * Scale, then narrowing
// conversion of the 16 gathered values into this 16-lane vector.
*this = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
// Byte-sized source: drop the high byte picked up by the 32-bit loads.
*this &= 0xff;
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
*this = (simd_cast<Signed>(*this) << 8) >> 8; // sign extend
}
}
}

// Same-type gather with SimdArray indexes: convert the index SimdArray to the
// native index-vector type for this lane count (SSE::int_v for Size == 4,
// AVX2::int_v otherwise) and forward to the direct hardware-gather overload.
// Fix: removed the interleaved old Size==4 overload lines (signature and body
// statement using `mem`/`indexes`) left behind by the diff render.
template <class U, int Scale>
Vc_INTRINSIC enable_if<std::is_integral<U>::value, void> gatherImplementation(
    const Common::GatherArguments<EntryType, SimdArray<U, Size>, Scale> &args)
{
    gatherImplementation(Common::make_gather<Scale>(
        args.address,
        simd_cast<typename std::conditional<(Size == 4), SSE::int_v,
                                            AVX2::int_v>::type>(args.indexes)));
}
#endif // Vc_IMPL_AVX2 && !MSVC

Expand Down
114 changes: 32 additions & 82 deletions Vc/avx/vector.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -386,107 +386,57 @@ Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
// gathers {{{1
// Generic (element-wise) gather implementations for the AVX2 vector types.
// Vc_GATHER_IMPL(V_) expands to the specialized member-template signature for
// AVX2::V_; Vc_M(i_) loads the i_-th element, applying the compile-time index
// scale, and casts it to the vector's value_type.
// Fix: removed the interleaved old AVX2::double_v::gatherImplementation
// (mem/indexes signature and _mm256_setr_pd body) left by the diff render.
#define Vc_GATHER_IMPL(V_)                                                               \
    template <>                                                                          \
    template <class MT, class IT, int Scale>                                             \
    inline void AVX2::V_::gatherImplementation(                                          \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
// double_v: 4 lanes, loaded element-wise.
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }

template <>
template <typename MT, typename IT>
inline void AVX2::float_v::gatherImplementation(const MT *mem, const IT &indexes)
Vc_GATHER_IMPL(float_v)
{
d.v() = _mm256_setr_ps(mem[indexes[0]],
mem[indexes[1]],
mem[indexes[2]],
mem[indexes[3]],
mem[indexes[4]],
mem[indexes[5]],
mem[indexes[6]],
mem[indexes[7]]);
d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
Vc_M(7));
}

#ifdef Vc_IMPL_AVX2
#ifndef Vc_MSVC
// skip this code for MSVC because it fails to do overload resolution correctly
template <>
Vc_INTRINSIC void AVX2::double_v::gatherImplementation(const double *mem,
SSE::int_v indexes)
Vc_GATHER_IMPL(int_v)
{
d.v() = _mm256_i32gather_pd(mem, indexes.data(), sizeof(double));
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}

template <>
Vc_INTRINSIC void AVX2::float_v::gatherImplementation(const float *mem,
AVX2::int_v indexes)
Vc_GATHER_IMPL(uint_v)
{
d.v() = _mm256_i32gather_ps(mem, indexes.data(), sizeof(float));
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}

template <>
Vc_INTRINSIC void AVX2::int_v::gatherImplementation(const int *mem,
AVX2::int_v indexes)
{
d.v() = _mm256_i32gather_epi32(mem, indexes.data(), sizeof(int));
}

template <>
Vc_INTRINSIC void AVX2::uint_v::gatherImplementation(const uint *mem,
AVX2::int_v indexes)
Vc_GATHER_IMPL(short_v)
{
d.v() =
_mm256_i32gather_epi32(aliasing_cast<int>(mem), indexes.data(), sizeof(unsigned));
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif // !Vc_MSVC

template <>
template <typename MT, typename IT>
inline void AVX2::int_v::gatherImplementation(const MT *mem, const IT &indexes)
Vc_GATHER_IMPL(ushort_v)
{
d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]]);
}

template <>
template <typename MT, typename IT>
inline void AVX2::uint_v::gatherImplementation(const MT *mem, const IT &indexes)
{
d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]]);
}

template <>
template <typename MT, typename IT>
inline void AVX2::short_v::gatherImplementation(const MT *mem, const IT &indexes)
{
d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]], mem[indexes[8]],
mem[indexes[9]], mem[indexes[10]], mem[indexes[11]],
mem[indexes[12]], mem[indexes[13]], mem[indexes[14]],
mem[indexes[15]]);
}

template <>
template <typename MT, typename IT>
inline void AVX2::ushort_v::gatherImplementation(const MT *mem, const IT &indexes)
{
d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]], mem[indexes[8]],
mem[indexes[9]], mem[indexes[10]], mem[indexes[11]],
mem[indexes[12]], mem[indexes[13]], mem[indexes[14]],
mem[indexes[15]]);
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask)
template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
const auto *mem = args.address;
const auto indexes = Scale * args.indexes;
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
Expand Down
Loading

0 comments on commit 17d7fee

Please sign in to comment.