Skip to content

Commit

Permalink
Improve gathers to optimize better
Browse files Browse the repository at this point in the history
If possible, apply scale as immediate to the gather instruction. Also,
take many more chances at using vector instructions for index scaling
and vector gather instructions (especially for SSE vectors).

Ensure that converting gathers are correct (and more efficient than
before).

Fixes: gh-214

Signed-off-by: Matthias Kretz <kretz@kde.org>
  • Loading branch information
mattkretz committed Oct 24, 2018
1 parent f183218 commit 17d7fee
Show file tree
Hide file tree
Showing 17 changed files with 554 additions and 412 deletions.
19 changes: 19 additions & 0 deletions Vc/avx/intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,25 @@ Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
}
#endif

#ifdef Vc_IMPL_AVX2
// Gather 8 single-precision floats from mem at the eight 32-bit indexes.
// Scale is the byte stride applied as the instruction's immediate operand.
template <int Scale> __m256 gather(const float *mem, __m256i indexes)
{
    return _mm256_i32gather_ps(mem, indexes, Scale);
}
// Gather 4 double-precision values from mem at the four 32-bit indexes.
// Scale is the byte stride applied as the instruction's immediate operand.
template <int Scale> __m256d gather(const double *mem, __m128i indexes)
{
    return _mm256_i32gather_pd(mem, indexes, Scale);
}
// Gather 8 signed 32-bit integers from mem at the eight 32-bit indexes.
// Scale is the byte stride applied as the instruction's immediate operand.
template <int Scale> __m256i gather(const int *mem, __m256i indexes)
{
    return _mm256_i32gather_epi32(mem, indexes, Scale);
}
// Gather 8 unsigned 32-bit integers. The intrinsic only accepts int const*,
// so reinterpret the pointer via aliasing_cast; the loaded bits are identical.
template <int Scale> __m256i gather(const unsigned *mem, __m256i indexes)
{
    return _mm256_i32gather_epi32(aliasing_cast<int>(mem), indexes, Scale);
}
#endif

} // namespace AvxIntrinsics
} // namespace Vc

Expand Down
6 changes: 4 additions & 2 deletions Vc/avx/math.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,10 @@ inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int,
const auto frexpMask =
_mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
fixed_size_simd<double, 8> ret = {
fixed_size_simd<double, 4>(_mm256_and_pd(exponentMaximized[0], frexpMask)),
fixed_size_simd<double, 4>(_mm256_and_pd(exponentMaximized[1], frexpMask))};
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
fixed_size_simd<double, 4>(
AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
const auto zeroMask = v == v.Zero();
ret(isnan(v) || !isfinite(v) || zeroMask) = v;
internal_data(*e) =
Expand Down
99 changes: 68 additions & 31 deletions Vc/avx/vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,47 +195,84 @@ template <typename T> class Vector<T, VectorAbi::Avx>
#include "../common/scatterinterface.h"
#if defined Vc_IMPL_AVX2 && !defined Vc_MSVC
// skip this code for MSVC because it fails to do overload resolution correctly
// Direct hardware gather: load Size elements of type T from args.address at
// the indexes in args.indexes with a single AVX2 gather instruction. The
// index-vector type is tied to the lane count (SSE::int_v for <= 4 lanes,
// AVX2::int_v for 8). Scale is the compile-time element-count multiplier;
// the instruction's byte scale is sizeof(T) * Scale.
// NOTE(review): the diff render interleaved the removed declaration and the
// removed SimdArray overload here; only the added definition is kept.
template <int Scale>
Vc_INTRINSIC void gatherImplementation(
    const Common::GatherArguments<
        T, typename std::conditional<(Size <= 4), SSE::int_v, AVX2::int_v>::type,
        Scale> &args)
{
    d.v() = AVX::gather<sizeof(T) * Scale>(args.address, args.indexes.data());
}

// Converting gather via SimdArray: gather values of a different arithmetic
// type MT through the fixed-size implementation (which applies Scale), then
// convert the result into this vector.
// Fix: the span contained a stale removed-diff line assigning from `mem` and
// `indexes`, neither of which exists in this signature; only the valid
// assignment from `args` remains.
template <class MT, class U, int Scale>
Vc_INTRINSIC enable_if<Traits::is_valid_vector_argument<MT>::value &&
                           std::is_integral<U>::value,
                       void>
gatherImplementation(
    const Common::GatherArguments<MT, SimdArray<U, Size>, Scale> &args)
{
    *this = simd_cast<Vector>(SimdArray<MT, Size>(args));
}

// Gather of T with 8 fixed-size integer indexes: widen the index vector to
// AVX2::int_v and forward to the direct hardware-gather overload, preserving
// the compile-time Scale.
// Fix: removed the interleaved old overload (signature plus explicit
// _mm256_i32gather_epi32 body) left behind by the diff render.
template <class U, int Scale>
Vc_INTRINSIC enable_if<std::is_integral<U>::value, void> gatherImplementation(
    const Common::GatherArguments<T, fixed_size_simd<U, 8>, Scale> &args)
{
    gatherImplementation(Common::make_gather<Scale>(
        args.address, simd_cast<AVX2::int_v>(args.indexes)));
}

// Gather of T with 4 fixed-size integer indexes: convert the index vector to
// SSE::int_v (4 lanes) and forward to the direct hardware-gather overload,
// preserving the compile-time Scale.
// Fix: removed the interleaved old Size==8 overload lines (signature and an
// AVX2::int_v body statement) left behind by the diff render.
template <class U, int Scale>
Vc_INTRINSIC enable_if<std::is_integral<U>::value, void> gatherImplementation(
    const Common::GatherArguments<T, fixed_size_simd<U, 4>, Scale> &args)
{
    gatherImplementation(Common::make_gather<Scale>(
        args.address, simd_cast<SSE::int_v>(args.indexes)));
}

// Converting gather with fixed-size indexes: the memory type MT differs from
// the vector's entry type T, so gather into a fixed_size_simd<MT> first
// (which performs the scaled loads) and then convert into this vector.
template <class MT, class U, int Scale>
Vc_INTRINSIC
enable_if<Traits::is_valid_vector_argument<MT>::value &&
// Excluded when MT == T: the same-type case is handled by other overloads.
!std::is_same<T, MT>::value && std::is_integral<U>::value,
void>
gatherImplementation(
const Common::GatherArguments<MT, fixed_size_simd<U, Size>, Scale> &args)
{
*this = simd_cast<Vector>(fixed_size_simd<MT, Size>(args));
}

// Converting vector-index gather into a 16-bit-element vector (sizeof(T) == 2)
// from 8- or 16-bit memory: split the indexes into two int_v halves, perform
// two 32-bit hardware gathers, then narrow the results back to 16 bits.
// NOTE(review): each 32-bit gather lane reads 4 bytes starting at the scaled
// index, i.e. it over-reads past a 1- or 2-byte element — presumably the
// excess bytes are always within mapped memory; confirm against callers.
template <
class MT, class U, class A2, int Scale,
class = enable_if<sizeof(T) == 2 && std::is_integral<MT>::value &&
(sizeof(MT) <= 2) && Vector<U, A2>::size() == size() &&
std::is_integral<U>::value>>
Vc_INTRINSIC void gatherImplementation(
const Common::GatherArguments<MT, Vector<U, A2>, Scale> &args)
{
using AVX2::int_v;
// Lower and upper 8 indexes as separate 8-lane integer vectors.
const auto idx0 = simd_cast<int_v, 0>(args.indexes).data();
const auto idx1 = simd_cast<int_v, 1>(args.indexes).data();
// Two 32-bit gathers with byte scale sizeof(MT) * Scale, then narrowing
// conversion of the 16 gathered values into this 16-lane vector.
*this = simd_cast<Vector>(int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx0)),
int_v(AVX::gather<sizeof(MT) * Scale>(
aliasing_cast<int>(args.address), idx1)));
if (sizeof(MT) == 1) {
// Byte-sized source: drop the high byte picked up by the 32-bit loads.
*this &= 0xff;
if (std::is_signed<MT>::value) {
using Signed = AVX2::Vector<typename std::make_signed<T>::type>;
*this = (simd_cast<Signed>(*this) << 8) >> 8; // sign extend
}
}
}

// Same-type gather with SimdArray indexes: convert the index SimdArray to the
// native index-vector type for this lane count (SSE::int_v for Size == 4,
// AVX2::int_v otherwise) and forward to the direct hardware-gather overload.
// Fix: removed the interleaved old Size==4 overload lines (signature and body
// statement using `mem`/`indexes`) left behind by the diff render.
template <class U, int Scale>
Vc_INTRINSIC enable_if<std::is_integral<U>::value, void> gatherImplementation(
    const Common::GatherArguments<EntryType, SimdArray<U, Size>, Scale> &args)
{
    gatherImplementation(Common::make_gather<Scale>(
        args.address,
        simd_cast<typename std::conditional<(Size == 4), SSE::int_v,
                                            AVX2::int_v>::type>(args.indexes)));
}
#endif // Vc_IMPL_AVX2 && !MSVC

Expand Down
114 changes: 32 additions & 82 deletions Vc/avx/vector.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -386,107 +386,57 @@ Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
// gathers {{{1
// Generic (element-wise) gather implementations for the AVX2 vector types.
// Vc_GATHER_IMPL(V_) expands to the specialized member-template signature for
// AVX2::V_; Vc_M(i_) loads the i_-th element, applying the compile-time index
// scale, and casts it to the vector's value_type.
// Fix: removed the interleaved old AVX2::double_v::gatherImplementation
// (mem/indexes signature and _mm256_setr_pd body) left by the diff render.
#define Vc_GATHER_IMPL(V_)                                                               \
    template <>                                                                          \
    template <class MT, class IT, int Scale>                                             \
    inline void AVX2::V_::gatherImplementation(                                          \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
// double_v: 4 lanes, loaded element-wise.
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }

template <>
template <typename MT, typename IT>
inline void AVX2::float_v::gatherImplementation(const MT *mem, const IT &indexes)
Vc_GATHER_IMPL(float_v)
{
d.v() = _mm256_setr_ps(mem[indexes[0]],
mem[indexes[1]],
mem[indexes[2]],
mem[indexes[3]],
mem[indexes[4]],
mem[indexes[5]],
mem[indexes[6]],
mem[indexes[7]]);
d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
Vc_M(7));
}

#ifdef Vc_IMPL_AVX2
#ifndef Vc_MSVC
// skip this code for MSVC because it fails to do overload resolution correctly
template <>
Vc_INTRINSIC void AVX2::double_v::gatherImplementation(const double *mem,
SSE::int_v indexes)
Vc_GATHER_IMPL(int_v)
{
d.v() = _mm256_i32gather_pd(mem, indexes.data(), sizeof(double));
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}

template <>
Vc_INTRINSIC void AVX2::float_v::gatherImplementation(const float *mem,
AVX2::int_v indexes)
Vc_GATHER_IMPL(uint_v)
{
d.v() = _mm256_i32gather_ps(mem, indexes.data(), sizeof(float));
d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7));
}

template <>
Vc_INTRINSIC void AVX2::int_v::gatherImplementation(const int *mem,
AVX2::int_v indexes)
{
d.v() = _mm256_i32gather_epi32(mem, indexes.data(), sizeof(int));
}

template <>
Vc_INTRINSIC void AVX2::uint_v::gatherImplementation(const uint *mem,
AVX2::int_v indexes)
Vc_GATHER_IMPL(short_v)
{
d.v() =
_mm256_i32gather_epi32(aliasing_cast<int>(mem), indexes.data(), sizeof(unsigned));
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif // !Vc_MSVC

template <>
template <typename MT, typename IT>
inline void AVX2::int_v::gatherImplementation(const MT *mem, const IT &indexes)
Vc_GATHER_IMPL(ushort_v)
{
d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]]);
}

template <>
template <typename MT, typename IT>
inline void AVX2::uint_v::gatherImplementation(const MT *mem, const IT &indexes)
{
d.v() = _mm256_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]]);
}

template <>
template <typename MT, typename IT>
inline void AVX2::short_v::gatherImplementation(const MT *mem, const IT &indexes)
{
d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]], mem[indexes[8]],
mem[indexes[9]], mem[indexes[10]], mem[indexes[11]],
mem[indexes[12]], mem[indexes[13]], mem[indexes[14]],
mem[indexes[15]]);
}

template <>
template <typename MT, typename IT>
inline void AVX2::ushort_v::gatherImplementation(const MT *mem, const IT &indexes)
{
d.v() = _mm256_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]],
mem[indexes[3]], mem[indexes[4]], mem[indexes[5]],
mem[indexes[6]], mem[indexes[7]], mem[indexes[8]],
mem[indexes[9]], mem[indexes[10]], mem[indexes[11]],
mem[indexes[12]], mem[indexes[13]], mem[indexes[14]],
mem[indexes[15]]);
d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(const MT *mem, const IT &indexes, MaskArgument mask)
template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
const auto *mem = args.address;
const auto indexes = Scale * args.indexes;
using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
Expand Down
Loading

0 comments on commit 17d7fee

Please sign in to comment.