intel · r-devulap · Feb 26, 2024 · Feb 27, 2024 · Feb 27, 2024 · Feb 27, 2024
diff --git a/meson.build b/meson.build
@@ -1,5 +1,5 @@
 project('x86-simd-sort', 'cpp',
-        version : '4.0.0',
+        version : '5.0.0',
         license : 'BSD 3-clause',
         default_options : ['cpp_std=c++17'])
 fs = import('fs')

diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
@@ -25,6 +25,7 @@ template <typename vtype, typename reg_t>
 X86_SIMD_SORT_INLINE reg_t sort_zmm_64bit(reg_t zmm);
 
 struct avx512_64bit_swizzle_ops;
+struct avx512_ymm_64bit_swizzle_ops;
 
 template <>
 struct ymm_vector<float> {
@@ -34,6 +35,7 @@ struct ymm_vector<float> {
     using opmask_t = __mmask8;
     static const uint8_t numlanes = 8;
     static constexpr simd_type vec_type = simd_type::AVX512;
+    using swizzle_ops = avx512_ymm_64bit_swizzle_ops;
 
     static type_t type_max()
     {
@@ -208,6 +210,10 @@ struct ymm_vector<float> {
     {
         return _mm256_castps_si256(v);
     }
+    static bool all_false(opmask_t k) // Reports whether mask k has no bits set.
+    {
+        return k == 0; // true only when every bit of the mask is zero
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -222,6 +228,7 @@ struct ymm_vector<uint32_t> {
     using opmask_t = __mmask8;
     static const uint8_t numlanes = 8;
     static constexpr simd_type vec_type = simd_type::AVX512;
+    using swizzle_ops = avx512_ymm_64bit_swizzle_ops;
 
     static type_t type_max()
     {
@@ -382,6 +389,10 @@ struct ymm_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k) // Reports whether mask k has no bits set.
+    {
+        return k == 0; // true only when every bit of the mask is zero
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -396,6 +407,7 @@ struct ymm_vector<int32_t> {
     using opmask_t = __mmask8;
     static const uint8_t numlanes = 8;
     static constexpr simd_type vec_type = simd_type::AVX512;
+    using swizzle_ops = avx512_ymm_64bit_swizzle_ops;
 
     static type_t type_max()
     {
@@ -556,6 +568,10 @@ struct ymm_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k) // Reports whether mask k has no bits set.
+    {
+        return k == 0; // true only when every bit of the mask is zero
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -1204,4 +1220,77 @@ struct avx512_64bit_swizzle_ops {
     }
 };
 
+struct avx512_ymm_64bit_swizzle_ops { // Swap / reverse / merge helpers that rearrange the eight 32-bit elements of a 256-bit register.
+    template <typename vtype, int scale> // vtype supplies reg_t, cast_to, cast_from; scale selects the group size (2, 4 or 8).
+    X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) // Exchanges the two halves of every group of `scale` elements.
+    {
+        __m256i v = vtype::cast_to(reg); // work on the integer view so one code path serves every vtype
+
+        if constexpr (scale == 2) { // swap neighbouring elements
+            __m256 vf = _mm256_castsi256_ps(v);
+            vf = _mm256_permute_ps(vf, 0b10110001); // per 128-bit lane: picks elements [1,0,3,2]
+            v = _mm256_castps_si256(vf);
+        }
+        else if constexpr (scale == 4) { // swap neighbouring pairs
+            __m256 vf = _mm256_castsi256_ps(v);
+            vf = _mm256_permute_ps(vf, 0b01001110); // per 128-bit lane: picks elements [2,3,0,1]
+            v = _mm256_castps_si256(vf);
+        }
+        else if constexpr (scale == 8) { // swap the two 128-bit halves
+            v = _mm256_permute2x128_si256(v, v, 0b00000001); // low half <- high half, high half <- low half
+        }
+        else {
+            static_assert(scale == -1, "should not be reached"); // condition depends on scale, so it only fires for an unsupported scale
+        }
+
+        return vtype::cast_from(v); // convert back to the caller's register type
+    }
+
+    template <typename vtype, int scale> // vtype must additionally provide reverse(); scale is 2, 4 or 8.
+    X86_SIMD_SORT_INLINE typename vtype::reg_t
+    reverse_n(typename vtype::reg_t reg) // Reverses the element order inside every group of `scale` elements.
+    {
+        __m256i v = vtype::cast_to(reg); // only consumed by the scale == 4 branch; the other branches return early
+
+        if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); } // reversing a pair is the same as swapping it
+        else if constexpr (scale == 4) {
+            constexpr uint64_t mask = 0b00011011; // per 128-bit lane: picks elements [3,2,1,0]
+            __m256 vf = _mm256_castsi256_ps(v);
+            vf = _mm256_permute_ps(vf, mask);
+            v = _mm256_castps_si256(vf);
+        }
+        else if constexpr (scale == 8) {
+            return vtype::reverse(reg); // whole-register reversal is delegated to the vector type
+        }
+        else {
+            static_assert(scale == -1, "should not be reached"); // condition depends on scale, so it only fires for an unsupported scale
+        }
+
+        return vtype::cast_from(v); // reached only for scale == 4
+    }
+
+    template <typename vtype, int scale> // scale is 2, 4 or 8 and sets the width of the alternating runs.
+    X86_SIMD_SORT_INLINE typename vtype::reg_t
+    merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) // Interleaves runs of scale/2 elements: first from `other`, then from `reg`.
+    {
+        __m256i v1 = vtype::cast_to(reg);
+        __m256i v2 = vtype::cast_to(other);
+
+        if constexpr (scale == 2) { // a set bit in the blend immediate selects the element from v2 (`other`)
+            v1 = _mm256_blend_epi32(v1, v2, 0b01010101); // elements 0,2,4,6 from other; 1,3,5,7 from reg
+        }
+        else if constexpr (scale == 4) {
+            v1 = _mm256_blend_epi32(v1, v2, 0b00110011); // elements 0,1,4,5 from other; 2,3,6,7 from reg
+        }
+        else if constexpr (scale == 8) {
+            v1 = _mm256_blend_epi32(v1, v2, 0b00001111); // elements 0-3 from other; 4-7 from reg
+        }
+        else {
+            static_assert(scale == -1, "should not be reached"); // condition depends on scale, so it only fires for an unsupported scale
+        }
+
+        return vtype::cast_from(v1); // convert back to the caller's register type
+    }
+};
+
 #endif
diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp
@@ -388,7 +388,10 @@ X86_SIMD_SORT_INLINE void qsort_64bit_(type1_t *keys,
         return;
     }
 
-    type1_t pivot = get_pivot_blocks<vtype1>(keys, left, right);
+    type1_t pivot;
+    auto pivot_result = get_pivot_smart<vtype1, type1_t>(keys, left, right);
+    pivot = pivot_result.pivot;
+
     type1_t smallest = vtype1::type_max();
     type1_t biggest = vtype1::type_min();
     arrsize_t pivot_index = partition_avx512_unrolled<vtype1, vtype2, 4>(

diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
@@ -157,12 +157,7 @@ get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right)
         // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition
         return pivot_results<type_t>(median);
     }
-    else {
-        // Should be unreachable
-        return pivot_results<type_t>(median);
-    }
 
-    // Should be unreachable
     return pivot_results<type_t>(median);
 }
 

diff --git a/utils/rand_array.h b/utils/rand_array.h
@@ -137,7 +137,7 @@ static std::vector<T> get_array(std::string arrtype,
             val = std::numeric_limits<T>::max();
         }
         for (size_t ii = 1; ii <= arrsize; ++ii) {
-            if (rand() % 0x1) { arr[ii] = val; }
+            if (rand() & 0x1) { arr[ii] = val; }
         }
     }
     else {