From 0041c05d070b94af018d8818b5dc6d3b53274bdd Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Mon, 29 Jan 2024 15:29:08 -0800
Subject: [PATCH 1/6] New pivot selection to improve performance in many
 special cases

---
 src/avx2-32bit-qsort.hpp       |  25 ++++-
 src/avx2-64bit-qsort.hpp       |  30 +++++-
 src/avx512-16bit-qsort.hpp     |  21 ++++
 src/avx512-32bit-qsort.hpp     |   9 ++
 src/avx512-64bit-common.h      |   9 ++
 src/avx512fp16-16bit-qsort.hpp |   3 +
 src/xss-common-qsort.h         |  14 ++-
 src/xss-network-qsort.hpp      |  24 +++--
 src/xss-optimal-networks.hpp   |   3 +
 src/xss-pivot-selection.hpp    | 177 +++++++++++++++++++++++++++++++++
 10 files changed, 301 insertions(+), 14 deletions(-)
diff --git a/src/avx2-32bit-qsort.hpp b/src/avx2-32bit-qsort.hpp
index cf0fbd55..667b2075 100644
--- a/src/avx2-32bit-qsort.hpp
+++ b/src/avx2-32bit-qsort.hpp
@@ -86,6 +86,11 @@ struct avx2_vector<int32_t> {
     {
         return _mm256_set1_epi32(type_max());
     } // TODO: this should broadcast bits as is?
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);
+        return _mm256_xor_si256(x, allOnes);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -204,6 +209,9 @@ struct avx2_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -242,6 +250,11 @@ struct avx2_vector<uint32_t> {
     {
         return _mm256_set1_epi32(type_max());
     }
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);
+        return _mm256_xor_si256(x, allOnes);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -349,6 +362,9 @@ struct avx2_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -387,7 +403,11 @@ struct avx2_vector<float> {
     {
         return _mm256_set1_ps(type_max());
     }
-
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1);
+        return _mm256_xor_si256(x, allOnes);
+    }
     static ymmi_t
     seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
     {
@@ -514,6 +534,9 @@ struct avx2_vector<float> {
     {
         return _mm256_castps_si256(v);
     }
+    static bool all_false(opmask_t k){
+        return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp
index e5f53808..604c974a 100644
--- a/src/avx2-64bit-qsort.hpp
+++ b/src/avx2-64bit-qsort.hpp
@@ -68,12 +68,17 @@ struct avx2_vector<int64_t> {
     {
         return _mm256_set1_epi64x(type_max());
     } // TODO: this should broadcast bits as is?
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF);
+        return _mm256_xor_si256(x, allTrue);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_64bit(mask);
     }
-    static ymmi_t seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
@@ -209,6 +214,9 @@ struct avx2_vector<int64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0;
+    }
 };
 template <>
 struct avx2_vector<uint64_t> {
@@ -239,12 +247,17 @@ struct avx2_vector<uint64_t> {
     {
         return _mm256_set1_epi64x(type_max());
     }
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF);
+        return _mm256_xor_si256(x, allTrue);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_64bit(mask);
     }
-    static ymmi_t seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
@@ -378,6 +391,9 @@ struct avx2_vector<uint64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0;
+    }
 };
 
 /*
@@ -421,6 +437,11 @@ struct avx2_vector<double> {
     {
         return _mm256_set1_pd(type_max());
     }
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF);
+        return _mm256_xor_si256(x, allTrue);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -440,7 +461,7 @@ struct avx2_vector<double> {
             static_assert(type == (0x01 | 0x80), "should not reach here");
         }
     }
-    static ymmi_t seti(int v1, int v2, int v3, int v4)
+    static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
     }
@@ -571,6 +592,9 @@ struct avx2_vector<double> {
     {
         return _mm256_castpd_si256(v);
     }
+    static bool all_false(opmask_t k){
+        return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0;
+    }
 };
 
 struct avx2_64bit_swizzle_ops {
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
index 32d7419c..9d9caf1f 100644
--- a/src/avx512-16bit-qsort.hpp
+++ b/src/avx512-16bit-qsort.hpp
@@ -81,6 +81,10 @@ struct zmm_vector<float16> {
                           exp_eq, mant_x, mant_y, _MM_CMPINT_NLT);
         return _kxor_mask32(mask_ge, neg);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_epu16_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -186,6 +190,9 @@ struct zmm_vector<float16> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -238,6 +245,10 @@ struct zmm_vector<int16_t> {
     {
         return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_epi16_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -323,6 +334,9 @@ struct zmm_vector<int16_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -374,6 +388,10 @@ struct zmm_vector<uint16_t> {
     {
         return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmpeq_epu16_mask(x, y);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
@@ -457,6 +475,9 @@ struct zmm_vector<uint16_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
index 74615765..3fe9b076 100644
--- a/src/avx512-32bit-qsort.hpp
+++ b/src/avx512-32bit-qsort.hpp
@@ -198,6 +198,9 @@ struct zmm_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -377,6 +380,9 @@ struct zmm_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -570,6 +576,9 @@ struct zmm_vector<float> {
     {
         return _mm512_castps_si512(v);
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
index 0c4e3d58..e885d11a 100644
--- a/src/avx512-64bit-common.h
+++ b/src/avx512-64bit-common.h
@@ -732,6 +732,9 @@ struct zmm_vector<int64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -903,6 +906,9 @@ struct zmm_vector<uint64_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -1093,6 +1099,9 @@ struct zmm_vector<double> {
     {
         return _mm512_castpd_si512(v);
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp
index f44209fa..3d72d656 100644
--- a/src/avx512fp16-16bit-qsort.hpp
+++ b/src/avx512fp16-16bit-qsort.hpp
@@ -150,6 +150,9 @@ struct zmm_vector<_Float16> {
     {
         return _mm512_castph_si512(v);
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index 7b89ba21..47267b82 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -498,14 +498,24 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
                 arr + left, (int32_t)(right + 1 - left));
         return;
     }
-
-    type_t pivot = get_pivot_blocks<vtype, type_t>(arr, left, right);
+    
+    auto pivot_result = get_pivot_smart<vtype, type_t>(arr, left, right);
+    type_t pivot = pivot_result.pivot;
+    
+    if (pivot_result.alreadySorted){
+        return;
+    }
+    
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
 
     arrsize_t pivot_index
             = partition_avx512_unrolled<vtype, vtype::partition_unroll_factor>(
                     arr, left, right + 1, pivot, &smallest, &biggest);
+    
+    if (pivot_result.only2Values){
+        return;
+    }
 
     if (pivot != smallest)
         qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp
index 56a1aca1..1c0188d9 100644
--- a/src/xss-network-qsort.hpp
+++ b/src/xss-network-qsort.hpp
@@ -4,6 +4,9 @@
 #include "xss-optimal-networks.hpp"
 #include "xss-common-qsort.h"
 
+template <typename vtype, typename mm_t>
+X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
+
 template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
 X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs)
 {
@@ -140,6 +143,17 @@ X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs)
     }
 }
 
+template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
+X86_SIMD_SORT_FINLINE void sort_vectors(reg_t * vecs){
+    /* Run the initial sorting network to sort the columns of the [numVecs x
+     * num_lanes] matrix
+     */
+    bitonic_sort_n_vec<vtype, numVecs>(vecs);
+
+    // Merge the vectors using bitonic merging networks
+    merge_n_vec<vtype, numVecs>(vecs);
+}
+
 template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
 X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
 {
@@ -174,14 +188,8 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
         vecs[i] = vtype::mask_loadu(
                 vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes);
     }
-
-    /* Run the initial sorting network to sort the columns of the [numVecs x
-     * num_lanes] matrix
-     */
-    bitonic_sort_n_vec<vtype, numVecs>(vecs);
-
-    // Merge the vectors using bitonic merging networks
-    merge_n_vec<vtype, numVecs>(vecs);
+    
+    sort_vectors<vtype, numVecs>(vecs);
 
     // Unmasked part of the store
     X86_SIMD_SORT_UNROLL_LOOP(64)
diff --git a/src/xss-optimal-networks.hpp b/src/xss-optimal-networks.hpp
index 3dfa5281..bffe493d 100644
--- a/src/xss-optimal-networks.hpp
+++ b/src/xss-optimal-networks.hpp
@@ -1,6 +1,9 @@
 // All of these sources files are generated from the optimal networks described in
 // https://bertdobbelaere.github.io/sorting_networks.html
 
+template <typename vtype, typename mm_t>
+X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
+
 template <typename vtype, typename reg_t = typename vtype::reg_t>
 X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs)
 {
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index 2a28b348..348adb1a 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -1,6 +1,45 @@
+#ifndef XSS_PIVOT_SELECTION
+#define XSS_PIVOT_SELECTION
+
+#include "xss-network-qsort.hpp"
+
+template <typename type_t>
+struct pivot_results{
+    bool alreadySorted = false;
+    bool only2Values = false;
+    type_t pivot = 0;
+    
+    pivot_results(type_t _pivot){
+        pivot = _pivot;
+        alreadySorted = false;
+    }
+    
+    pivot_results(type_t _pivot, bool _alreadySorted){
+        pivot = _pivot;
+        alreadySorted = _alreadySorted;
+    }
+};
+
+template <typename type_t>
+type_t next_value(type_t value){
+    // TODO this probably handles non-native float16 wrong
+    if constexpr (std::is_floating_point<type_t>::value){
+        return std::nextafter(value, std::numeric_limits<type_t>::infinity());
+    }else{
+        if (value < std::numeric_limits<type_t>::max()){
+            return value + 1;
+        }else{
+            return value;
+        }
+    }
+}
+
 template <typename vtype, typename mm_t>
 X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
 
+template <typename vtype, typename T = typename vtype::type_t>
+X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);
+
 template <typename vtype, typename type_t>
 X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr,
                                       const arrsize_t left,
@@ -61,3 +100,141 @@ X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr,
     vtype::storeu(data, vec);
     return data[vtype::numlanes / 2];
 }
+
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_near_constant(type_t *arr,
+                                             type_t commonValue,
+                                             const arrsize_t left,
+                                             const arrsize_t right);
+
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_smart(type_t *arr,
+                                             const arrsize_t left,
+                                             const arrsize_t right)
+{
+    using reg_t = typename vtype::reg_t;
+    constexpr int numVecs = 4;
+    
+    if (right - left + 1 <= 4 * numVecs * vtype::numlanes){
+        return pivot_results<type_t>(get_pivot<vtype>(arr, left, right)); 
+    }
+    
+    constexpr int N = numVecs * vtype::numlanes;
+
+    arrsize_t width = (right - vtype::numlanes) - left;
+    arrsize_t delta = width / numVecs;
+
+    reg_t vecs[numVecs];
+    for (int i = 0; i < numVecs; i++) {
+        vecs[i] = vtype::loadu(arr + left + delta * i);
+    }
+    
+    // Sort the samples
+    sort_vectors<vtype, numVecs>(vecs);
+    
+    type_t samples[N];
+    for (int i = 0; i < numVecs; i++){
+        vtype::storeu(samples + vtype::numlanes * i, vecs[i]);
+    }
+    
+    type_t smallest = samples[0];
+    type_t largest = samples[N - 1];
+    type_t median = samples[N / 2];
+    
+    if (smallest == largest){
+        // We have a very unlucky sample, or the array is constant / near constant
+        // Run a special function meant to deal with this situation
+        return get_pivot_near_constant<vtype, type_t>(arr, median, left, right);
+    }else if (median != smallest && median != largest){
+        // We have a normal sample; use it's median
+        return pivot_results<type_t>(median);
+    }else if (median == smallest){
+        // If median == smallest, that implies approximately half the array is equal to smallest, unless we were very unlucky with our sample
+        // Try just doing the next largest value greater than this seemingly very common value to seperate them out
+        return pivot_results<type_t>(next_value<type_t>(median));
+    }else if (median == largest){
+        // If median == largest, that implies approximately half the array is equal to largest, unless we were very unlucky with our sample
+        // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition
+        return pivot_results<type_t>(median);
+    }else{
+        // Should be unreachable
+        return pivot_results<type_t>(median);
+    }
+    
+    // Should be unreachable
+    return pivot_results<type_t>(median);
+}
+
+// Handles the case where we seem to have a near-constant array, since our sample of the array was constant
+template <typename vtype, typename type_t>
+X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_near_constant(type_t *arr,
+                                             type_t commonValue,
+                                             const arrsize_t left,
+                                             const arrsize_t right)
+{
+    using reg_t = typename vtype::reg_t;
+    
+    arrsize_t index = left;
+    
+    type_t value1 = 0;
+    type_t value2 = 0;
+    
+    // First, search for any value not equal to the common value
+    // First vectorized
+    reg_t commonVec = vtype::set1(commonValue);
+    for (; index <= right - vtype::numlanes; index += vtype::numlanes){
+        reg_t data = vtype::loadu(arr + index);
+        if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))){
+            break;
+        }
+    }
+    
+    // Than scalar at the end
+    for (; index <= right; index++){
+        if (arr[index] != commonValue){
+            value1 = arr[index];
+            break;
+        } 
+    }
+    
+    if (index == right + 1){
+        // The array is completely constant
+        // Setting the second flag to true skips partitioning, as the array is constant and thus sorted
+        return pivot_results<type_t>(commonValue, true);
+    }
+    
+    // Secondly, search for a second value not equal to either of the previous two
+    // First vectorized
+    reg_t value1Vec = vtype::set1(value1);
+    for (; index <= right - vtype::numlanes; index += vtype::numlanes){
+        reg_t data = vtype::loadu(arr + index);
+        if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec))) && !vtype::all_false(vtype::knot_opmask(vtype::eq(data, value1Vec)))){
+            break;
+        }
+    }
+    
+    // Then scalar
+    for (; index <= right; index++){
+        if (arr[index] != commonValue && arr[index] != value1){
+            value2 = arr[index];
+            break;
+        } 
+    }
+    
+    if (index == right + 1){
+        // The array contains only 2 values
+        // We must pick the larger one, else the right partition is empty
+        // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the larger value
+        // TODO this logic now assumes we use greater than or equal to specifically when partitioning, might be worth noting that somewhere
+        type_t pivot = std::max(value1, commonValue, comparison_func<vtype>);
+        auto result = pivot_results<type_t>(pivot, false);
+        result.only2Values = true;
+        return result;
+    }
+    
+    // The array has at least 3 distinct values. Use the middle one as the pivot
+    type_t median = std::max(std::min(value1,value2, comparison_func<vtype>), std::min(std::max(value1,value2, comparison_func<vtype>),commonValue, comparison_func<vtype>), comparison_func<vtype>);
+    return pivot_results<type_t>(median);
+}
+
+#endif
\ No newline at end of file

From 0870e1e767ae00539f4b201fd9c83ea78c9ab1c6 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Wed, 31 Jan 2024 09:52:40 -0800
Subject: [PATCH 2/6] Fixed build issues and missing function for AVX512FP16
 vector type

---
 src/avx512fp16-16bit-qsort.hpp | 4 ++++
 src/xss-pivot-selection.hpp    | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp
index 3d72d656..f4b31c9e 100644
--- a/src/avx512fp16-16bit-qsort.hpp
+++ b/src/avx512fp16-16bit-qsort.hpp
@@ -55,6 +55,10 @@ struct zmm_vector<_Float16> {
     {
         return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ);
     }
+    static opmask_t eq(reg_t x, reg_t y)
+    {
+        return _mm512_cmp_ph_mask(x, y, _CMP_EQ_OQ);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         return ((0x1ull << num_to_read) - 0x1ull);
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index 348adb1a..bd793e60 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -37,7 +37,7 @@ type_t next_value(type_t value){
 template <typename vtype, typename mm_t>
 X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
 
-template <typename vtype, typename T = typename vtype::type_t>
+template <typename vtype, typename T>
 X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);
 
 template <typename vtype, typename type_t>

From 88edcf71cbe08c982f9ab4ea76d912d6ed5c96d7 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Wed, 31 Jan 2024 10:57:09 -0800
Subject: [PATCH 3/6] Moved declaration of comparison_func to fix build issues

---
 src/xss-common-includes.h   | 3 +++
 src/xss-common-qsort.h      | 2 +-
 src/xss-pivot-selection.hpp | 3 ---
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h
index 98a3fe15..281bcc01 100644
--- a/src/xss-common-includes.h
+++ b/src/xss-common-includes.h
@@ -106,4 +106,7 @@ struct avx2_half_vector;
 
 enum class simd_type : int { AVX2, AVX512 };
 
+template <typename vtype, typename T = typename vtype::type_t>
+X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);
+
 #endif // XSS_COMMON_INCLUDES
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index 47267b82..f95fab79 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -136,7 +136,7 @@ X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, arrsize_t size)
     return size - count - 1;
 }
 
-template <typename vtype, typename T = typename vtype::type_t>
+template <typename vtype, typename T>
 X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b)
 {
     return a < b;
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index bd793e60..2a1c3bd5 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -37,9 +37,6 @@ type_t next_value(type_t value){
 template <typename vtype, typename mm_t>
 X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
 
-template <typename vtype, typename T>
-X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);
-
 template <typename vtype, typename type_t>
 X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr,
                                       const arrsize_t left,

From af30caa8aa1201c7bf6b8ced397bb59bd6448014 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Mon, 5 Feb 2024 11:34:15 -0800
Subject: [PATCH 4/6] Fixed bug in avx2-64bit logic and cleaned up some special
 case handling

---
 src/avx2-64bit-qsort.hpp    |  6 +++---
 src/xss-common-qsort.h      |  4 ++--
 src/xss-pivot-selection.hpp | 21 ++++++++-------------
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp
index 604c974a..e7f8129e 100644
--- a/src/avx2-64bit-qsort.hpp
+++ b/src/avx2-64bit-qsort.hpp
@@ -70,7 +70,7 @@ struct avx2_vector<int64_t> {
     } // TODO: this should broadcast bits as is?
     static opmask_t knot_opmask(opmask_t x)
     {
-        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF);
+        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF);
         return _mm256_xor_si256(x, allTrue);
     }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
@@ -249,7 +249,7 @@ struct avx2_vector<uint64_t> {
     }
     static opmask_t knot_opmask(opmask_t x)
     {
-        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF);
+        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF);
         return _mm256_xor_si256(x, allTrue);
     }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
@@ -439,7 +439,7 @@ struct avx2_vector<double> {
     }
     static opmask_t knot_opmask(opmask_t x)
     {
-        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF);
+        auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF);
         return _mm256_xor_si256(x, allTrue);
     }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index f95fab79..026bf30c 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -502,7 +502,7 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
     auto pivot_result = get_pivot_smart<vtype, type_t>(arr, left, right);
     type_t pivot = pivot_result.pivot;
     
-    if (pivot_result.alreadySorted){
+    if (pivot_result.result == pivot_result_t::Sorted){
         return;
     }
     
@@ -513,7 +513,7 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
             = partition_avx512_unrolled<vtype, vtype::partition_unroll_factor>(
                     arr, left, right + 1, pivot, &smallest, &biggest);
     
-    if (pivot_result.only2Values){
+    if (pivot_result.result == pivot_result_t::Only2Values){
         return;
     }
 
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index 2a1c3bd5..00beeff4 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -3,20 +3,17 @@
 
 #include "xss-network-qsort.hpp"
 
+enum class pivot_result_t : int { Normal, Sorted, Only2Values };
+
 template <typename type_t>
 struct pivot_results{
-    bool alreadySorted = false;
-    bool only2Values = false;
-    type_t pivot = 0;
     
-    pivot_results(type_t _pivot){
-        pivot = _pivot;
-        alreadySorted = false;
-    }
+    pivot_result_t result = pivot_result_t::Normal;
+    type_t pivot = 0;
     
-    pivot_results(type_t _pivot, bool _alreadySorted){
+    pivot_results(type_t _pivot, pivot_result_t _result = pivot_result_t::Normal){
         pivot = _pivot;
-        alreadySorted = _alreadySorted;
+        result = _result;
     }
 };
 
@@ -197,7 +194,7 @@ X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_near_constant(type_t *arr,
     if (index == right + 1){
         // The array is completely constant
         // Setting the second flag to true skips partitioning, as the array is constant and thus sorted
-        return pivot_results<type_t>(commonValue, true);
+        return pivot_results<type_t>(commonValue, pivot_result_t::Sorted);
     }
     
     // Secondly, search for a second value not equal to either of the previous two
@@ -224,9 +221,7 @@ X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_near_constant(type_t *arr,
         // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the larger value
         // TODO this logic now assumes we use greater than or equal to specifically when partitioning, might be worth noting that somewhere
         type_t pivot = std::max(value1, commonValue, comparison_func<vtype>);
-        auto result = pivot_results<type_t>(pivot, false);
-        result.only2Values = true;
-        return result;
+        return pivot_results<type_t>(pivot, pivot_result_t::Only2Values);
     }
     
     // The array has at least 3 distinct values. Use the middle one as the pivot

From 2b913b813cc972b2445c493106cf269dbb6f498a Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Wed, 21 Feb 2024 13:38:20 -0800
Subject: [PATCH 5/6] Rebase with main and fix formatting

---
 src/xss-common-qsort.h           |  19 ++---
 src/xss-network-keyvaluesort.hpp |  10 +--
 src/xss-network-qsort.hpp        |   5 +-
 src/xss-pivot-selection.hpp      | 133 +++++++++++++++++--------------
 4 files changed, 89 insertions(+), 78 deletions(-)

diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index 026bf30c..ac864ef5 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -87,7 +87,8 @@ X86_SIMD_SORT_INLINE bool array_has_nan(type_t *arr, arrsize_t size)
         else {
             in = vtype::loadu(arr + ii);
         }
-        auto nanmask = vtype::convert_mask_to_int(vtype::template fpclass<0x01 | 0x80>(in));
+        auto nanmask = vtype::convert_mask_to_int(
+                vtype::template fpclass<0x01 | 0x80>(in));
         if (nanmask != 0x00) {
             found_nan = true;
             break;
@@ -498,24 +499,20 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
                 arr + left, (int32_t)(right + 1 - left));
         return;
     }
-    
+
     auto pivot_result = get_pivot_smart<vtype, type_t>(arr, left, right);
     type_t pivot = pivot_result.pivot;
-    
-    if (pivot_result.result == pivot_result_t::Sorted){
-        return;
-    }
-    
+
+    if (pivot_result.result == pivot_result_t::Sorted) { return; }
+
     type_t smallest = vtype::type_max();
     type_t biggest = vtype::type_min();
 
     arrsize_t pivot_index
             = partition_avx512_unrolled<vtype, vtype::partition_unroll_factor>(
                     arr, left, right + 1, pivot, &smallest, &biggest);
-    
-    if (pivot_result.result == pivot_result_t::Only2Values){
-        return;
-    }
+
+    if (pivot_result.result == pivot_result_t::Only2Values) { return; }
 
     if (pivot != smallest)
         qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
diff --git a/src/xss-network-keyvaluesort.hpp b/src/xss-network-keyvaluesort.hpp
index 1cbbc159..a20da171 100644
--- a/src/xss-network-keyvaluesort.hpp
+++ b/src/xss-network-keyvaluesort.hpp
@@ -441,9 +441,8 @@ bitonic_fullmerge_n_vec(typename keyType::reg_t *keys,
 }
 
 template <typename keyType, typename indexType, int numVecs>
-X86_SIMD_SORT_INLINE void argsort_n_vec(typename keyType::type_t *keys,
-                                        arrsize_t *indices,
-                                        int N)
+X86_SIMD_SORT_INLINE void
+argsort_n_vec(typename keyType::type_t *keys, arrsize_t *indices, int N)
 {
     using kreg_t = typename keyType::reg_t;
     using ireg_t = typename indexType::reg_t;
@@ -586,9 +585,8 @@ X86_SIMD_SORT_INLINE void kvsort_n_vec(typename keyType::type_t *keys,
 }
 
 template <typename keyType, typename indexType, int maxN>
-X86_SIMD_SORT_INLINE void argsort_n(typename keyType::type_t *keys,
-                                    arrsize_t *indices,
-                                    int N)
+X86_SIMD_SORT_INLINE void
+argsort_n(typename keyType::type_t *keys, arrsize_t *indices, int N)
 {
     static_assert(keyType::numlanes == indexType::numlanes,
                   "invalid pairing of value/index types");
diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp
index 1c0188d9..d883004a 100644
--- a/src/xss-network-qsort.hpp
+++ b/src/xss-network-qsort.hpp
@@ -144,7 +144,8 @@ X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs)
 }
 
 template <typename vtype, int numVecs, typename reg_t = typename vtype::reg_t>
-X86_SIMD_SORT_FINLINE void sort_vectors(reg_t * vecs){
+X86_SIMD_SORT_FINLINE void sort_vectors(reg_t *vecs)
+{
     /* Run the initial sorting network to sort the columns of the [numVecs x
      * num_lanes] matrix
      */
@@ -188,7 +189,7 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
         vecs[i] = vtype::mask_loadu(
                 vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes);
     }
-    
+
     sort_vectors<vtype, numVecs>(vecs);
 
     // Unmasked part of the store
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index 00beeff4..59dc0489 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -6,26 +6,29 @@
 enum class pivot_result_t : int { Normal, Sorted, Only2Values };
 
 template <typename type_t>
-struct pivot_results{
-    
+struct pivot_results {
+
     pivot_result_t result = pivot_result_t::Normal;
     type_t pivot = 0;
-    
-    pivot_results(type_t _pivot, pivot_result_t _result = pivot_result_t::Normal){
+
+    pivot_results(type_t _pivot,
+                  pivot_result_t _result = pivot_result_t::Normal)
+    {
         pivot = _pivot;
         result = _result;
     }
 };
 
 template <typename type_t>
-type_t next_value(type_t value){
+type_t next_value(type_t value)
+{
     // TODO this probably handles non-native float16 wrong
-    if constexpr (std::is_floating_point<type_t>::value){
+    if constexpr (std::is_floating_point<type_t>::value) {
         return std::nextafter(value, std::numeric_limits<type_t>::infinity());
-    }else{
-        if (value < std::numeric_limits<type_t>::max()){
-            return value + 1;
-        }else{
+    }
+    else {
+        if (value < std::numeric_limits<type_t>::max()) { return value + 1; }
+        else {
             return value;
         }
     }
@@ -96,23 +99,23 @@ X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr,
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_near_constant(type_t *arr,
-                                             type_t commonValue,
-                                             const arrsize_t left,
-                                             const arrsize_t right);
+X86_SIMD_SORT_INLINE pivot_results<type_t>
+get_pivot_near_constant(type_t *arr,
+                        type_t commonValue,
+                        const arrsize_t left,
+                        const arrsize_t right);
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_smart(type_t *arr,
-                                             const arrsize_t left,
-                                             const arrsize_t right)
+X86_SIMD_SORT_INLINE pivot_results<type_t>
+get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right)
 {
     using reg_t = typename vtype::reg_t;
     constexpr int numVecs = 4;
-    
-    if (right - left + 1 <= 4 * numVecs * vtype::numlanes){
-        return pivot_results<type_t>(get_pivot<vtype>(arr, left, right)); 
+
+    if (right - left + 1 <= 4 * numVecs * vtype::numlanes) {
+        return pivot_results<type_t>(get_pivot<vtype>(arr, left, right));
     }
-    
+
     constexpr int N = numVecs * vtype::numlanes;
 
     arrsize_t width = (right - vtype::numlanes) - left;
@@ -122,100 +125,107 @@ X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_smart(type_t *arr,
     for (int i = 0; i < numVecs; i++) {
         vecs[i] = vtype::loadu(arr + left + delta * i);
     }
-    
+
     // Sort the samples
     sort_vectors<vtype, numVecs>(vecs);
-    
+
     type_t samples[N];
-    for (int i = 0; i < numVecs; i++){
+    for (int i = 0; i < numVecs; i++) {
         vtype::storeu(samples + vtype::numlanes * i, vecs[i]);
     }
-    
+
     type_t smallest = samples[0];
     type_t largest = samples[N - 1];
     type_t median = samples[N / 2];
-    
-    if (smallest == largest){
+
+    if (smallest == largest) {
         // We have a very unlucky sample, or the array is constant / near constant
         // Run a special function meant to deal with this situation
         return get_pivot_near_constant<vtype, type_t>(arr, median, left, right);
-    }else if (median != smallest && median != largest){
+    }
+    else if (median != smallest && median != largest) {
         // We have a normal sample; use it's median
         return pivot_results<type_t>(median);
-    }else if (median == smallest){
+    }
+    else if (median == smallest) {
         // If median == smallest, that implies approximately half the array is equal to smallest, unless we were very unlucky with our sample
         // Try just doing the next largest value greater than this seemingly very common value to seperate them out
         return pivot_results<type_t>(next_value<type_t>(median));
-    }else if (median == largest){
+    }
+    else if (median == largest) {
         // If median == largest, that implies approximately half the array is equal to largest, unless we were very unlucky with our sample
         // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition
         return pivot_results<type_t>(median);
-    }else{
+    }
+    else {
         // Should be unreachable
         return pivot_results<type_t>(median);
     }
-    
+
     // Should be unreachable
     return pivot_results<type_t>(median);
 }
 
 // Handles the case where we seem to have a near-constant array, since our sample of the array was constant
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_near_constant(type_t *arr,
-                                             type_t commonValue,
-                                             const arrsize_t left,
-                                             const arrsize_t right)
+X86_SIMD_SORT_INLINE pivot_results<type_t>
+get_pivot_near_constant(type_t *arr,
+                        type_t commonValue,
+                        const arrsize_t left,
+                        const arrsize_t right)
 {
     using reg_t = typename vtype::reg_t;
-    
+
     arrsize_t index = left;
-    
+
     type_t value1 = 0;
     type_t value2 = 0;
-    
+
     // First, search for any value not equal to the common value
     // First vectorized
     reg_t commonVec = vtype::set1(commonValue);
-    for (; index <= right - vtype::numlanes; index += vtype::numlanes){
+    for (; index <= right - vtype::numlanes; index += vtype::numlanes) {
         reg_t data = vtype::loadu(arr + index);
-        if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))){
+        if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))) {
             break;
         }
     }
-    
+
     // Than scalar at the end
-    for (; index <= right; index++){
-        if (arr[index] != commonValue){
+    for (; index <= right; index++) {
+        if (arr[index] != commonValue) {
             value1 = arr[index];
             break;
-        } 
+        }
     }
-    
-    if (index == right + 1){
+
+    if (index == right + 1) {
         // The array is completely constant
         // Setting the second flag to true skips partitioning, as the array is constant and thus sorted
         return pivot_results<type_t>(commonValue, pivot_result_t::Sorted);
     }
-    
+
     // Secondly, search for a second value not equal to either of the previous two
     // First vectorized
     reg_t value1Vec = vtype::set1(value1);
-    for (; index <= right - vtype::numlanes; index += vtype::numlanes){
+    for (; index <= right - vtype::numlanes; index += vtype::numlanes) {
         reg_t data = vtype::loadu(arr + index);
-        if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec))) && !vtype::all_false(vtype::knot_opmask(vtype::eq(data, value1Vec)))){
+        if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))
+            && !vtype::all_false(
+                    vtype::knot_opmask(vtype::eq(data, value1Vec)))) {
             break;
         }
     }
-    
+
     // Then scalar
-    for (; index <= right; index++){
-        if (arr[index] != commonValue && arr[index] != value1){
+    for (; index <= right; index++) {
+        if (arr[index] != commonValue && arr[index] != value1) {
             value2 = arr[index];
             break;
-        } 
+        }
     }
-    
-    if (index == right + 1){
+
+    if (index == right + 1) {
         // The array contains only 2 values
         // We must pick the larger one, else the right partition is empty
         // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the larger value
@@ -223,10 +233,15 @@ X86_SIMD_SORT_INLINE pivot_results<type_t> get_pivot_near_constant(type_t *arr,
         type_t pivot = std::max(value1, commonValue, comparison_func<vtype>);
         return pivot_results<type_t>(pivot, pivot_result_t::Only2Values);
     }
-    
+
     // The array has at least 3 distinct values. Use the middle one as the pivot
-    type_t median = std::max(std::min(value1,value2, comparison_func<vtype>), std::min(std::max(value1,value2, comparison_func<vtype>),commonValue, comparison_func<vtype>), comparison_func<vtype>);
+    type_t median = std::max(
+            std::min(value1, value2, comparison_func<vtype>),
+            std::min(std::max(value1, value2, comparison_func<vtype>),
+                     commonValue,
+                     comparison_func<vtype>),
+            comparison_func<vtype>);
     return pivot_results<type_t>(median);
 }
 
-#endif
\ No newline at end of file
+#endif

From cf7b0e51261e4711bbd34037c8f22b243092c68b Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Mon, 26 Feb 2024 09:09:39 -0800
Subject: [PATCH 6/6] Add tests for random_5d

---
 tests/test-keyvalue.cpp | 1 +
 tests/test-objqsort.cpp | 1 +
 tests/test-qsort.cpp    | 1 +
 3 files changed, 3 insertions(+)

diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp
index c82b033a..fda9130d 100644
--- a/tests/test-keyvalue.cpp
+++ b/tests/test-keyvalue.cpp
@@ -20,6 +20,7 @@ class simdkvsort : public ::testing::Test {
                    "reverse",
                    "smallrange",
                    "max_at_the_end",
+                   "random_5d",
                    "rand_max"};
     }
     std::vector<std::string> arrtype;
diff --git a/tests/test-objqsort.cpp b/tests/test-objqsort.cpp
index 2b1a1860..81aa7c8c 100644
--- a/tests/test-objqsort.cpp
+++ b/tests/test-objqsort.cpp
@@ -32,6 +32,7 @@ class simdobjsort : public ::testing::Test {
                    "reverse",
                    "smallrange",
                    "max_at_the_end",
+                   "random_5d",
                    "rand_max"};
     }
     std::vector<std::string> arrtype;
diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp
index abf871a3..40a9bf98 100644
--- a/tests/test-qsort.cpp
+++ b/tests/test-qsort.cpp
@@ -17,6 +17,7 @@ class simdsort : public ::testing::Test {
                    "reverse",
                    "smallrange",
                    "max_at_the_end",
+                   "random_5d",
                    "rand_max",
                    "rand_with_nan"};
     }