From 0041c05d070b94af018d8818b5dc6d3b53274bdd Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Mon, 29 Jan 2024 15:29:08 -0800 Subject: [PATCH 1/6] New pivot selection to improve performance in many special cases --- src/avx2-32bit-qsort.hpp | 25 ++++- src/avx2-64bit-qsort.hpp | 30 +++++- src/avx512-16bit-qsort.hpp | 21 ++++ src/avx512-32bit-qsort.hpp | 9 ++ src/avx512-64bit-common.h | 9 ++ src/avx512fp16-16bit-qsort.hpp | 3 + src/xss-common-qsort.h | 14 ++- src/xss-network-qsort.hpp | 24 +++-- src/xss-optimal-networks.hpp | 3 + src/xss-pivot-selection.hpp | 177 +++++++++++++++++++++++++++++++++ 10 files changed, 301 insertions(+), 14 deletions(-) diff --git a/src/avx2-32bit-qsort.hpp b/src/avx2-32bit-qsort.hpp index cf0fbd55..667b2075 100644 --- a/src/avx2-32bit-qsort.hpp +++ b/src/avx2-32bit-qsort.hpp @@ -86,6 +86,11 @@ struct avx2_vector { { return _mm256_set1_epi32(type_max()); } // TODO: this should broadcast bits as is? + static opmask_t knot_opmask(opmask_t x) + { + auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1); + return _mm256_xor_si256(x, allOnes); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); @@ -204,6 +209,9 @@ struct avx2_vector { { return v; } + static bool all_false(opmask_t k){ + return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -242,6 +250,11 @@ struct avx2_vector { { return _mm256_set1_epi32(type_max()); } + static opmask_t knot_opmask(opmask_t x) + { + auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1); + return _mm256_xor_si256(x, allOnes); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); @@ -349,6 +362,9 @@ struct avx2_vector { { return v; } + static bool all_false(opmask_t k){ + return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -387,7 +403,11 @@ struct avx2_vector { { return _mm256_set1_ps(type_max()); } - + static opmask_t knot_opmask(opmask_t x) + { + auto allOnes = seti(-1, -1, -1, -1, -1, -1, -1, -1); + return _mm256_xor_si256(x, allOnes); + } static ymmi_t seti(int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8) { @@ -514,6 +534,9 @@ struct avx2_vector { { return _mm256_castps_si256(v); } + static bool all_false(opmask_t k){ + return _mm256_movemask_ps(_mm256_castsi256_ps(k)) == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index e5f53808..604c974a 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -68,12 +68,17 @@ struct avx2_vector { { return _mm256_set1_epi64x(type_max()); } // TODO: this should broadcast bits as is? + static opmask_t knot_opmask(opmask_t x) + { + auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF); + return _mm256_xor_si256(x, allTrue); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_64bit(mask); } - static ymmi_t seti(int v1, int v2, int v3, int v4) + static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } @@ -209,6 +214,9 @@ struct avx2_vector { { return v; } + static bool all_false(opmask_t k){ + return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; + } }; template <> struct avx2_vector { @@ -239,12 +247,17 @@ struct avx2_vector { { return _mm256_set1_epi64x(type_max()); } + static opmask_t knot_opmask(opmask_t x) + { + auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF); + return _mm256_xor_si256(x, allTrue); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); return convert_int_to_avx2_mask_64bit(mask); } - static ymmi_t seti(int v1, int v2, int v3, int v4) + static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } @@ -378,6 +391,9 @@ struct avx2_vector { { return v; } + static bool all_false(opmask_t k){ + return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; + } }; /* @@ -421,6 +437,11 @@ struct avx2_vector { { return _mm256_set1_pd(type_max()); } + static opmask_t knot_opmask(opmask_t x) + { + auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF); + return _mm256_xor_si256(x, allTrue); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { auto mask = ((0x1ull << num_to_read) - 0x1ull); @@ -440,7 +461,7 @@ struct avx2_vector { static_assert(type == (0x01 | 0x80), "should not reach here"); } } - static ymmi_t seti(int v1, int v2, int v3, int v4) + static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4) { return _mm256_set_epi64x(v1, v2, v3, v4); } @@ -571,6 +592,9 @@ struct avx2_vector { { return _mm256_castpd_si256(v); } + static bool all_false(opmask_t k){ + return _mm256_movemask_pd(_mm256_castsi256_pd(k)) == 0; + } }; struct avx2_64bit_swizzle_ops { diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 32d7419c..9d9caf1f 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -81,6 +81,10 @@ struct zmm_vector { exp_eq, mant_x, mant_y, _MM_CMPINT_NLT); return _kxor_mask32(mask_ge, neg); } + static opmask_t eq(reg_t x, reg_t y) + { + return _mm512_cmpeq_epu16_mask(x, y); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); @@ -186,6 +190,9 @@ struct zmm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -238,6 +245,10 @@ struct zmm_vector { { return _mm512_cmp_epi16_mask(x, y, _MM_CMPINT_NLT); } + static opmask_t eq(reg_t x, reg_t y) + { + return _mm512_cmpeq_epi16_mask(x, y); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); @@ -323,6 +334,9 @@ struct zmm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -374,6 +388,10 @@ struct zmm_vector { { return _mm512_cmp_epu16_mask(x, y, _MM_CMPINT_NLT); } + static opmask_t eq(reg_t x, reg_t y) + { + return _mm512_cmpeq_epu16_mask(x, y); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); @@ -457,6 +475,9 @@ struct zmm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index 74615765..3fe9b076 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -198,6 +198,9 @@ struct zmm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -377,6 +380,9 @@ struct zmm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -570,6 +576,9 @@ struct zmm_vector { { return _mm512_castps_si512(v); } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 0c4e3d58..e885d11a 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -732,6 +732,9 @@ struct zmm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -903,6 +906,9 @@ struct zmm_vector { { return v; } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, @@ -1093,6 +1099,9 @@ struct zmm_vector { { return _mm512_castpd_si512(v); } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index f44209fa..3d72d656 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -150,6 +150,9 @@ struct zmm_vector<_Float16> { { return _mm512_castph_si512(v); } + static bool all_false(opmask_t k){ + return k == 0; + } static int double_compressstore(type_t *left_addr, type_t *right_addr, opmask_t k, diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 7b89ba21..47267b82 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -498,14 +498,24 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) arr + left, (int32_t)(right + 1 - left)); return; } - - type_t pivot = get_pivot_blocks(arr, left, right); + + auto pivot_result = get_pivot_smart(arr, left, right); + type_t pivot = pivot_result.pivot; + + if (pivot_result.alreadySorted){ + return; + } + type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); arrsize_t pivot_index = partition_avx512_unrolled( arr, left, right + 1, pivot, &smallest, &biggest); + + if (pivot_result.only2Values){ + return; + } if (pivot != smallest) qsort_(arr, left, pivot_index - 1, max_iters - 1); diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp index 56a1aca1..1c0188d9 100644 --- a/src/xss-network-qsort.hpp +++ b/src/xss-network-qsort.hpp @@ -4,6 +4,9 @@ #include "xss-optimal-networks.hpp" #include "xss-common-qsort.h" +template +X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); + template X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs) { @@ -140,6 +143,17 @@ X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs) } } +template +X86_SIMD_SORT_FINLINE void sort_vectors(reg_t * vecs){ + /* Run the initial sorting network to sort the columns of the [numVecs x + * num_lanes] matrix + */ + bitonic_sort_n_vec(vecs); + + // Merge the vectors using bitonic merging networks + merge_n_vec(vecs); +} + template X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) { @@ -174,14 +188,8 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) vecs[i] = vtype::mask_loadu( vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes); } - - /* Run the initial sorting network to sort the columns of the [numVecs x - * num_lanes] matrix - */ - bitonic_sort_n_vec(vecs); - - // Merge the vectors using bitonic merging networks - merge_n_vec(vecs); + + sort_vectors(vecs); // Unmasked part of the store X86_SIMD_SORT_UNROLL_LOOP(64) diff --git a/src/xss-optimal-networks.hpp b/src/xss-optimal-networks.hpp index 3dfa5281..bffe493d 100644 --- a/src/xss-optimal-networks.hpp +++ b/src/xss-optimal-networks.hpp @@ -1,6 +1,9 @@ // All of these sources files are generated from the optimal networks described in // https://bertdobbelaere.github.io/sorting_networks.html +template +X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); + template X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs) { diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp index 2a28b348..348adb1a 100644 --- a/src/xss-pivot-selection.hpp +++ b/src/xss-pivot-selection.hpp @@ -1,6 +1,45 @@ +#ifndef XSS_PIVOT_SELECTION +#define XSS_PIVOT_SELECTION + +#include "xss-network-qsort.hpp" + +template +struct pivot_results{ + bool alreadySorted = false; + bool only2Values = false; + type_t pivot = 0; + + pivot_results(type_t _pivot){ + pivot = _pivot; + alreadySorted = false; + } + + pivot_results(type_t _pivot, bool _alreadySorted){ + pivot = _pivot; + alreadySorted = _alreadySorted; + } +}; + +template +type_t next_value(type_t value){ + // TODO this probably handles non-native float16 wrong + if constexpr (std::is_floating_point::value){ + return std::nextafter(value, std::numeric_limits::infinity()); + }else{ + if (value < std::numeric_limits::max()){ + return value + 1; + }else{ + return value; + } + } +} + template X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); +template +X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b); + template X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left, @@ -61,3 +100,141 @@ X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, vtype::storeu(data, vec); return data[vtype::numlanes / 2]; } + +template +X86_SIMD_SORT_INLINE pivot_results get_pivot_near_constant(type_t *arr, + type_t commonValue, + const arrsize_t left, + const arrsize_t right); + +template +X86_SIMD_SORT_INLINE pivot_results get_pivot_smart(type_t *arr, + const arrsize_t left, + const arrsize_t right) +{ + using reg_t = typename vtype::reg_t; + constexpr int numVecs = 4; + + if (right - left + 1 <= 4 * numVecs * vtype::numlanes){ + return pivot_results(get_pivot(arr, left, right)); + } + + constexpr int N = numVecs * vtype::numlanes; + + arrsize_t width = (right - vtype::numlanes) - left; + arrsize_t delta = width / numVecs; + + reg_t vecs[numVecs]; + for (int i = 0; i < numVecs; i++) { + vecs[i] = vtype::loadu(arr + left + delta * i); + } + + // Sort the samples + sort_vectors(vecs); + + type_t samples[N]; + for (int i = 0; i < numVecs; i++){ + vtype::storeu(samples + vtype::numlanes * i, vecs[i]); + } + + type_t smallest = samples[0]; + type_t largest = samples[N - 1]; + type_t median = samples[N / 2]; + + if (smallest == largest){ + // We have a very unlucky sample, or the array is constant / near constant + // Run a special function meant to deal with this situation + return get_pivot_near_constant(arr, median, left, right); + }else if (median != smallest && median != largest){ + // We have a normal sample; use it's median + return pivot_results(median); + }else if (median == smallest){ + // If median == smallest, that implies approximately half the array is equal to smallest, unless we were very unlucky with our sample + // Try just doing the next largest value greater than this seemingly very common value to seperate them out + return pivot_results(next_value(median)); + }else if (median == largest){ + // If median == largest, that implies approximately half the array is equal to largest, unless we were very unlucky with our sample + // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition + return pivot_results(median); + }else{ + // Should be unreachable + return pivot_results(median); + } + + // Should be unreachable + return pivot_results(median); +} + +// Handles the case where we seem to have a near-constant array, since our sample of the array was constant +template +X86_SIMD_SORT_INLINE pivot_results get_pivot_near_constant(type_t *arr, + type_t commonValue, + const arrsize_t left, + const arrsize_t right) +{ + using reg_t = typename vtype::reg_t; + + arrsize_t index = left; + + type_t value1 = 0; + type_t value2 = 0; + + // First, search for any value not equal to the common value + // First vectorized + reg_t commonVec = vtype::set1(commonValue); + for (; index <= right - vtype::numlanes; index += vtype::numlanes){ + reg_t data = vtype::loadu(arr + index); + if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))){ + break; + } + } + + // Than scalar at the end + for (; index <= right; index++){ + if (arr[index] != commonValue){ + value1 = arr[index]; + break; + } + } + + if (index == right + 1){ + // The array is completely constant + // Setting the second flag to true skips partitioning, as the array is constant and thus sorted + return pivot_results(commonValue, true); + } + + // Secondly, search for a second value not equal to either of the previous two + // First vectorized + reg_t value1Vec = vtype::set1(value1); + for (; index <= right - vtype::numlanes; index += vtype::numlanes){ + reg_t data = vtype::loadu(arr + index); + if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec))) && !vtype::all_false(vtype::knot_opmask(vtype::eq(data, value1Vec)))){ + break; + } + } + + // Then scalar + for (; index <= right; index++){ + if (arr[index] != commonValue && arr[index] != value1){ + value2 = arr[index]; + break; + } + } + + if (index == right + 1){ + // The array contains only 2 values + // We must pick the larger one, else the right partition is empty + // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the larger value + // TODO this logic now assumes we use greater than or equal to specifically when partitioning, might be worth noting that somewhere + type_t pivot = std::max(value1, commonValue, comparison_func); + auto result = pivot_results(pivot, false); + result.only2Values = true; + return result; + } + + // The array has at least 3 distinct values. Use the middle one as the pivot + type_t median = std::max(std::min(value1,value2, comparison_func), std::min(std::max(value1,value2, comparison_func),commonValue, comparison_func), comparison_func); + return pivot_results(median); +} + +#endif \ No newline at end of file From 0870e1e767ae00539f4b201fd9c83ea78c9ab1c6 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Wed, 31 Jan 2024 09:52:40 -0800 Subject: [PATCH 2/6] Fixed build issues and missing function for AVX512FP16 vector type --- src/avx512fp16-16bit-qsort.hpp | 4 ++++ src/xss-pivot-selection.hpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 3d72d656..f4b31c9e 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -55,6 +55,10 @@ struct zmm_vector<_Float16> { { return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ); } + static opmask_t eq(reg_t x, reg_t y) + { + return _mm512_cmp_ph_mask(x, y, _CMP_EQ_OQ); + } static opmask_t get_partial_loadmask(uint64_t num_to_read) { return ((0x1ull << num_to_read) - 0x1ull); diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp index 348adb1a..bd793e60 100644 --- a/src/xss-pivot-selection.hpp +++ b/src/xss-pivot-selection.hpp @@ -37,7 +37,7 @@ type_t next_value(type_t value){ template X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); -template +template X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b); template From 88edcf71cbe08c982f9ab4ea76d912d6ed5c96d7 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Wed, 31 Jan 2024 10:57:09 -0800 Subject: [PATCH 3/6] Moved declaration of comparison_func to fix build issues --- src/xss-common-includes.h | 3 +++ src/xss-common-qsort.h | 2 +- src/xss-pivot-selection.hpp | 3 --- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 98a3fe15..281bcc01 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -106,4 +106,7 @@ struct avx2_half_vector; enum class simd_type : int { AVX2, AVX512 }; +template +X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b); + #endif // XSS_COMMON_INCLUDES diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 47267b82..f95fab79 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -136,7 +136,7 @@ X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, arrsize_t size) return size - count - 1; } -template +template X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b) { return a < b; diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp index bd793e60..2a1c3bd5 100644 --- a/src/xss-pivot-selection.hpp +++ b/src/xss-pivot-selection.hpp @@ -37,9 +37,6 @@ type_t next_value(type_t value){ template X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); -template -X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b); - template X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, const arrsize_t left, From af30caa8aa1201c7bf6b8ced397bb59bd6448014 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Mon, 5 Feb 2024 11:34:15 -0800 Subject: [PATCH 4/6] Fixed bug in avx2-64bit logic and cleaned up some special case handling --- src/avx2-64bit-qsort.hpp | 6 +++--- src/xss-common-qsort.h | 4 ++-- src/xss-pivot-selection.hpp | 21 ++++++++------------- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index 604c974a..e7f8129e 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -70,7 +70,7 @@ struct avx2_vector { } // TODO: this should broadcast bits as is? static opmask_t knot_opmask(opmask_t x) { - auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF); + auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF); return _mm256_xor_si256(x, allTrue); } static opmask_t get_partial_loadmask(uint64_t num_to_read) @@ -249,7 +249,7 @@ struct avx2_vector { } static opmask_t knot_opmask(opmask_t x) { - auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF); + auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF); return _mm256_xor_si256(x, allTrue); } static opmask_t get_partial_loadmask(uint64_t num_to_read) @@ -439,7 +439,7 @@ struct avx2_vector { } static opmask_t knot_opmask(opmask_t x) { - auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF); + auto allTrue = _mm256_set1_epi64x(0xFFFF'FFFF'FFFF'FFFF); return _mm256_xor_si256(x, allTrue); } static opmask_t get_partial_loadmask(uint64_t num_to_read) diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index f95fab79..026bf30c 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -502,7 +502,7 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) auto pivot_result = get_pivot_smart(arr, left, right); type_t pivot = pivot_result.pivot; - if (pivot_result.alreadySorted){ + if (pivot_result.result == pivot_result_t::Sorted){ return; } @@ -513,7 +513,7 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) = partition_avx512_unrolled( arr, left, right + 1, pivot, &smallest, &biggest); - if (pivot_result.only2Values){ + if (pivot_result.result == pivot_result_t::Only2Values){ return; } diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp index 2a1c3bd5..00beeff4 100644 --- a/src/xss-pivot-selection.hpp +++ b/src/xss-pivot-selection.hpp @@ -3,20 +3,17 @@ #include "xss-network-qsort.hpp" +enum class pivot_result_t : int { Normal, Sorted, Only2Values }; + template struct pivot_results{ - bool alreadySorted = false; - bool only2Values = false; - type_t pivot = 0; - pivot_results(type_t _pivot){ - pivot = _pivot; - alreadySorted = false; - } + pivot_result_t result = pivot_result_t::Normal; + type_t pivot = 0; - pivot_results(type_t _pivot, bool _alreadySorted){ + pivot_results(type_t _pivot, pivot_result_t _result = pivot_result_t::Normal){ pivot = _pivot; - alreadySorted = _alreadySorted; + result = _result; } }; @@ -197,7 +194,7 @@ X86_SIMD_SORT_INLINE pivot_results get_pivot_near_constant(type_t *arr, if (index == right + 1){ // The array is completely constant // Setting the second flag to true skips partitioning, as the array is constant and thus sorted - return pivot_results(commonValue, true); + return pivot_results(commonValue, pivot_result_t::Sorted); } // Secondly, search for a second value not equal to either of the previous two @@ -224,9 +221,7 @@ X86_SIMD_SORT_INLINE pivot_results get_pivot_near_constant(type_t *arr, // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the larger value // TODO this logic now assumes we use greater than or equal to specifically when partitioning, might be worth noting that somewhere type_t pivot = std::max(value1, commonValue, comparison_func); - auto result = pivot_results(pivot, false); - result.only2Values = true; - return result; + return pivot_results(pivot, pivot_result_t::Only2Values); } // The array has at least 3 distinct values. Use the middle one as the pivot From 2b913b813cc972b2445c493106cf269dbb6f498a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 21 Feb 2024 13:38:20 -0800 Subject: [PATCH 5/6] Rebase with main and fix formatting --- src/xss-common-qsort.h | 19 ++--- src/xss-network-keyvaluesort.hpp | 10 +-- src/xss-network-qsort.hpp | 5 +- src/xss-pivot-selection.hpp | 133 +++++++++++++++++-------------- 4 files changed, 89 insertions(+), 78 deletions(-) diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 026bf30c..ac864ef5 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -87,7 +87,8 @@ X86_SIMD_SORT_INLINE bool array_has_nan(type_t *arr, arrsize_t size) else { in = vtype::loadu(arr + ii); } - auto nanmask = vtype::convert_mask_to_int(vtype::template fpclass<0x01 | 0x80>(in)); + auto nanmask = vtype::convert_mask_to_int( + vtype::template fpclass<0x01 | 0x80>(in)); if (nanmask != 0x00) { found_nan = true; break; @@ -498,24 +499,20 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) arr + left, (int32_t)(right + 1 - left)); return; } - + auto pivot_result = get_pivot_smart(arr, left, right); type_t pivot = pivot_result.pivot; - - if (pivot_result.result == pivot_result_t::Sorted){ - return; - } - + + if (pivot_result.result == pivot_result_t::Sorted) { return; } + type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); arrsize_t pivot_index = partition_avx512_unrolled( arr, left, right + 1, pivot, &smallest, &biggest); - - if (pivot_result.result == pivot_result_t::Only2Values){ - return; - } + + if (pivot_result.result == pivot_result_t::Only2Values) { return; } if (pivot != smallest) qsort_(arr, left, pivot_index - 1, max_iters - 1); diff --git a/src/xss-network-keyvaluesort.hpp b/src/xss-network-keyvaluesort.hpp index 1cbbc159..a20da171 100644 --- a/src/xss-network-keyvaluesort.hpp +++ b/src/xss-network-keyvaluesort.hpp @@ -441,9 +441,8 @@ bitonic_fullmerge_n_vec(typename keyType::reg_t *keys, } template -X86_SIMD_SORT_INLINE void argsort_n_vec(typename keyType::type_t *keys, - arrsize_t *indices, - int N) +X86_SIMD_SORT_INLINE void +argsort_n_vec(typename keyType::type_t *keys, arrsize_t *indices, int N) { using kreg_t = typename keyType::reg_t; using ireg_t = typename indexType::reg_t; @@ -586,9 +585,8 @@ X86_SIMD_SORT_INLINE void kvsort_n_vec(typename keyType::type_t *keys, } template -X86_SIMD_SORT_INLINE void argsort_n(typename keyType::type_t *keys, - arrsize_t *indices, - int N) +X86_SIMD_SORT_INLINE void +argsort_n(typename keyType::type_t *keys, arrsize_t *indices, int N) { static_assert(keyType::numlanes == indexType::numlanes, "invalid pairing of value/index types"); diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp index 1c0188d9..d883004a 100644 --- a/src/xss-network-qsort.hpp +++ b/src/xss-network-qsort.hpp @@ -144,7 +144,8 @@ X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs) } template -X86_SIMD_SORT_FINLINE void sort_vectors(reg_t * vecs){ +X86_SIMD_SORT_FINLINE void sort_vectors(reg_t *vecs) +{ /* Run the initial sorting network to sort the columns of the [numVecs x * num_lanes] matrix */ @@ -188,7 +189,7 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) vecs[i] = vtype::mask_loadu( vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes); } - + sort_vectors(vecs); // Unmasked part of the store diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp index 00beeff4..59dc0489 100644 --- a/src/xss-pivot-selection.hpp +++ b/src/xss-pivot-selection.hpp @@ -6,26 +6,29 @@ enum class pivot_result_t : int { Normal, Sorted, Only2Values }; template -struct pivot_results{ - +struct pivot_results { + pivot_result_t result = pivot_result_t::Normal; type_t pivot = 0; - - pivot_results(type_t _pivot, pivot_result_t _result = pivot_result_t::Normal){ + + pivot_results(type_t _pivot, + pivot_result_t _result = pivot_result_t::Normal) + { pivot = _pivot; result = _result; } }; template -type_t next_value(type_t value){ +type_t next_value(type_t value) +{ // TODO this probably handles non-native float16 wrong - if constexpr (std::is_floating_point::value){ + if constexpr (std::is_floating_point::value) { return std::nextafter(value, std::numeric_limits::infinity()); - }else{ - if (value < std::numeric_limits::max()){ - return value + 1; - }else{ + } + else { + if (value < std::numeric_limits::max()) { return value + 1; } + else { return value; } } @@ -96,23 +99,23 @@ X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, } template -X86_SIMD_SORT_INLINE pivot_results get_pivot_near_constant(type_t *arr, - type_t commonValue, - const arrsize_t left, - const arrsize_t right); +X86_SIMD_SORT_INLINE pivot_results +get_pivot_near_constant(type_t *arr, + type_t commonValue, + const arrsize_t left, + const arrsize_t right); template -X86_SIMD_SORT_INLINE pivot_results get_pivot_smart(type_t *arr, - const arrsize_t left, - const arrsize_t right) +X86_SIMD_SORT_INLINE pivot_results +get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right) { using reg_t = typename vtype::reg_t; constexpr int numVecs = 4; - - if (right - left + 1 <= 4 * numVecs * vtype::numlanes){ - return pivot_results(get_pivot(arr, left, right)); + + if (right - left + 1 <= 4 * numVecs * vtype::numlanes) { + return pivot_results(get_pivot(arr, left, right)); } - + constexpr int N = numVecs * vtype::numlanes; arrsize_t width = (right - vtype::numlanes) - left; @@ -122,100 +125,107 @@ X86_SIMD_SORT_INLINE pivot_results get_pivot_smart(type_t *arr, for (int i = 0; i < numVecs; i++) { vecs[i] = vtype::loadu(arr + left + delta * i); } - + // Sort the samples sort_vectors(vecs); - + type_t samples[N]; - for (int i = 0; i < numVecs; i++){ + for (int i = 0; i < numVecs; i++) { vtype::storeu(samples + vtype::numlanes * i, vecs[i]); } - + type_t smallest = samples[0]; type_t largest = samples[N - 1]; type_t median = samples[N / 2]; - - if (smallest == largest){ + + if (smallest == largest) { // We have a very unlucky sample, or the array is constant / near constant // Run a special function meant to deal with this situation return get_pivot_near_constant(arr, median, left, right); - }else if (median != smallest && median != largest){ + } + else if (median != smallest && median != largest) { // We have a normal sample; use it's median return pivot_results(median); - }else if (median == smallest){ + } + else if (median == smallest) { // If median == smallest, that implies approximately half the array is equal to smallest, unless we were very unlucky with our sample // Try just doing the next largest value greater than this seemingly very common value to seperate them out return pivot_results(next_value(median)); - }else if (median == largest){ + } + else if (median == largest) { // If median == largest, that implies approximately half the array is equal to largest, unless we were very unlucky with our sample // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition return pivot_results(median); - }else{ + } + else { // Should be unreachable return pivot_results(median); } - + // Should be unreachable return pivot_results(median); } // Handles the case where we seem to have a near-constant array, since our sample of the array was constant template -X86_SIMD_SORT_INLINE pivot_results get_pivot_near_constant(type_t *arr, - type_t commonValue, - const arrsize_t left, - const arrsize_t right) +X86_SIMD_SORT_INLINE pivot_results +get_pivot_near_constant(type_t *arr, + type_t commonValue, + const arrsize_t left, + const arrsize_t right) { using reg_t = typename vtype::reg_t; - + arrsize_t index = left; - + type_t value1 = 0; type_t value2 = 0; - + // First, search for any value not equal to the common value // First vectorized reg_t commonVec = vtype::set1(commonValue); - for (; index <= right - vtype::numlanes; index += vtype::numlanes){ + for (; index <= right - vtype::numlanes; index += vtype::numlanes) { reg_t data = vtype::loadu(arr + index); - if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))){ + if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))) { break; } } - + // Than scalar at the end - for (; index <= right; index++){ - if (arr[index] != commonValue){ + for (; index <= right; index++) { + if (arr[index] != commonValue) { value1 = arr[index]; break; - } + } } - - if (index == right + 1){ + + if (index == right + 1) { // The array is completely constant // Setting the second flag to true skips partitioning, as the array is constant and thus sorted return pivot_results(commonValue, pivot_result_t::Sorted); } - + // Secondly, search for a second value not equal to either of the previous two // First vectorized reg_t value1Vec = vtype::set1(value1); - for (; index <= right - vtype::numlanes; index += vtype::numlanes){ + for (; index <= right - vtype::numlanes; index += vtype::numlanes) { reg_t data = vtype::loadu(arr + index); - if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec))) && !vtype::all_false(vtype::knot_opmask(vtype::eq(data, value1Vec)))){ + if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec))) + && !vtype::all_false( + vtype::knot_opmask(vtype::eq(data, value1Vec)))) { break; } } - + // Then scalar - for (; index <= right; index++){ - if (arr[index] != commonValue && arr[index] != value1){ + for (; index <= right; index++) { + if (arr[index] != commonValue && arr[index] != value1) { value2 = arr[index]; break; - } + } } - - if (index == right + 1){ + + if (index == right + 1) { // The array contains only 2 values // We must pick the larger one, else the right partition is empty // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the larger value @@ -223,10 +233,15 @@ X86_SIMD_SORT_INLINE pivot_results get_pivot_near_constant(type_t *arr, type_t pivot = std::max(value1, commonValue, comparison_func); return pivot_results(pivot, pivot_result_t::Only2Values); } - + // The array has at least 3 distinct values. Use the middle one as the pivot - type_t median = std::max(std::min(value1,value2, comparison_func), std::min(std::max(value1,value2, comparison_func),commonValue, comparison_func), comparison_func); + type_t median = std::max( + std::min(value1, value2, comparison_func), + std::min(std::max(value1, value2, comparison_func), + commonValue, + comparison_func), + comparison_func); return pivot_results(median); } -#endif \ No newline at end of file +#endif From cf7b0e51261e4711bbd34037c8f22b243092c68b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 26 Feb 2024 09:09:39 -0800 Subject: [PATCH 6/6] Add tests for random_5d --- tests/test-keyvalue.cpp | 1 + tests/test-objqsort.cpp | 1 + tests/test-qsort.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp index c82b033a..fda9130d 100644 --- a/tests/test-keyvalue.cpp +++ b/tests/test-keyvalue.cpp @@ -20,6 +20,7 @@ class simdkvsort : public ::testing::Test { "reverse", "smallrange", "max_at_the_end", + "random_5d", "rand_max"}; } std::vector arrtype; diff --git a/tests/test-objqsort.cpp b/tests/test-objqsort.cpp index 2b1a1860..81aa7c8c 100644 --- a/tests/test-objqsort.cpp +++ b/tests/test-objqsort.cpp @@ -32,6 +32,7 @@ class simdobjsort : public ::testing::Test { "reverse", "smallrange", "max_at_the_end", + "random_5d", "rand_max"}; } std::vector arrtype; diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index abf871a3..40a9bf98 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -17,6 +17,7 @@ class simdsort : public ::testing::Test { "reverse", "smallrange", "max_at_the_end", + "random_5d", "rand_max", "rand_with_nan"}; }