diff --git a/meson.build b/meson.build index 873094ba..1e2a913a 100644 --- a/meson.build +++ b/meson.build @@ -1,5 +1,5 @@ project('x86-simd-sort', 'cpp', - version : '4.0.0', + version : '5.0.0', license : 'BSD 3-clause', default_options : ['cpp_std=c++17']) fs = import('fs') diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 1cd4ca1c..cbd38a32 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -25,6 +25,7 @@ template X86_SIMD_SORT_INLINE reg_t sort_zmm_64bit(reg_t zmm); struct avx512_64bit_swizzle_ops; +struct avx512_ymm_64bit_swizzle_ops; /* Forward declaration so the ymm_vector specializations below can name this type in their swizzle_ops alias. */ template <> struct ymm_vector { @@ -34,6 +35,7 @@ struct ymm_vector { using opmask_t = __mmask8; static const uint8_t numlanes = 8; static constexpr simd_type vec_type = simd_type::AVX512; + using swizzle_ops = avx512_ymm_64bit_swizzle_ops; /* Swizzle helper type exposed by this vector type. */ static type_t type_max() { @@ -208,6 +210,10 @@ struct ymm_vector { { return _mm256_castps_si256(v); } + static bool all_false(opmask_t k) /* Returns true when no bit of the mask k is set. */ + { + return k == 0; + } static reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); @@ -222,6 +228,7 @@ struct ymm_vector { using opmask_t = __mmask8; static const uint8_t numlanes = 8; static constexpr simd_type vec_type = simd_type::AVX512; + using swizzle_ops = avx512_ymm_64bit_swizzle_ops; /* Swizzle helper type exposed by this vector type. */ static type_t type_max() { @@ -382,6 +389,10 @@ struct ymm_vector { { return v; } + static bool all_false(opmask_t k) /* Returns true when no bit of the mask k is set. */ + { + return k == 0; + } static reg_t reverse(reg_t ymm) { const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2); @@ -396,6 +407,7 @@ struct ymm_vector { using opmask_t = __mmask8; static const uint8_t numlanes = 8; static constexpr simd_type vec_type = simd_type::AVX512; + using swizzle_ops = avx512_ymm_64bit_swizzle_ops; /* Swizzle helper type exposed by this vector type. */ static type_t type_max() { @@ -556,6 +568,10 @@ struct ymm_vector { { return v; } + static bool all_false(opmask_t k) /* Returns true when no bit of the mask k is set. */ + { + return k == 0; + } static reg_t reverse(reg_t ymm) { const __m256i rev_index = 
_mm256_set_epi32(NETWORK_32BIT_AVX2_2); @@ -1204,4 +1220,77 @@ struct avx512_64bit_swizzle_ops { } }; +struct avx512_ymm_64bit_swizzle_ops { /* Swizzle helpers (swap_n, reverse_n, merge_n) that treat a 256-bit register as eight 32-bit elements. Supported scale values are 2, 4 and 8; any other value trips the static_assert in the final else branch. NOTE(review): the template parameter lists (vtype, scale) and the explicit template arguments on the swap_n call look stripped in this copy of the patch - confirm against the real file. */ + template + X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) /* Exchanges the two halves of every group of scale adjacent 32-bit elements and returns the result converted back with vtype::cast_from. */ + { + __m256i v = vtype::cast_to(reg); + + if constexpr (scale == 2) { + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, 0b10110001); /* element order 1,0,3,2 within each 128-bit lane */ + v = _mm256_castps_si256(vf); + } + else if constexpr (scale == 4) { + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, 0b01001110); /* element order 2,3,0,1 within each 128-bit lane */ + v = _mm256_castps_si256(vf); + } + else if constexpr (scale == 8) { + v = _mm256_permute2x128_si256(v, v, 0b00000001); /* swap the low and high 128-bit halves */ + } + else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v); + } + + template + X86_SIMD_SORT_INLINE typename vtype::reg_t + reverse_n(typename vtype::reg_t reg) /* Reverses the element order inside every group of scale adjacent 32-bit elements. Scale 2 reuses swap_n; scale 8 delegates to vtype::reverse (presumably a full eight-element reversal - confirm). */ + { + __m256i v = vtype::cast_to(reg); + + if constexpr (scale == 2) { return swap_n(reg); } + else if constexpr (scale == 4) { + constexpr uint64_t mask = 0b00011011; /* element order 3,2,1,0 within each 128-bit lane */ + __m256 vf = _mm256_castsi256_ps(v); + vf = _mm256_permute_ps(vf, mask); + v = _mm256_castps_si256(vf); + } + else if constexpr (scale == 8) { + return vtype::reverse(reg); + } + else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v); + } + + template + X86_SIMD_SORT_INLINE typename vtype::reg_t + merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) /* Blends reg and other: a set bit i in the blend mask takes 32-bit element i from other, a clear bit keeps it from reg. Within every group of scale elements the lower half therefore comes from other and the upper half from reg. */ + { + __m256i v1 = vtype::cast_to(reg); + __m256i v2 = vtype::cast_to(other); + + if constexpr (scale == 2) { + v1 = _mm256_blend_epi32(v1, v2, 0b01010101); /* elements 0,2,4,6 from other */ + } + else if constexpr (scale == 4) { + v1 = _mm256_blend_epi32(v1, v2, 0b00110011); /* elements 0,1,4,5 from other */ + } + else if constexpr (scale == 8) { + v1 = _mm256_blend_epi32(v1, v2, 0b00001111); /* elements 0 to 3 from other */ + } + else { + static_assert(scale == -1, "should not be reached"); + } + + return vtype::cast_from(v1); + } +}; + #endif diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 9acdbd71..48b28ad7 100644 
--- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -388,7 +388,10 @@ X86_SIMD_SORT_INLINE void qsort_64bit_(type1_t *keys, return; } - type1_t pivot = get_pivot_blocks(keys, left, right); + type1_t pivot; + auto pivot_result = get_pivot_smart(keys, left, right); + pivot = pivot_result.pivot; /* The pivot now comes from get_pivot_smart instead of get_pivot_blocks; only the pivot field of its result is used here. */ + type1_t smallest = vtype1::type_max(); type1_t biggest = vtype1::type_min(); arrsize_t pivot_index = partition_avx512_unrolled( diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp index 59dc0489..13fed026 100644 --- a/src/xss-pivot-selection.hpp +++ b/src/xss-pivot-selection.hpp @@ -157,12 +157,7 @@ get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right) // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition return pivot_results(median); } - else { - // Should be unreachable - return pivot_results(median); - } - // Should be unreachable return pivot_results(median); } diff --git a/utils/rand_array.h b/utils/rand_array.h index a9703551..dc20dbb9 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -137,7 +137,7 @@ static std::vector get_array(std::string arrtype, val = std::numeric_limits::max(); } for (size_t ii = 1; ii <= arrsize; ++ii) { - if (rand() % 0x1) { arr[ii] = val; } + if (rand() & 0x1) { arr[ii] = val; } /* Tests the low bit of rand(). The replaced form rand() % 0x1 is always 0, so val was never stored before this change. NOTE(review): the enclosing loop runs ii from 1 through arrsize inclusive - if arr holds exactly arrsize elements, arr[arrsize] is now written out of bounds and arr[0] is never replaced; confirm the size of arr and the intended range. */ } } else {