diff --git a/README.md b/README.md index 0b33eda8..8a6015c0 100644 --- a/README.md +++ b/README.md @@ -8,15 +8,33 @@ AVX2 specific implementations, please see [README](https://github.com/intel/x86-simd-sort/blob/main/src/README.md) file under `src/` directory. The following routines are currently supported: + +### Sort routines on arrays ```cpp x86simdsort::qsort(T* arr, size_t size, bool hasnan); x86simdsort::qselect(T* arr, size_t k, size_t size, bool hasnan); x86simdsort::partial_qsort(T* arr, size_t k, size_t size, bool hasnan); +``` +Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, +int32_t, double, uint64_t, int64_t]` + +### Key-value sort routines on pairs of arrays +```cpp +x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan); +``` +Supported datatypes: `T1`, `T2` $\in$ `[float, uint32_t, int32_t, double, +uint64_t, int64_t]`. Note that keyvalue sort is not yet supported for 16-bit +data types. + +### Arg sort routines on arrays +```cpp std::vector<size_t> arg = x86simdsort::argsort(T* arr, size_t size, bool hasnan); std::vector<size_t> arg = x86simdsort::argselect(T* arr, size_t k, size_t size, bool hasnan); ``` +Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, +int32_t, double, uint64_t, int64_t]` -### Build/Install +## Build/Install [meson](https://github.com/mesonbuild/meson) is the build system used. Command to build and install the library: @@ -35,7 +53,7 @@ benchmark](https://github.com/google/benchmark) frameworks respectively. You can configure meson to build them both by using `-Dbuild_tests=true` and `-Dbuild_benchmarks=true`. -### Example usage +## Example usage ```cpp #include "x86simdsort.h" @@ -48,7 +66,7 @@ int main() { ``` -### Details +## Details - `x86simdsort::qsort` is equivalent to `qsort` in [C](https://www.tutorialspoint.com/c_standard_library/c_function_qsort.htm) @@ -77,7 +95,7 @@ argselect) will not use the SIMD based algorithms if they detect NAN's in the array.
You can read details of all the implementations [here](https://github.com/intel/x86-simd-sort/blob/main/src/README.md). -### Downstream projects using x86-simd-sort +## Downstream projects using x86-simd-sort - NumPy uses this as a [submodule](https://github.com/numpy/numpy/pull/22315) to accelerate `np.sort, np.argsort, np.partition and np.argpartition`. - A slightly modified version of this library has been integrated into [openJDK](https://github.com/openjdk/jdk/pull/14227). diff --git a/benchmarks/bench-keyvalue.hpp b/benchmarks/bench-keyvalue.hpp index 101a8fae..1eaab9e9 100644 --- a/benchmarks/bench-keyvalue.hpp +++ b/benchmarks/bench-keyvalue.hpp @@ -46,3 +46,6 @@ static void simdkvsort(benchmark::State &state, Args &&...args) BENCH_BOTH_KVSORT(uint64_t) BENCH_BOTH_KVSORT(int64_t) BENCH_BOTH_KVSORT(double) +BENCH_BOTH_KVSORT(uint32_t) +BENCH_BOTH_KVSORT(int32_t) +BENCH_BOTH_KVSORT(float) diff --git a/examples/avx512-kv.cpp b/examples/avx512-kv.cpp index 26fc910d..c789b7c8 100644 --- a/examples/avx512-kv.cpp +++ b/examples/avx512-kv.cpp @@ -5,6 +5,7 @@ int main() { int64_t arr1[size]; uint64_t arr2[size]; double arr3[size]; + float arr4[size]; avx512_qsort_kv(arr1, arr1, size); avx512_qsort_kv(arr1, arr2, size); avx512_qsort_kv(arr1, arr3, size); @@ -13,6 +14,9 @@ int main() { avx512_qsort_kv(arr2, arr3, size); avx512_qsort_kv(arr3, arr1, size); avx512_qsort_kv(arr3, arr2, size); - avx512_qsort_kv(arr3, arr3, size); + avx512_qsort_kv(arr1, arr4, size); + avx512_qsort_kv(arr2, arr4, size); + avx512_qsort_kv(arr3, arr4, size); + return 0; return 0; } diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp index 1dabfb71..02faa90f 100644 --- a/lib/x86simdsort-skx.cpp +++ b/lib/x86simdsort-skx.cpp @@ -33,9 +33,34 @@ return avx512_argselect(arr, k, arrsize, hasnan); \ } -#define DEFINE_KEYVALUE_METHODS(type1, type2) \ +#define DEFINE_KEYVALUE_METHODS(type) \ template <> \ - void keyvalue_qsort(type1 *key, type2* val, size_t arrsize, bool hasnan) \ + void 
keyvalue_qsort(type *key, uint64_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, int64_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, double* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, uint32_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, int32_t* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + template <> \ + void keyvalue_qsort(type *key, float* val, size_t arrsize, bool hasnan) \ { \ avx512_qsort_kv(key, val, arrsize, hasnan); \ } \ @@ -49,14 +74,11 @@ namespace avx512 { DEFINE_ALL_METHODS(uint64_t) DEFINE_ALL_METHODS(int64_t) DEFINE_ALL_METHODS(double) - DEFINE_KEYVALUE_METHODS(double, uint64_t) - DEFINE_KEYVALUE_METHODS(double, int64_t) - DEFINE_KEYVALUE_METHODS(double, double) - DEFINE_KEYVALUE_METHODS(uint64_t, uint64_t) - DEFINE_KEYVALUE_METHODS(uint64_t, int64_t) - DEFINE_KEYVALUE_METHODS(uint64_t, double) - DEFINE_KEYVALUE_METHODS(int64_t, uint64_t) - DEFINE_KEYVALUE_METHODS(int64_t, int64_t) - DEFINE_KEYVALUE_METHODS(int64_t, double) + DEFINE_KEYVALUE_METHODS(uint64_t) + DEFINE_KEYVALUE_METHODS(int64_t) + DEFINE_KEYVALUE_METHODS(double) + DEFINE_KEYVALUE_METHODS(uint32_t) + DEFINE_KEYVALUE_METHODS(int32_t) + DEFINE_KEYVALUE_METHODS(float) } // namespace avx512 } // namespace xss diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 86caeb0e..8ebbc6be 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -196,14 +196,19 @@ DISPATCH_ALL(argselect, (ISA_LIST("avx512_skx")), (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(uint64_t, int64_t, (ISA_LIST("avx512_skx"))) 
-DISPATCH_KEYVALUE_SORT(uint64_t, uint64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(uint64_t, double, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(int64_t, int64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(int64_t, uint64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(int64_t, double, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(double, int64_t, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(double, double, (ISA_LIST("avx512_skx"))) -DISPATCH_KEYVALUE_SORT(double, uint64_t, (ISA_LIST("avx512_skx"))) +#define DISPATCH_KEYVALUE_SORT_FORTYPE(type) \ + DISPATCH_KEYVALUE_SORT(type, uint64_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, int64_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, double, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, uint32_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, int32_t, (ISA_LIST("avx512_skx")))\ + DISPATCH_KEYVALUE_SORT(type, float, (ISA_LIST("avx512_skx")))\ + +DISPATCH_KEYVALUE_SORT_FORTYPE(uint64_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(int64_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(double) +DISPATCH_KEYVALUE_SORT_FORTYPE(uint32_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(int32_t) +DISPATCH_KEYVALUE_SORT_FORTYPE(float) } // namespace x86simdsort diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index e7f9f44c..909f3b2b 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -186,6 +186,10 @@ struct ymm_vector { // return _mm256_shuffle_ps(zmm, zmm, mask); //} } + static reg_t sort_vec(reg_t x) + { + return sort_zmm_64bit>(x); + } static void storeu(void *mem, reg_t x) { _mm256_storeu_ps((float *)mem, x); @@ -342,6 +346,10 @@ struct ymm_vector { * 32-bit and 64-bit */ return _mm256_shuffle_epi32(zmm, 0b10110001); } + static reg_t sort_vec(reg_t x) + { + return sort_zmm_64bit>(x); + } static void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); @@ -498,6 +506,10 @@ struct ymm_vector { * 32-bit and 64-bit */ 
return _mm256_shuffle_epi32(zmm, 0b10110001); } + static reg_t sort_vec(reg_t x) + { + return sort_zmm_64bit>(x); + } static void storeu(void *mem, reg_t x) { _mm256_storeu_si256((__m256i *)mem, x); diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 8281d2db..55f79bb1 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -558,7 +558,7 @@ template (keys, indexes, i, size); if (i == 0) { break; } } @@ -617,26 +617,33 @@ template X86_SIMD_SORT_INLINE void avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize, bool hasnan = false) { - UNUSED(hasnan); + using keytype = typename std::conditional, + zmm_vector>::type; + using valtype = typename std::conditional, + zmm_vector>::type; if (arrsize > 1) { if constexpr (std::is_floating_point_v) { - arrsize_t nan_count - = replace_nan_with_inf>(keys, arrsize); - qsort_64bit_, zmm_vector>( - keys, - indexes, - 0, - arrsize - 1, - 2 * (arrsize_t)log2(arrsize)); + arrsize_t nan_count = 0; + if (UNLIKELY(hasnan)) { + nan_count = replace_nan_with_inf>(keys, + arrsize); + } + qsort_64bit_(keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(keys, arrsize, nan_count); } else { - qsort_64bit_, zmm_vector>( - keys, - indexes, - 0, - arrsize - 1, - 2 * (arrsize_t)log2(arrsize)); + UNUSED(hasnan); + qsort_64bit_(keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); } } } diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp index 3a73c08e..c82b033a 100644 --- a/tests/test-keyvalue.cpp +++ b/tests/test-keyvalue.cpp @@ -40,28 +40,32 @@ TYPED_TEST_P(simdkvsort, test_kvsort) std::vector key_bckp = key; std::vector val_bckp = val; x86simdsort::keyvalue_qsort(key.data(), val.data(), size, hasnan); - xss::scalar::keyvalue_qsort(key_bckp.data(), val_bckp.data(), size, hasnan); + xss::scalar::keyvalue_qsort( + key_bckp.data(), val_bckp.data(), size, hasnan); ASSERT_EQ(key, key_bckp); - const bool 
hasDuplicates = std::adjacent_find(key.begin(), key.end()) != key.end(); - if (!hasDuplicates) { - ASSERT_EQ(val, val_bckp); - } - key.clear(); val.clear(); - key_bckp.clear(); val_bckp.clear(); + const bool hasDuplicates + = std::adjacent_find(key.begin(), key.end()) != key.end(); + if (!hasDuplicates) { ASSERT_EQ(val, val_bckp); } + key.clear(); + val.clear(); + key_bckp.clear(); + val_bckp.clear(); } } } REGISTER_TYPED_TEST_SUITE_P(simdkvsort, test_kvsort); -using QKVSortTestTypes = testing::Types, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple, - std::tuple>; +#define CREATE_TUPLES(type) \ + std::tuple, std::tuple, \ + std::tuple, std::tuple, \ + std::tuple, std::tuple + +using QKVSortTestTypes = testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdkvsort, QKVSortTestTypes);