From 13246e2797ea5aae5aa07586f8dea26dbffb7edd Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 1 Apr 2024 14:22:18 -0700 Subject: [PATCH 1/9] Add src/x86simdsort-static-incl.h to include all static avx512/avx2 methods --- examples/Makefile | 32 ++++----- examples/avx2-32bit-qsort.cpp | 11 ---- examples/avx512-16bit-qsort.cpp | 11 ---- examples/avx512-32bit-qsort.cpp | 11 ---- examples/avx512-64bit-qsort.cpp | 11 ---- examples/avx512-argsort.cpp | 10 --- examples/avx512-kv.cpp | 24 +++---- examples/avx512fp-16bit-qsort.cpp | 11 ---- examples/icl-16bit.cpp | 11 ++++ examples/skx-avx2.cpp | 19 ++++++ examples/spr-16bit.cpp | 11 ++++ lib/x86simdsort-avx2.cpp | 5 +- lib/x86simdsort-icl.cpp | 2 +- lib/x86simdsort-skx.cpp | 5 +- lib/x86simdsort-spr.cpp | 2 +- src/avx2-32bit-qsort.hpp | 1 - src/avx2-64bit-qsort.hpp | 1 - src/avx2-emu-funcs.hpp | 1 - src/avx512-16bit-common.h | 2 - src/avx512-32bit-qsort.hpp | 2 - src/avx512-64bit-keyvaluesort.hpp | 1 - src/avx512-64bit-qsort.hpp | 1 - src/x86simdsort-static-incl.h | 106 ++++++++++++++++++++++++++++++ src/xss-common-argsort.h | 1 - src/xss-network-qsort.hpp | 1 - 25 files changed, 176 insertions(+), 117 deletions(-) delete mode 100644 examples/avx2-32bit-qsort.cpp delete mode 100644 examples/avx512-16bit-qsort.cpp delete mode 100644 examples/avx512-32bit-qsort.cpp delete mode 100644 examples/avx512-64bit-qsort.cpp delete mode 100644 examples/avx512-argsort.cpp delete mode 100644 examples/avx512fp-16bit-qsort.cpp create mode 100644 examples/icl-16bit.cpp create mode 100644 examples/skx-avx2.cpp create mode 100644 examples/spr-16bit.cpp create mode 100644 src/x86simdsort-static-incl.h diff --git a/examples/Makefile b/examples/Makefile index 80917c1b..7694bcc1 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,30 +1,24 @@ -CXX ?= g++-12 -CFLAGS = -I../src -std=c++17 -O3 $(if $(CXXFLAGS),$(CXXFLAGS),) -EXE = qsort32avx2 argsort kvsort qsortfp16 qsort16 qsort32 qsort64 +CXX ?= g++-13 +CFLAGS = -I../src -std=c++17 -O3 +EXE = kvsort qsortavx2 qsortavx512 qsortspr qsorticl default: all all : $(EXE) -qsortfp16: avx512fp-16bit-qsort.cpp - $(CXX) -o qsortfp16 -march=sapphirerapids $(CFLAGS) avx512fp-16bit-qsort.cpp - -qsort16: avx512-16bit-qsort.cpp - $(CXX) -o qsort16 -march=icelake-client $(CFLAGS) avx512-16bit-qsort.cpp - -qsort32: avx512-32bit-qsort.cpp - $(CXX) -o qsort32 -march=skylake-avx512 $(CFLAGS) avx512-32bit-qsort.cpp +kvsort: avx512-kv.cpp + $(CXX) -o kvsort -mavx512vl -mavx512dq $(CFLAGS) avx512-kv.cpp -qsort32avx2: avx2-32bit-qsort.cpp - $(CXX) -o qsort32avx2 -march=haswell $(CFLAGS) avx2-32bit-qsort.cpp +qsortavx512: skx-avx2.cpp + $(CXX) -o qsortavx512 -mavx512vl -mavx512dq $(CFLAGS) skx-avx2.cpp -qsort64: avx512-64bit-qsort.cpp - $(CXX) -o qsort64 -march=skylake-avx512 $(CFLAGS) avx512-64bit-qsort.cpp +qsortavx2: skx-avx2.cpp + $(CXX) -o qsortavx2 -mavx2 $(CFLAGS) skx-avx2.cpp -argsort: avx512-argsort.cpp - $(CXX) -o argsort -march=skylake-avx512 $(CFLAGS) avx512-argsort.cpp +qsorticl: icl-16bit.cpp + $(CXX) -o qsorticl -mavx512vl -mavx512bw -mavx512dq -mavx512vbmi2 $(CFLAGS) icl-16bit.cpp -kvsort: avx512-kv.cpp - $(CXX) -o kvsort -march=skylake-avx512 $(CFLAGS) avx512-kv.cpp +qsortspr: spr-16bit.cpp + $(CXX) -o qsortspr -mavx512vl -mavx512dq -mavx512vbmi2 -mavx512fp16 $(CFLAGS) spr-16bit.cpp clean: $(RM) $(EXE) diff --git a/examples/avx2-32bit-qsort.cpp b/examples/avx2-32bit-qsort.cpp deleted file mode 100644 index 5e36aa22..00000000 --- a/examples/avx2-32bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx2-32bit-qsort.hpp" - -int main() -{ - const int size = 1000; - float arr[size]; - avx2_qsort(arr, size); - avx2_qselect(arr, 10, size); - avx2_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-16bit-qsort.cpp b/examples/avx512-16bit-qsort.cpp deleted file mode 100644 index 9990402b..00000000 --- a/examples/avx512-16bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512-16bit-qsort.hpp" - -int main() -{ - const int size = 1000; - short arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-32bit-qsort.cpp b/examples/avx512-32bit-qsort.cpp deleted file mode 100644 index 8d8b8b7a..00000000 --- a/examples/avx512-32bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512-32bit-qsort.hpp" - -int main() -{ - const int size = 1000; - float arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-64bit-qsort.cpp b/examples/avx512-64bit-qsort.cpp deleted file mode 100644 index 400f860a..00000000 --- a/examples/avx512-64bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512-64bit-qsort.hpp" - -int main() -{ - const int size = 1000; - double arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/avx512-argsort.cpp b/examples/avx512-argsort.cpp deleted file mode 100644 index cbe21066..00000000 --- a/examples/avx512-argsort.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include "avx512-64bit-argsort.hpp" - -int main() -{ - const int size = 1000; - float arr[size]; - std::vector arg1 = avx512_argsort(arr, size); - std::vector arg2 = avx512_argselect(arr, 10, size); - return 0; -} diff --git a/examples/avx512-kv.cpp b/examples/avx512-kv.cpp index f46a1020..3ca6d090 100644 --- a/examples/avx512-kv.cpp +++ b/examples/avx512-kv.cpp @@ -1,4 +1,4 @@ -#include "avx512-64bit-keyvaluesort.hpp" +#include "x86simdsort-static-incl.h" int main() { @@ -7,17 +7,17 @@ int main() uint64_t arr2[size]; double arr3[size]; float arr4[size]; - avx512_qsort_kv(arr1, arr1, size); - avx512_qsort_kv(arr1, arr2, size); - avx512_qsort_kv(arr1, arr3, size); - avx512_qsort_kv(arr2, arr1, size); - avx512_qsort_kv(arr2, arr2, size); - avx512_qsort_kv(arr2, arr3, size); - avx512_qsort_kv(arr3, arr1, size); - avx512_qsort_kv(arr3, arr2, size); - avx512_qsort_kv(arr1, arr4, size); - avx512_qsort_kv(arr2, arr4, size); - avx512_qsort_kv(arr3, arr4, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr1, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr2, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr3, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr1, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr2, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr3, size); + x86simdsortStatic::keyvalue_qsort(arr3, arr1, size); + x86simdsortStatic::keyvalue_qsort(arr3, arr2, size); + x86simdsortStatic::keyvalue_qsort(arr1, arr4, size); + x86simdsortStatic::keyvalue_qsort(arr2, arr4, size); + x86simdsortStatic::keyvalue_qsort(arr3, arr4, size); return 0; return 0; } diff --git a/examples/avx512fp-16bit-qsort.cpp b/examples/avx512fp-16bit-qsort.cpp deleted file mode 100644 index 18e1c823..00000000 --- a/examples/avx512fp-16bit-qsort.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "avx512fp16-16bit-qsort.hpp" - -int main() -{ - const int size = 1000; - _Float16 arr[size]; - avx512_qsort(arr, size); - avx512_qselect(arr, 10, size); - avx512_partial_qsort(arr, 10, size); - return 0; -} diff --git a/examples/icl-16bit.cpp b/examples/icl-16bit.cpp new file mode 100644 index 00000000..e789b0f4 --- /dev/null +++ b/examples/icl-16bit.cpp @@ -0,0 +1,11 @@ +#include "x86simdsort-static-incl.h" + +int main() +{ + const int size = 1000; + short arr[size]; + x86simdsortStatic::qsort(arr, size); + x86simdsortStatic::qselect(arr, 10, size); + x86simdsortStatic::partial_qsort(arr, 10, size); + return 0; +} diff --git a/examples/skx-avx2.cpp b/examples/skx-avx2.cpp new file mode 100644 index 00000000..ef4bc050 --- /dev/null +++ b/examples/skx-avx2.cpp @@ -0,0 +1,19 @@ +#include "x86simdsort-static-incl.h" + +int main() +{ + const int size = 1000; + double arrd[size]; + float arrf[size]; + x86simdsortStatic::qsort(arrf, size); + x86simdsortStatic::qsort(arrd, size); + x86simdsortStatic::qselect(arrf, 10, size); + x86simdsortStatic::qselect(arrd, 10, size); + x86simdsortStatic::partial_qsort(arrf, 10, size); + x86simdsortStatic::partial_qsort(arrd, 10, size); + auto arg1 = x86simdsortStatic::argsort(arrf, size); + auto arg2 = x86simdsortStatic::argselect(arrf, 10, size); + auto arg3 = x86simdsortStatic::argsort(arrd, size); + auto arg4 = x86simdsortStatic::argselect(arrd, 10, size); + return 0; +} diff --git a/examples/spr-16bit.cpp b/examples/spr-16bit.cpp new file mode 100644 index 00000000..6fb4c3ab --- /dev/null +++ b/examples/spr-16bit.cpp @@ -0,0 +1,11 @@ +#include "x86simdsort-static-incl.h" + +int main() +{ + const int size = 1000; + _Float16 arr[size]; + x86simdsortStatic::qsort(arr, size); + x86simdsortStatic::qselect(arr, 10, size); + x86simdsortStatic::partial_qsort(arr, 10, size); + return 0; +} diff --git a/lib/x86simdsort-avx2.cpp b/lib/x86simdsort-avx2.cpp index e10fc164..85459d1a 100644 --- a/lib/x86simdsort-avx2.cpp +++ b/lib/x86simdsort-avx2.cpp @@ -1,8 +1,5 @@ // AVX2 specific routines: -#include "avx2-32bit-qsort.hpp" -#include "avx2-64bit-qsort.hpp" -#include "avx2-32bit-half.hpp" -#include "xss-common-argsort.h" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" #define DEFINE_ALL_METHODS(type) \ diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 20095369..2ef27f78 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -1,5 +1,5 @@ // ICL specific routines: -#include "avx512-16bit-qsort.hpp" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" namespace xss { diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp index 8b154d4e..ac895ac5 100644 --- a/lib/x86simdsort-skx.cpp +++ b/lib/x86simdsort-skx.cpp @@ -1,8 +1,5 @@ // SKX specific routines: -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-keyvaluesort.hpp" -#include "avx512-64bit-argsort.hpp" -#include "avx512-64bit-qsort.hpp" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" #define DEFINE_ALL_METHODS(type) \ diff --git a/lib/x86simdsort-spr.cpp b/lib/x86simdsort-spr.cpp index b09a8393..b42a70da 100644 --- a/lib/x86simdsort-spr.cpp +++ b/lib/x86simdsort-spr.cpp @@ -1,5 +1,5 @@ // SPR specific routines: -#include "avx512fp16-16bit-qsort.hpp" +#include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" namespace xss { diff --git a/src/avx2-32bit-qsort.hpp b/src/avx2-32bit-qsort.hpp index ad4e99fc..b8cca7a4 100644 --- a/src/avx2-32bit-qsort.hpp +++ b/src/avx2-32bit-qsort.hpp @@ -7,7 +7,6 @@ #ifndef AVX2_QSORT_32BIT #define AVX2_QSORT_32BIT -#include "xss-common-qsort.h" #include "avx2-emu-funcs.hpp" /* diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp index c633b4b9..32e5e385 100644 --- a/src/avx2-64bit-qsort.hpp +++ b/src/avx2-64bit-qsort.hpp @@ -8,7 +8,6 @@ #ifndef AVX2_QSORT_64BIT #define AVX2_QSORT_64BIT -#include "xss-common-qsort.h" #include "avx2-emu-funcs.hpp" /* diff --git a/src/avx2-emu-funcs.hpp b/src/avx2-emu-funcs.hpp index 38489626..a30da7cc 100644 --- a/src/avx2-emu-funcs.hpp +++ b/src/avx2-emu-funcs.hpp @@ -3,7 +3,6 @@ #include #include -#include "xss-common-qsort.h" constexpr auto avx2_mask_helper_lut32 = [] { std::array, 256> lut {}; diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index 28c1c1fe..76db872e 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -7,8 +7,6 @@ #ifndef AVX512_16BIT_COMMON #define AVX512_16BIT_COMMON -#include "xss-common-qsort.h" - /* * Constants used in sorting 32 elements in a ZMM registers. Based on Bitonic * sorting network (see diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index 8b44e76e..eeaba51c 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -8,8 +8,6 @@ #ifndef AVX512_QSORT_32BIT #define AVX512_QSORT_32BIT -#include "xss-common-qsort.h" - /* * Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic * sorting network (see diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 9736b065..895a4f09 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -8,7 +8,6 @@ #ifndef AVX512_QSORT_64BIT_KV #define AVX512_QSORT_64BIT_KV -#include "xss-common-qsort.h" #include "avx512-64bit-common.h" #include "xss-network-keyvaluesort.hpp" diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp index 4dcaeafa..1d15ef55 100644 --- a/src/avx512-64bit-qsort.hpp +++ b/src/avx512-64bit-qsort.hpp @@ -7,7 +7,6 @@ #ifndef AVX512_QSORT_64BIT #define AVX512_QSORT_64BIT -#include "xss-common-qsort.h" #include "avx512-64bit-common.h" #endif // AVX512_QSORT_64BIT diff --git a/src/x86simdsort-static-incl.h b/src/x86simdsort-static-incl.h new file mode 100644 index 00000000..8831cf3e --- /dev/null +++ b/src/x86simdsort-static-incl.h @@ -0,0 +1,106 @@ +#ifndef X86_SIMD_SORT_STATIC_METHODS +#define X86_SIMD_SORT_STATIC_METHODS +#include +#include +// Declare all methods: +namespace x86simdsortStatic { +template +X86_SIMD_SORT_FINLINE void qsort(T *arr, size_t size, bool hasnan = true); + +template +X86_SIMD_SORT_FINLINE void +qselect(T *arr, size_t k, size_t size, bool hasnan = true); + +template +X86_SIMD_SORT_FINLINE void +partial_qsort(T *arr, size_t k, size_t size, bool hasnan = true); + +template +X86_SIMD_SORT_FINLINE std::vector +argsort(T *arr, size_t size, bool hasnan = true); + +template +std::vector X86_SIMD_SORT_FINLINE +argselect(T *arr, size_t k, size_t size, bool hasnan = true); + +template +X86_SIMD_SORT_FINLINE void +keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan = true); +} // namespace x86simdsortStatic + +#define XSS_METHODS(ISA) \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::qsort( \ + T *arr, size_t size, bool hasnan) \ + { \ + ISA##_qsort(arr, size, hasnan); \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::qselect( \ + T *arr, size_t k, size_t size, bool hasnan) \ + { \ + ISA##_qselect(arr, k, size, hasnan); \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::partial_qsort( \ + T *arr, size_t k, size_t size, bool hasnan) \ + { \ + ISA##_partial_qsort(arr, k, size, hasnan); \ + } \ + template \ + X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argsort( \ + T *arr, size_t size, bool hasnan) \ + { \ + return ISA##_argsort(arr, size, hasnan); \ + } \ + template \ + X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argselect( \ + T *arr, size_t k, size_t size, bool hasnan) \ + { \ + return ISA##_argselect(arr, k, size, hasnan); \ + } + +/* + * qsort, qselect, partial, argsort key-value sort template functions. + */ +#include "xss-common-qsort.h" +#include "xss-common-argsort.h" + +#if defined(__AVX512DQ__) && defined(__AVX512VL__) +/* 32-bit and 64-bit dtypes vector definitions on SKX */ +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-qsort.hpp" +#include "avx512-64bit-argsort.hpp" +#include "avx512-64bit-keyvaluesort.hpp" + +/* 16-bit dtypes vector definitions on ICL */ +#if defined(__AVX512BW__) && defined(__AVX512VBMI2__) +#include "avx512-16bit-qsort.hpp" +/* _Float16 vector definition on SPR*/ +#if defined(__FLT16_MAX__) && defined(__AVX512BW__) && defined(__AVX512FP16__) +#include "avx512fp16-16bit-qsort.hpp" +#endif // __FLT16_MAX__ +#endif // __AVX512VBMI2__ + +XSS_METHODS(avx512) + +// key-value currently only on avx512 +template +X86_SIMD_SORT_FINLINE void +x86simdsortStatic::keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan) +{ + avx512_qsort_kv(key, val, size, hasnan); +} + +#elif defined(__AVX2__) && !defined(__AVX512F__) +/* 32-bit and 64-bit dtypes vector definitions on AVX2 */ +#include "avx2-32bit-half.hpp" +#include "avx2-32bit-qsort.hpp" +#include "avx2-64bit-qsort.hpp" +XSS_METHODS(avx2) + +#else +#error "x86simdsortStatic methods needs to be compiled with avx512/avx2 specific flags" +#endif // (__AVX512VL__ && __AVX512DQ__) || AVX2 + +#endif // X86_SIMD_SORT_STATIC_METHODS diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index b97dd0d0..64489d86 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -7,7 +7,6 @@ #ifndef XSS_COMMON_ARGSORT #define XSS_COMMON_ARGSORT -#include "xss-common-qsort.h" #include "xss-network-keyvaluesort.hpp" #include diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp index dd299507..0c1d1d8a 100644 --- a/src/xss-network-qsort.hpp +++ b/src/xss-network-qsort.hpp @@ -2,7 +2,6 @@ #define XSS_NETWORK_QSORT #include "xss-optimal-networks.hpp" -#include "xss-common-qsort.h" template X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); From b8d0ce222b13e01b8678e475b8aea25e205e5bc1 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 1 Apr 2024 21:08:14 -0700 Subject: [PATCH 2/9] Include common header file in x86simdsort-static-incl.h --- src/avx2-32bit-half.hpp | 1 - src/avx512-64bit-common.h | 1 - src/x86simdsort-static-incl.h | 2 ++ src/xss-common-qsort.h | 1 - src/xss-network-keyvaluesort.hpp | 1 - 5 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/avx2-32bit-half.hpp b/src/avx2-32bit-half.hpp index 52697692..9100cbbc 100644 --- a/src/avx2-32bit-half.hpp +++ b/src/avx2-32bit-half.hpp @@ -7,7 +7,6 @@ #ifndef AVX2_HALF_32BIT #define AVX2_HALF_32BIT -#include "xss-common-includes.h" #include "avx2-emu-funcs.hpp" /* diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 68735c33..8f5fdce9 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -7,7 +7,6 @@ #ifndef AVX512_64BIT_COMMON #define AVX512_64BIT_COMMON -#include "xss-common-includes.h" #include "avx2-32bit-qsort.hpp" /* diff --git a/src/x86simdsort-static-incl.h b/src/x86simdsort-static-incl.h index 8831cf3e..3426503c 100644 --- a/src/x86simdsort-static-incl.h +++ b/src/x86simdsort-static-incl.h @@ -2,6 +2,8 @@ #define X86_SIMD_SORT_STATIC_METHODS #include #include +#include "xss-common-includes.h" + // Declare all methods: namespace x86simdsortStatic { template diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 02522b50..4f85d637 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -34,7 +34,6 @@ * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 * */ -#include "xss-common-includes.h" #include "xss-pivot-selection.hpp" #include "xss-network-qsort.hpp" #include "xss-common-comparators.hpp" diff --git a/src/xss-network-keyvaluesort.hpp b/src/xss-network-keyvaluesort.hpp index a20da171..f3b3d453 100644 --- a/src/xss-network-keyvaluesort.hpp +++ b/src/xss-network-keyvaluesort.hpp @@ -1,7 +1,6 @@ #ifndef XSS_KEYVALUE_NETWORKS #define XSS_KEYVALUE_NETWORKS -#include "xss-common-includes.h" template typename valueType::opmask_t resize_mask(typename keyType::opmask_t mask) From c5b5efbbeea726f42f887d803a4e91b69037bf70 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 1 Apr 2024 21:15:54 -0700 Subject: [PATCH 3/9] clang format --- src/xss-network-keyvaluesort.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/xss-network-keyvaluesort.hpp b/src/xss-network-keyvaluesort.hpp index f3b3d453..bb9b9bcd 100644 --- a/src/xss-network-keyvaluesort.hpp +++ b/src/xss-network-keyvaluesort.hpp @@ -1,7 +1,6 @@ #ifndef XSS_KEYVALUE_NETWORKS #define XSS_KEYVALUE_NETWORKS - template typename valueType::opmask_t resize_mask(typename keyType::opmask_t mask) { From 112f730367bf83f1541d919b11007bfbb33fe863 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 2 Apr 2024 13:25:27 -0700 Subject: [PATCH 4/9] Use static methods in the dispatch files --- lib/x86simdsort-avx2.cpp | 10 ++--- lib/x86simdsort-icl.cpp | 12 +++--- lib/x86simdsort-skx.cpp | 22 +++++------ lib/x86simdsort-spr.cpp | 16 +++++--- src/avx512fp16-16bit-qsort.hpp | 61 ------------------------------- src/x86simdsort-static-incl.h | 67 +++++++++++++++++++++++++--------- src/xss-common-argsort.h | 41 --------------------- 7 files changed, 82 insertions(+), 147 deletions(-) diff --git a/lib/x86simdsort-avx2.cpp b/lib/x86simdsort-avx2.cpp index 85459d1a..2afc4d1d 100644 --- a/lib/x86simdsort-avx2.cpp +++ b/lib/x86simdsort-avx2.cpp @@ -6,31 +6,31 @@ template <> \ void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - avx2_qsort(arr, arrsize, hasnan, descending); \ + x86simdsortStatic::qsort(arr, arrsize, hasnan, descending); \ } \ template <> \ void qselect( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx2_qselect(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); \ } \ template <> \ void partial_qsort( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx2_partial_qsort(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \ } \ template <> \ std::vector argsort( \ type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - return avx2_argsort(arr, arrsize, hasnan, descending); \ + return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \ } \ template <> \ std::vector argselect( \ type *arr, size_t k, size_t arrsize, bool hasnan) \ { \ - return avx2_argselect(arr, k, arrsize, hasnan); \ + return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \ } namespace xss { diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 2ef27f78..eeb7b2bf 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -7,7 +7,7 @@ namespace avx512 { template <> void qsort(uint16_t *arr, size_t size, bool hasnan, bool descending) { - avx512_qsort(arr, size, hasnan, descending); + x86simdsortStatic::qsort(arr, size, hasnan, descending); } template <> void qselect(uint16_t *arr, @@ -16,7 +16,7 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_qselect(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); } template <> void partial_qsort(uint16_t *arr, @@ -25,12 +25,12 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_partial_qsort(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); } template <> void qsort(int16_t *arr, size_t size, bool hasnan, bool descending) { - avx512_qsort(arr, size, hasnan, descending); + x86simdsortStatic::qsort(arr, size, hasnan, descending); } template <> void qselect(int16_t *arr, @@ -39,7 +39,7 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_qselect(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); } template <> void partial_qsort(int16_t *arr, @@ -48,7 +48,7 @@ namespace avx512 { bool hasnan, bool descending) { - avx512_partial_qsort(arr, k, arrsize, hasnan, descending); + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); } } // namespace avx512 } // namespace xss diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp index ac895ac5..829dd7b8 100644 --- a/lib/x86simdsort-skx.cpp +++ b/lib/x86simdsort-skx.cpp @@ -6,63 +6,63 @@ template <> \ void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - avx512_qsort(arr, arrsize, hasnan, descending); \ + x86simdsortStatic::qsort(arr, arrsize, hasnan, descending); \ } \ template <> \ void qselect( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx512_qselect(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); \ } \ template <> \ void partial_qsort( \ type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ - avx512_partial_qsort(arr, k, arrsize, hasnan, descending); \ + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \ } \ template <> \ std::vector argsort( \ type *arr, size_t arrsize, bool hasnan, bool descending) \ { \ - return avx512_argsort(arr, arrsize, hasnan, descending); \ + return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \ } \ template <> \ std::vector argselect( \ type *arr, size_t k, size_t arrsize, bool hasnan) \ { \ - return avx512_argselect(arr, k, arrsize, hasnan); \ + return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \ } #define DEFINE_KEYVALUE_METHODS(type) \ template <> \ void keyvalue_qsort(type *key, uint64_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, int64_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, double *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, uint32_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, int32_t *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } \ template <> \ void keyvalue_qsort(type *key, float *val, size_t arrsize, bool hasnan) \ { \ - avx512_qsort_kv(key, val, arrsize, hasnan); \ + x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \ } namespace xss { diff --git a/lib/x86simdsort-spr.cpp b/lib/x86simdsort-spr.cpp index b42a70da..75aa7025 100644 --- a/lib/x86simdsort-spr.cpp +++ b/lib/x86simdsort-spr.cpp @@ -7,9 +7,9 @@ namespace avx512 { template <> void qsort(_Float16 *arr, size_t size, bool hasnan, bool descending) { - if (descending) { avx512_qsort(arr, size, hasnan); } + if (descending) { x86simdsortStatic::qsort(arr, size, hasnan, true); } else { - avx512_qsort(arr, size, hasnan); + x86simdsortStatic::qsort(arr, size, hasnan, false); } } template <> @@ -19,9 +19,11 @@ namespace avx512 { bool hasnan, bool descending) { - if (descending) { avx512_qselect(arr, k, arrsize, hasnan); } + if (descending) { + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, true); + } else { - avx512_qselect(arr, k, arrsize, hasnan); + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, false); } } template <> @@ -31,9 +33,11 @@ namespace avx512 { bool hasnan, bool descending) { - if (descending) { avx512_partial_qsort(arr, k, arrsize, hasnan); } + if (descending) { + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, true); + } else { - avx512_partial_qsort(arr, k, arrsize, hasnan); + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, false); } } } // namespace avx512 diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 130e28a8..6d2cca6b 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -200,65 +200,4 @@ X86_SIMD_SORT_INLINE_ONLY void replace_inf_with_nan(_Float16 *arr, } } } -/* Specialized template function for _Float16 qsort_*/ -template -X86_SIMD_SORT_INLINE_ONLY void -avx512_qsort(_Float16 *arr, arrsize_t arrsize, bool hasnan) -{ - using vtype = zmm_vector<_Float16>; - using comparator = - typename std::conditional, - Comparator>::type; - - if (arrsize > 1) { - arrsize_t nan_count = 0; - if (UNLIKELY(hasnan)) { - nan_count = replace_nan_with_inf(arr, arrsize); - } - - qsort_( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); - - replace_inf_with_nan(arr, arrsize, nan_count, descending); - } -} - -template -X86_SIMD_SORT_INLINE_ONLY void -avx512_qselect(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) -{ - using vtype = zmm_vector<_Float16>; - using comparator = - typename std::conditional, - Comparator>::type; - - arrsize_t index_first_elem = 0; - arrsize_t index_last_elem = arrsize - 1; - - if (UNLIKELY(hasnan)) { - if constexpr (descending) { - index_first_elem = move_nans_to_start_of_array(arr, arrsize); - } - else { - index_last_elem = move_nans_to_end_of_array(arr, arrsize); - } - } - - if (index_first_elem <= k && index_last_elem >= k) { - qselect_(arr, - k, - index_first_elem, - index_last_elem, - 2 * (arrsize_t)log2(arrsize)); - } -} -template -X86_SIMD_SORT_INLINE_ONLY void -avx512_partial_qsort(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) -{ - avx512_qselect(arr, k - 1, arrsize, hasnan); - avx512_qsort(arr, k - 1, hasnan); -} #endif // AVX512FP16_QSORT_16BIT diff --git a/src/x86simdsort-static-incl.h b/src/x86simdsort-static-incl.h index 3426503c..590e343b 100644 --- a/src/x86simdsort-static-incl.h +++ b/src/x86simdsort-static-incl.h @@ -7,59 +7,92 @@ // Declare all methods: namespace x86simdsortStatic { template -X86_SIMD_SORT_FINLINE void qsort(T *arr, size_t size, bool hasnan = true); +X86_SIMD_SORT_FINLINE void +qsort(T *arr, size_t size, bool hasnan = false, bool descending = true); template -X86_SIMD_SORT_FINLINE void -qselect(T *arr, size_t k, size_t size, bool hasnan = true); +X86_SIMD_SORT_FINLINE void qselect(T *arr, + size_t k, + size_t size, + bool hasnan = false, + bool descending = true); + +template +X86_SIMD_SORT_FINLINE void partial_qsort(T *arr, + size_t k, + size_t size, + bool hasnan = false, + bool descending = true); + +template +X86_SIMD_SORT_FINLINE std::vector +argsort(T *arr, size_t size, bool hasnan = false, bool descending = false); template X86_SIMD_SORT_FINLINE void -partial_qsort(T *arr, size_t k, size_t size, bool hasnan = true); +argsort(T *arr, size_t *arg, size_t size, bool hasnan = false, bool descending = false); template X86_SIMD_SORT_FINLINE std::vector -argsort(T *arr, size_t size, bool hasnan = true); +argselect(T *arr, size_t k, size_t size, bool hasnan = false); template -std::vector X86_SIMD_SORT_FINLINE -argselect(T *arr, size_t k, size_t size, bool hasnan = true); +void X86_SIMD_SORT_FINLINE +argselect(T *arr, size_t *arg, size_t k, size_t size, bool hasnan = false); template X86_SIMD_SORT_FINLINE void -keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan = true); +keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan = false); } // namespace x86simdsortStatic #define XSS_METHODS(ISA) \ template \ X86_SIMD_SORT_FINLINE void x86simdsortStatic::qsort( \ - T *arr, size_t size, bool hasnan) \ + T *arr, size_t size, bool hasnan, bool descending) \ { \ - ISA##_qsort(arr, size, hasnan); \ + ISA##_qsort(arr, size, hasnan, descending); \ } \ template \ X86_SIMD_SORT_FINLINE void x86simdsortStatic::qselect( \ - T *arr, size_t k, size_t size, bool hasnan) \ + T *arr, size_t k, size_t size, bool hasnan, bool descending) \ { \ - ISA##_qselect(arr, k, size, hasnan); \ + ISA##_qselect(arr, k, size, hasnan, descending); \ } \ template \ X86_SIMD_SORT_FINLINE void x86simdsortStatic::partial_qsort( \ - T *arr, size_t k, size_t size, bool hasnan) \ + T *arr, size_t k, size_t size, bool hasnan, bool descending) \ + { \ + ISA##_partial_qsort(arr, k, size, hasnan, descending); \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::argsort( \ + T *arr, size_t *arg, size_t size, bool hasnan, bool descending) \ { \ - ISA##_partial_qsort(arr, k, size, hasnan); \ + ISA##_argsort(arr, arg, size, hasnan, descending); \ } \ template \ X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argsort( \ - T *arr, size_t size, bool hasnan) \ + T *arr, size_t size, bool hasnan, bool descending) \ + { \ + std::vector indices(size); \ + std::iota(indices.begin(), indices.end(), 0); \ + x86simdsortStatic::argsort(arr, indices.data(), size, hasnan, descending); \ + return indices; \ + } \ + template \ + X86_SIMD_SORT_FINLINE void x86simdsortStatic::argselect( \ + T *arr, size_t *arg, size_t k, size_t size, bool hasnan) \ { \ - return ISA##_argsort(arr, size, hasnan); \ + ISA##_argselect(arr, arg, k, size, hasnan); \ } \ template \ X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argselect( \ T *arr, size_t k, size_t size, bool hasnan) \ { \ - return ISA##_argselect(arr, k, size, hasnan); \ + std::vector indices(size); \ + std::iota(indices.begin(), indices.end(), 0); \ + x86simdsortStatic::argselect(arr, indices.data(), k, size, hasnan); \ + return indices; \ } /* diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index 64489d86..88067830 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -574,16 +574,6 @@ X86_SIMD_SORT_INLINE void avx512_argsort(T *arr, } } -template -X86_SIMD_SORT_INLINE std::vector avx512_argsort( - T *arr, arrsize_t arrsize, bool hasnan = false, bool descending = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx512_argsort(arr, indices.data(), arrsize, hasnan, descending); - return indices; -} - /* argsort methods for 32-bit and 64-bit dtypes */ template X86_SIMD_SORT_INLINE void avx2_argsort(T *arr, @@ -618,16 +608,6 @@ X86_SIMD_SORT_INLINE void avx2_argsort(T *arr, } } -template -X86_SIMD_SORT_INLINE std::vector avx2_argsort( - T *arr, arrsize_t arrsize, bool hasnan = false, bool descending = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx2_argsort(arr, indices.data(), arrsize, hasnan, descending); - return indices; -} - /* argselect methods for 32-bit and 64-bit dtypes */ template X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, @@ -659,16 +639,6 @@ X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, } } -template -X86_SIMD_SORT_INLINE std::vector -avx512_argselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx512_argselect(arr, indices.data(), k, arrsize, hasnan); - return indices; -} - /* argselect methods for 32-bit and 64-bit dtypes */ template X86_SIMD_SORT_INLINE void avx2_argselect(T *arr, @@ -698,15 +668,4 @@ X86_SIMD_SORT_INLINE void avx2_argselect(T *arr, arr, arg, k, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } - -template -X86_SIMD_SORT_INLINE std::vector -avx2_argselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan = false) -{ - std::vector indices(arrsize); - std::iota(indices.begin(), indices.end(), 0); - avx2_argselect(arr, indices.data(), k, arrsize, hasnan); - return indices; -} - #endif // XSS_COMMON_ARGSORT From b3b28b8d4b6bab7b3822c36b723f4e880a91f231 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 2 Apr 2024 14:50:08 -0700 Subject: [PATCH 5/9] Fix hasnan default value --- src/x86simdsort-static-incl.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/x86simdsort-static-incl.h b/src/x86simdsort-static-incl.h index 590e343b..895c8f44 100644 --- a/src/x86simdsort-static-incl.h +++ b/src/x86simdsort-static-incl.h @@ -4,30 +4,31 @@ #include #include "xss-common-includes.h" -// Declare all methods: +// Supported methods declared here for a quick reference: namespace x86simdsortStatic { template X86_SIMD_SORT_FINLINE void -qsort(T *arr, size_t size, bool hasnan = false, bool descending = true); +qsort(T *arr, size_t size, bool hasnan = false, bool descending = false); template X86_SIMD_SORT_FINLINE void qselect(T *arr, size_t k, size_t size, bool hasnan = false, - bool descending = true); + bool descending = false); template X86_SIMD_SORT_FINLINE void partial_qsort(T *arr, size_t k, size_t size, bool hasnan = false, - bool descending = true); + bool descending = false); template X86_SIMD_SORT_FINLINE std::vector argsort(T *arr, size_t size, bool hasnan = false, bool descending = false); +/* argsort API required by NumPy: */ template X86_SIMD_SORT_FINLINE void argsort(T *arr, size_t *arg, size_t size, bool hasnan = false, bool descending = false); @@ -36,6 +37,7 @@ template X86_SIMD_SORT_FINLINE std::vector argselect(T *arr, size_t k, size_t size, bool hasnan = false); +/* argselect API required by NumPy: */ template void X86_SIMD_SORT_FINLINE argselect(T *arr, size_t *arg, size_t k, size_t size, bool hasnan = false); @@ -43,6 +45,7 @@ argselect(T *arr, size_t *arg, size_t k, size_t size, bool hasnan = false); template X86_SIMD_SORT_FINLINE void keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan = false); + } // namespace x86simdsortStatic #define XSS_METHODS(ISA) \ From 52aa0e8a06d69ca3e9ddd116fa875ab70a63f398 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 5 Apr 2024 11:49:42 -0700 Subject: [PATCH 6/9] Use move and use custom-float in src files --- meson.build | 2 +- src/avx512-16bit-qsort.hpp | 31 ++++++++++--------- src/avx512-64bit-keyvaluesort.hpp | 2 +- src/xss-common-argsort.h | 8 ++--- src/xss-common-includes.h | 1 + src/xss-common-qsort.h | 12 +++---- .../custom-float.h => src/xss-custom-float.h | 6 ++-- tests/meson.build | 6 ++-- utils/custom-compare.h | 2 +- utils/rand_array.h | 2 +- 10 files changed, 38 insertions(+), 34 deletions(-) rename utils/custom-float.h => src/xss-custom-float.h (96%) diff --git a/meson.build b/meson.build index 873094ba..7ae934df 100644 --- a/meson.build +++ b/meson.build @@ -36,7 +36,7 @@ cancompilefp16 = cpp.compiles(fp16code, args:'-march=sapphirerapids') subdir('lib') libsimdsort = shared_library('x86simdsortcpp', 'lib/x86simdsort.cpp', - include_directories : [utils, lib], + include_directories : [src, utils, lib], link_with : [libtargets], gnu_symbol_visibility : 'inlineshidden', install : true, diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 15c7c91e..18595ac1 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -560,10 +560,11 @@ X86_SIMD_SORT_INLINE_ONLY bool is_a_nan(uint16_t elem) return ((elem & 0x7c00u) == 0x7c00u) && ((elem & 0x03ffu) != 0); } -X86_SIMD_SORT_INLINE void avx512_qsort_fp16(uint16_t *arr, - arrsize_t arrsize, - bool hasnan = false, - bool descending = false) +[[maybe_unused]] X86_SIMD_SORT_INLINE void +avx512_qsort_fp16(uint16_t *arr, + arrsize_t arrsize, + bool hasnan = false, + bool descending = false) { using vtype = zmm_vector; @@ -585,11 +586,12 @@ X86_SIMD_SORT_INLINE void avx512_qsort_fp16(uint16_t *arr, } } -X86_SIMD_SORT_INLINE void avx512_qselect_fp16(uint16_t *arr, - arrsize_t k, - arrsize_t arrsize, - bool hasnan = false, - bool descending = false) +[[maybe_unused]] X86_SIMD_SORT_INLINE void +avx512_qselect_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false, + bool descending = false) { using vtype = zmm_vector; @@ -617,11 +619,12 @@ X86_SIMD_SORT_INLINE void avx512_qselect_fp16(uint16_t *arr, } } -X86_SIMD_SORT_INLINE void avx512_partial_qsort_fp16(uint16_t *arr, - arrsize_t k, - arrsize_t arrsize, - bool hasnan = false, - bool descending = false) +[[maybe_unused]] X86_SIMD_SORT_INLINE void +avx512_partial_qsort_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false, + bool descending = false) { avx512_qselect_fp16(arr, k - 1, arrsize, hasnan, descending); avx512_qsort_fp16(arr, k - 1, descending); diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 895a4f09..61046cae 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -418,7 +418,7 @@ avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize, bool hasnan = false) zmm_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { arrsize_t nan_count = 0; if (UNLIKELY(hasnan)) { nan_count = replace_nan_with_inf>(keys, arrsize); diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index 88067830..4fa5041a 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -557,7 +557,7 @@ X86_SIMD_SORT_INLINE void avx512_argsort(T *arr, zmm_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argsort_withnan(arr, arg, 0, arrsize); @@ -591,7 +591,7 @@ X86_SIMD_SORT_INLINE void avx2_argsort(T *arr, avx2_half_vector, avx2_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argsort_withnan(arr, arg, 0, arrsize); @@ -627,7 +627,7 @@ X86_SIMD_SORT_INLINE void avx512_argselect(T *arr, zmm_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argselect_withnan(arr, arg, k, 0, arrsize); return; @@ -657,7 +657,7 @@ X86_SIMD_SORT_INLINE void avx2_argselect(T *arr, avx2_vector>::type; if (arrsize > 1) { - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if ((hasnan) && (array_has_nan(arr, arrsize))) { std_argselect_withnan(arr, arg, k, 0, arrsize); return; diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 2682919e..83a54716 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -7,6 +7,7 @@ #include #include #include +#include "xss-custom-float.h" #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 4f85d637..2d5b4ea1 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -105,8 +105,8 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(type_t *arr, { if (descending) { for (arrsize_t ii = 0; nan_count > 0; ++ii) { - if constexpr (std::is_floating_point_v) { - arr[ii] = std::numeric_limits::quiet_NaN(); + if constexpr (xss::fp::is_floating_point_v) { + arr[ii] = xss::fp::quiet_NaN(); } else { arr[ii] = 0xFFFF; @@ -116,8 +116,8 @@ X86_SIMD_SORT_INLINE void replace_inf_with_nan(type_t *arr, } else { for (arrsize_t ii = size - 1; nan_count > 0; --ii) { - if constexpr (std::is_floating_point_v) { - arr[ii] = std::numeric_limits::quiet_NaN(); + if constexpr (xss::fp::is_floating_point_v) { + arr[ii] = xss::fp::quiet_NaN(); } else { arr[ii] = 0xFFFF; @@ -619,7 +619,7 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) if (arrsize > 1) { arrsize_t nan_count = 0; - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if (UNLIKELY(hasnan)) { nan_count = replace_nan_with_inf(arr, arrsize); } @@ -646,7 +646,7 @@ xss_qselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) arrsize_t index_first_elem = 0; arrsize_t index_last_elem = arrsize - 1; - if constexpr (std::is_floating_point_v) { + if constexpr (xss::fp::is_floating_point_v) { if (UNLIKELY(hasnan)) { if constexpr (descending) { index_first_elem = move_nans_to_start_of_array(arr, arrsize); diff --git a/utils/custom-float.h b/src/xss-custom-float.h similarity index 96% rename from utils/custom-float.h rename to src/xss-custom-float.h index 5faaa9e8..5fd973a7 100644 --- a/utils/custom-float.h +++ b/src/xss-custom-float.h @@ -1,5 +1,5 @@ -#ifndef UTILS_FLOAT -#define UTILS_FLOAT +#ifndef XSS_CUSTOM_FLOAT +#define XSS_CUSTOM_FLOAT #include namespace xss { namespace fp { @@ -87,4 +87,4 @@ namespace fp { } // namespace fp } // namespace xss -#endif +#endif // XSS_CUSTOM_FLOAT diff --git a/tests/meson.build b/tests/meson.build index 86ca2fe8..0583c55e 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -3,17 +3,17 @@ libtests = [] libtests += static_library('tests_qsort', files('test-qsort.cpp', ), dependencies: gtest_dep, - include_directories : [lib, utils], + include_directories : [src, lib, utils], ) libtests += static_library('tests_kvsort', files('test-keyvalue.cpp', ), dependencies: gtest_dep, - include_directories : [lib, utils], + include_directories : [src, lib, utils], ) libtests += static_library('tests_objsort', files('test-objqsort.cpp', ), dependencies: gtest_dep, - include_directories : [lib, utils], + include_directories : [src, lib, utils], ) diff --git a/utils/custom-compare.h b/utils/custom-compare.h index ab8df85c..6244bb24 100644 --- a/utils/custom-compare.h +++ b/utils/custom-compare.h @@ -1,6 +1,6 @@ #include #include -#include "custom-float.h" +#include "xss-custom-float.h" /* * Custom comparator class to handle NAN's: treats NAN > INF diff --git a/utils/rand_array.h b/utils/rand_array.h index a9703551..cb99da2e 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -10,7 +10,7 @@ #include #include #include -#include "custom-float.h" +#include "xss-custom-float.h" template static std::vector get_uniform_rand_array(int64_t arrsize, From 9a098921a669f5cf3172a151ad3779103d4d2eef Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 5 Apr 2024 12:12:26 -0700 Subject: [PATCH 7/9] clang format files --- src/x86simdsort-static-incl.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/x86simdsort-static-incl.h b/src/x86simdsort-static-incl.h index 895c8f44..b7dc4d6f 100644 --- a/src/x86simdsort-static-incl.h +++ b/src/x86simdsort-static-incl.h @@ -30,8 +30,11 @@ argsort(T *arr, size_t size, bool hasnan = false, bool descending = false); /* argsort API required by NumPy: */ template -X86_SIMD_SORT_FINLINE void -argsort(T *arr, size_t *arg, size_t size, bool hasnan = false, bool descending = false); +X86_SIMD_SORT_FINLINE void argsort(T *arr, + size_t *arg, + size_t size, + bool hasnan = false, + bool descending = false); template X86_SIMD_SORT_FINLINE std::vector @@ -79,7 +82,8 @@ keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan = false); { \ std::vector indices(size); \ std::iota(indices.begin(), indices.end(), 0); \ - x86simdsortStatic::argsort(arr, indices.data(), size, hasnan, descending); \ + x86simdsortStatic::argsort( \ + arr, indices.data(), size, hasnan, descending); \ return indices; \ } \ template \ From 7a286b3fcf68db03422fb7b1caeb97922b1abddb Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 11 Apr 2024 09:33:32 -0700 Subject: [PATCH 8/9] Get rid of an unnecessary branch --- lib/x86simdsort-spr.cpp | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/lib/x86simdsort-spr.cpp b/lib/x86simdsort-spr.cpp index 75aa7025..b8069d2b 100644 --- a/lib/x86simdsort-spr.cpp +++ b/lib/x86simdsort-spr.cpp @@ -7,10 +7,7 @@ namespace avx512 { template <> void qsort(_Float16 *arr, size_t size, bool hasnan, bool descending) { - if (descending) { x86simdsortStatic::qsort(arr, size, hasnan, true); } - else { - x86simdsortStatic::qsort(arr, size, hasnan, false); - } + x86simdsortStatic::qsort(arr, size, hasnan, descending); } template <> void qselect(_Float16 *arr, @@ -19,12 +16,7 @@ namespace avx512 { bool hasnan, bool descending) { - if (descending) { - x86simdsortStatic::qselect(arr, k, arrsize, hasnan, true); - } - else { - x86simdsortStatic::qselect(arr, k, arrsize, hasnan, false); - } + x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); } template <> void partial_qsort(_Float16 *arr, @@ -33,12 +25,7 @@ namespace avx512 { bool hasnan, bool descending) { - if (descending) { - x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, true); - } - else { - x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, false); - } + x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); } } // namespace avx512 } // namespace xss From 3a2bde31dc0a5059c3aff1c92072868ee6dcb918 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Mon, 22 Apr 2024 16:32:02 -0700 Subject: [PATCH 9/9] Added specific error message for case where AVX512F is enabled but not AVX512DQ and AVX512VL --- src/x86simdsort-static-incl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/x86simdsort-static-incl.h b/src/x86simdsort-static-incl.h index b7dc4d6f..0d0e4400 100644 --- a/src/x86simdsort-static-incl.h +++ b/src/x86simdsort-static-incl.h @@ -134,6 +134,9 @@ x86simdsortStatic::keyvalue_qsort(T1 *key, T2 *val, size_t size, bool hasnan) avx512_qsort_kv(key, val, size, hasnan); } +#elif defined(__AVX512F__) +#error "x86simdsort requires AVX512DQ and AVX512VL to be enabled in addition to AVX512F to use AVX512" + #elif defined(__AVX2__) && !defined(__AVX512F__) /* 32-bit and 64-bit dtypes vector definitions on AVX2 */ #include "avx2-32bit-half.hpp"