From 2951f9e7d89d5d7e391d0b377556e34bca053f5e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 17:47:46 +0200 Subject: [PATCH 01/60] Implement vectorized min_ / max_element for ints Resolves #2438 TODO: * Test coverage * Attach minmax_element * Add AVX2 version of the same ---- Benchmark ```C++ #include #include #include #include #include #include enum class Kind { Min, Max, }; template void benchmark_find(T* a, std::size_t max, size_t start, size_t pos, Kind kind, size_t rep) { std::fill_n(a, max, '0'); if (pos < max && pos >= start) { if (kind == Kind::Min) { a[pos] = '*'; } else { a[pos] = '1'; } } auto t1 = std::chrono::steady_clock::now(); switch (kind) { case Kind::Min: for (std::size_t s = 0; s < rep; s++) { if (std::min_element(a + start, a + max) != a + pos) { abort(); } } break; case Kind::Max: for (std::size_t s = 0; s < rep; s++) { if (std::min_element(a + start, a + max) != a + pos) { abort(); } } break; } auto t2 = std::chrono::steady_clock::now(); const char* op_str = nullptr; switch (kind) { case Kind::Min: op_str = "min"; break; case Kind::Max: op_str = "max"; break; } std::cout << std::setw(10) << std::chrono::duration_cast>(t2 - t1).count() << "s -- " << "Op " << op_str << " Size " << sizeof(T) << " byte elements, array size " << max << " starting at " << start << " found at " << pos << "; " << rep << " repetitions \n"; } constexpr std::size_t Nmax = 8192; alignas(64) std::uint8_t a8[Nmax]; alignas(64) std::uint16_t a16[Nmax]; alignas(64) std::uint32_t a32[Nmax]; alignas(64) std::uint64_t a64[Nmax]; extern "C" long __isa_enabled; int main() { std::cout << "Vector alg used: " << _USE_STD_VECTOR_ALGORITHMS << "\n"; benchmark_find(a8, Nmax, 0, 3459, Kind::Min, 100000); benchmark_find(a16, Nmax, 0, 3459, Kind::Min, 100000); benchmark_find(a32, Nmax, 0, 3459, Kind::Min, 100000); benchmark_find(a64, Nmax, 0, 3459, Kind::Min, 100000); benchmark_find(a8, Nmax, 0, 3459, Kind::Max, 100000); benchmark_find(a16, Nmax, 0, 3459, Kind::Max, 
100000); benchmark_find(a32, Nmax, 0, 3459, Kind::Max, 100000); benchmark_find(a64, Nmax, 0, 3459, Kind::Max, 100000); std::cout << "Done\n"; return 0; } ``` Current benchmark results ``` ********************************************************************** ** Visual Studio 2022 Developer Command Prompt v17.1.0-pre.1.1 ** Copyright (c) 2021 Microsoft Corporation ********************************************************************** [vcvarsall.bat] Environment initialized for: 'x64' C:\Program Files\Microsoft Visual Studio\2022\Preview>cd/d C:\Project\vector_find_benchmark C:\Project\vector_find_benchmark>set INCLUDE=C:\Project\STL\out\build\x64\out\inc;%INCLUDE% C:\Project\vector_find_benchmark>set LIB=C:\Project\STL\out\build\x64\out\lib\amd64;%LIB% C:\Project\vector_find_benchmark>set PATH=C:\Project\STL\out\build\x64\out\bin\amd64;%PATH% C:\Project\vector_find_benchmark>cl /O2 /std:c++latest /EHsc /D_USE_STD_VECTOR_ALGORITHMS=0 /nologo vector_find_benchmark.cpp vector_find_benchmark.cpp vector_find_benchmark.cpp(1): warning C4005: '_USE_STD_VECTOR_ALGORITHMS': macro redefinition vector_find_benchmark.cpp: note: see previous definition of '_USE_STD_VECTOR_ALGORITHMS' C:\Project\vector_find_benchmark>cl /O2 /std:c++latest /EHsc /D_USE_STD_VECTOR_ALGORITHMS=0 /nologo vector_find_benchmark.cpp vector_find_benchmark.cpp C:\Project\vector_find_benchmark>vector_find_benchmark.exe Vector alg used: 0 1.48497s -- Op min Size 1 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions 1.48125s -- Op min Size 2 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions 1.47988s -- Op min Size 4 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions 1.48431s -- Op min Size 8 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions C:\Project\vector_find_benchmark>cl /O2 /std:c++latest /EHsc /D_USE_STD_VECTOR_ALGORITHMS=1 /nologo vector_find_benchmark.cpp vector_find_benchmark.cpp 
C:\Project\vector_find_benchmark>vector_find_benchmark.exe Vector alg used: 1 0.0559598s -- Op min Size 1 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions 0.0681002s -- Op min Size 2 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions 0.159074s -- Op min Size 4 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions 0.597614s -- Op min Size 8 byte elements, array size 8192 starting at 0 found at 3459; 100000 repetitions ``` --- stl/inc/algorithm | 96 ++++++ stl/src/vector_algorithms.cpp | 533 ++++++++++++++++++++++++++++++++++ 2 files changed, 629 insertions(+) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 65ecf97335..0281754161 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -34,7 +34,68 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4( const void* _First, const void* _Last, void* _Dest) noexcept; __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( const void* _First, const void* _Last, void* _Dest) noexcept; + +struct _Min_max_t { + const void* _Min; + const void* _Max; +}; + +const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_min_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_min_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; + +const void* __stdcall __std_max_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_max_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_max_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; +const void* __stdcall __std_max_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; + 
+_Min_max_t __stdcall __std_minmax_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_t __stdcall __std_minmax_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; + _END_EXTERN_C + +template +_Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { + using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; + constexpr bool _Signed = _STD is_signed_v<_Ty>; + + if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { + return __std_min_element(_First, _Last); + } else if constexpr (sizeof(_Ty) == 1) { + return static_cast<_Ty*>(const_cast(__std_min_element_1(_First, _Last, _Signed))); + } else if constexpr (sizeof(_Ty) == 2) { + return static_cast<_Ty*>(const_cast(__std_min_element_2(_First, _Last, _Signed))); + } else if constexpr (sizeof(_Ty) == 4) { + return static_cast<_Ty*>(const_cast(__std_min_element_4(_First, _Last, _Signed))); + } else if constexpr (sizeof(_Ty) == 8) { + return static_cast<_Ty*>(const_cast(__std_min_element_8(_First, _Last, _Signed))); + } else { + static_assert(_STD _Always_false<_Ty>, "Unexpected size"); + } +} + +template +_Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { + using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; + constexpr bool _Signed = _STD is_signed_v<_Ty>; + + if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { + return __std_max_element(_First, _Last); + } else if constexpr (sizeof(_Ty) == 1) { + return static_cast<_Ty*>(const_cast(__std_max_element_1(_First, _Last, _Signed))); + } else if constexpr (sizeof(_Ty) == 2) { + return static_cast<_Ty*>(const_cast(__std_max_element_2(_First, _Last, _Signed))); + } else if constexpr (sizeof(_Ty) == 4) { + return 
static_cast<_Ty*>(const_cast(__std_max_element_4(_First, _Last, _Signed))); + } else if constexpr (sizeof(_Ty) == 8) { + return static_cast<_Ty*>(const_cast(__std_max_element_8(_First, _Last, _Signed))); + } else { + static_assert(_STD _Always_false<_Ty>, "Unexpected size"); + } +} #endif // _USE_STD_VECTOR_ALGORITHMS _STD_BEGIN @@ -9228,8 +9289,29 @@ namespace ranges { #endif // __cpp_lib_concepts #endif // _HAS_CXX17 +template +_INLINE_VAR constexpr bool _Is_min_max_optimization_safe = + _Iterator_is_contiguous<_FwdIt>&& + conjunction_v>, is_pointer<_Iter_value_t<_FwdIt>>>, + disjunction>, is_same<_Pr, _RANGES less>, + is_same<_Pr, less<_Iter_value_t<_FwdIt>>>>> && !is_volatile_v>>; + template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element +#if _USE_STD_VECTOR_ALGORITHMS + if (!is_constant_evaluated()) { + if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { + const auto _First_ptr = _To_address(_First); + auto _Result = __std_max_element(_First_ptr, _To_address(_Last)); + if constexpr (is_pointer_v<_FwdIt>) { + return _Result; + } else { + return _First + (_Result - _First_ptr); + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + _FwdIt _Found = _First; if (_First != _Last) { while (++_First != _Last) { @@ -9321,6 +9403,20 @@ namespace ranges { template constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element +#if _USE_STD_VECTOR_ALGORITHMS + if (!is_constant_evaluated()) { + if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { + const auto _First_ptr = _To_address(_First); + auto _Result = __std_min_element(_First_ptr, _To_address(_Last)); + if constexpr (is_pointer_v<_FwdIt>) { + return _Result; + } else { + return _First + (_Result - _First_ptr); + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + _FwdIt _Found = _First; if (_First != _Last) { while (++_First != _Last) { diff --git a/stl/src/vector_algorithms.cpp 
b/stl/src/vector_algorithms.cpp index bfce592432..e2ce361ac5 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -20,9 +20,22 @@ #include #endif // defined(_M_ARM64EC) #include +#include +#include extern "C" long __isa_enabled; +#pragma optimize("t", on) // Override /Os with /Ot for this TU + +static bool _Use_sse42() { + return __isa_enabled & (1 << __ISA_AVAILABLE_SSE42); +} + +struct _Min_max_t { + const void* _Min; + const void* _Max; +}; + template static void _Reverse_tail(_BidIt _First, _BidIt _Last) noexcept { for (; _First != _Last && _First != --_Last; ++_First) { @@ -451,6 +464,526 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( static_cast(_Dest)); } +} // extern "C" + +template +const void* _Min_tail(const void* _First, const void* _Last, const void* _Res, _Ty _Cur) noexcept { + for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { + if (*_Ptr < _Cur) { + _Res = _Ptr; + _Cur = *_Ptr; + } + } + return _Res; +} + +template +const void* _Max_tail(const void* _First, const void* _Last, const void* _Res, _Ty _Cur) noexcept { + for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { + if (_Cur < *_Ptr) { + _Res = _Ptr; + _Cur = *_Ptr; + } + } + return _Res; +} + +template +_Min_max_t _Both_tail(const void* _First, const void* _Last, _Min_max_t& _Res, _Ty _Cur_min, _Ty _Cur_max) noexcept { + for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { + if (*_Ptr < _Cur_min) { + _Res._Min = _Ptr; + _Cur_min = *_Ptr; + } + if (_Cur_min < *_Ptr) { + _Res._Max = _Ptr; + _Cur_max = *_Ptr; + } + } + return _Res; +} + +enum class _Min_max_mode { + _Min_only, + _Max_only, + _Both, +}; + +template <_Min_max_mode _Mode, class _STy, class _UTy> +auto _Minmax_tail( + const void* _First, const void* _Last, _Min_max_t& _Res, bool _Sign, _UTy _Cur_min, _UTy _Cur_max) noexcept { + constexpr _UTy _Cor = (_UTy{1} << (sizeof(_UTy) * CHAR_BIT - 1)); + + if constexpr (_Mode == _Min_max_mode::_Min_only) 
{ + if (_Sign) { + return _Min_tail(_First, _Last, _Res._Min, static_cast<_STy>(_Cur_min)); + } else { + return _Min_tail(_First, _Last, _Res._Min, static_cast<_UTy>(_Cur_min + _Cor)); + } + } else if constexpr (_Mode == _Min_max_mode::_Max_only) { + if (_Sign) { + return _Max_tail(_First, _Last, _Res._Max, static_cast<_STy>(_Cur_max)); + } else { + return _Max_tail(_First, _Last, _Res._Max, static_cast<_UTy>(_Cur_max + _Cor)); + } + } else { + if (_Sign) { + return _Both_tail(_First, _Last, _Res, static_cast<_STy>(_Cur_min), static_cast<_STy>(_Cur_max)); + } else { + return _Both_tail( + _First, _Last, _Res, static_cast<_UTy>(_Cur_min + _Cor), static_cast<_UTy>(_Cur_max + _Cor)); + } + } +} + +struct _Minmax_traits_1 { + using _Signed_t = int8_t; + using _Unsigned_t = uint8_t; + + static constexpr bool _Has_portion_max = true; + static constexpr size_t _Portion_max = 256; + + static constexpr uint8_t _Init_min_val = 0x7F; + static constexpr uint8_t _Init_max_val = 0x80; + + static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + alignas(16) static constexpr uint8_t _Sign_cors[2][16] = { + {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, {}}; + return _mm_sub_epi8(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + } + + static __m128i _Inc() { + return _mm_set1_epi8(1); + } + + static __m128i _H_min(__m128i _Cur) { + const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + __m128i _H_min = _Cur; + _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); + _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_bytes)); + return _H_min; + } + + static __m128i _H_max(__m128i _Cur) { + 
const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + __m128i _H_max = _Cur; + _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(1, 0, 3, 2))); + _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(2, 3, 0, 1))); + _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi8(_H_max, _Shuf_words)); + _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi8(_H_max, _Shuf_bytes)); + return _H_max; + } + + static int8_t _Get_any(__m128i _Cur) { + return static_cast(_mm_cvtsi128_si32(_Cur)); + } + + static uint8_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + return static_cast(_mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_H_pos)))); + } + + static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + return _mm_cmpeq_epi8(_First, _Second); + } + + static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + return _mm_cmpgt_epi8(_First, _Second); + } + + static __m128i _Cmp_lt(__m128i _First, __m128i _Second) { + return _mm_cmplt_epi8(_First, _Second); + } + + static __m128i _Min(__m128i _First, __m128i _Second, __m128i) { + return _mm_min_epi8(_First, _Second); + } + + static __m128i _Max(__m128i _First, __m128i _Second, __m128i) { + return _mm_max_epi8(_First, _Second); + } +}; + +struct _Minmax_traits_2 { + using _Signed_t = int16_t; + using _Unsigned_t = uint16_t; + + static constexpr bool _Has_portion_max = true; + static constexpr size_t _Portion_max = 65536; + + static constexpr uint16_t _Init_min_val = 0x7FFF; + static constexpr uint16_t _Init_max_val = 0x8000; + + static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + alignas(16) static constexpr uint16_t _Sign_cors[2][8] = { + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, {}}; + return _mm_sub_epi16(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + } + + static __m128i _Inc() { + return _mm_set1_epi16(1); + } + 
+ static __m128i _H_min(__m128i _Cur) { + const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + __m128i _H_min = _Cur; + _H_min = _mm_min_epi16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _mm_min_epi16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min = _mm_min_epi16(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); + return _H_min; + } + + static __m128i _H_max(__m128i _Cur) { + const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + __m128i _H_max = _Cur; + _H_max = _mm_max_epi16(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(1, 0, 3, 2))); + _H_max = _mm_max_epi16(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(2, 3, 0, 1))); + _H_max = _mm_max_epi16(_H_max, _mm_shuffle_epi8(_H_max, _Shuf_words)); + return _H_max; + } + + static int16_t _Get_any(__m128i _Cur) { + return static_cast(_mm_cvtsi128_si32(_Cur)); + } + + static uint16_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + uint16_t _Array[8]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); + return _Array[_H_pos >> 1]; + } + + static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + return _mm_cmpeq_epi16(_First, _Second); + } + + static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + return _mm_cmpgt_epi16(_First, _Second); + } + + static __m128i _Cmp_lt(__m128i _First, __m128i _Second) { + return _mm_cmplt_epi16(_First, _Second); + } + + static __m128i _Min(__m128i _First, __m128i _Second, __m128i) { + return _mm_min_epi16(_First, _Second); + } + + static __m128i _Max(__m128i _First, __m128i _Second, __m128i) { + return _mm_max_epi16(_First, _Second); + } +}; + + +struct _Minmax_traits_4 { + using _Signed_t = int32_t; + using _Unsigned_t = uint32_t; + +#ifdef _M_IX86 + static constexpr bool _Has_portion_max = false; +#else // ^^^ 32-bit ^^^ / vvv 64-bit vvv + static constexpr bool _Has_portion_max = true; + static constexpr size_t _Portion_max = 
0x1'0000'0000ULL; +#endif // ^^^ 64-bit ^^^ + + static constexpr uint32_t _Init_min_val = 0x7FFF'FFFFUL; + static constexpr uint32_t _Init_max_val = 0x8000'0000UL; + + static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + alignas(16) static constexpr uint32_t _Sign_cors[2][4] = { + 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, {}}; + return _mm_sub_epi32(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + } + + static __m128i _Inc() { + return _mm_set1_epi32(1); + } + + static __m128i _H_min(__m128i _Cur) { + __m128i _H_min = _Cur; + _H_min = _mm_min_epi32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _mm_min_epi32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + return _H_min; + } + + static __m128i _H_max(__m128i _Cur) { + __m128i _H_max = _Cur; + _H_max = _mm_max_epi32(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(1, 0, 3, 2))); + _H_max = _mm_max_epi32(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(2, 3, 0, 1))); + return _H_max; + } + + static int32_t _Get_any(__m128i _Cur) { + return static_cast(_mm_cvtsi128_si32(_Cur)); + } + + static uint32_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + uint32_t _Array[4]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); + return _Array[_H_pos >> 2]; + } + + static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + return _mm_cmpeq_epi32(_First, _Second); + } + + static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + return _mm_cmpgt_epi32(_First, _Second); + } + + static __m128i _Cmp_lt(__m128i _First, __m128i _Second) { + return _mm_cmplt_epi32(_First, _Second); + } + + static __m128i _Min(__m128i _First, __m128i _Second, __m128i) { + return _mm_min_epi32(_First, _Second); + } + + static __m128i _Max(__m128i _First, __m128i _Second, __m128i) { + return _mm_max_epi32(_First, _Second); + } +}; + +struct _Minmax_traits_8 { + using _Signed_t = int64_t; + using _Unsigned_t = uint64_t; + + static constexpr bool _Has_portion_max = false; 
+ + static constexpr uint64_t _Init_min_val = 0x7FFF'FFFF'FFFF'FFFFULL; + static constexpr uint64_t _Init_max_val = 0x8000'0000'0000'0000ULL; + + static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + alignas(16) static constexpr uint64_t _Sign_cors[2][2] = { + 0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL, {}}; + return _mm_sub_epi64(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + } + + static __m128i _Inc() { + return _mm_set1_epi64x(1); + } + + static __m128i _H_min(__m128i _Cur) { + int64_t _H_min_a = _mm_cvtsi128_si64(_Cur); + int64_t _H_min_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + if (_H_min_b < _H_min_a) { + _H_min_a = _H_min_b; + } + return _mm_set1_epi64x(_H_min_a); + } + + static __m128i _H_max(__m128i _Cur) { + int64_t _H_max_a = _mm_cvtsi128_si64(_Cur); + int64_t _H_max_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + if (_H_max_b > _H_max_a) { + _H_max_a = _H_max_b; + } + return _mm_set1_epi64x(_H_max_a); + } + + static int64_t _Get_any(__m128i _Cur) { + return static_cast(_mm_cvtsi128_si64(_Cur)); + } + + static uint64_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + uint64_t _Array[2]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); + return _Array[_H_pos >> 3]; + } + + static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + return _mm_cmpeq_epi64(_First, _Second); + } + + static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + return _mm_cmpgt_epi64(_First, _Second); + } + + static __m128i _Cmp_lt(__m128i _First, __m128i _Second) { + __m128i _Gt = _mm_cmpgt_epi64(_Second, _First); // less or equal + __m128i _Eq = _mm_cmpeq_epi64(_First, _Second); + return _mm_andnot_si128(_Eq, _Gt); + } + + static __m128i _Min(__m128i _First, __m128i _Second, __m128i _Mask) { + return _mm_blendv_epi8(_First, _Second, _Mask); + } + + static __m128i _Max(__m128i _First, __m128i _Second, __m128i _Mask) { + return _mm_blendv_epi8(_First, _Second, _Mask); + } +}; + +template <_Min_max_mode _Mode, class _Traits> 
+auto _Minmax_element(const void* _First, const void* _Last, bool _Sign) noexcept { + _Min_max_t _Res = {_First, _First}; + auto _Base = static_cast(_First); + typename _Traits::_Signed_t _Cur_min_val = _Traits::_Init_min_val; + typename _Traits::_Signed_t _Cur_max_val = _Traits::_Init_max_val; + + if (_Byte_length(_First, _Last) >= 16 && _Use_sse42()) { + const void* _Stop_at = _First; + + size_t _Sse_size = _Byte_length(_First, _Last) & ~size_t{0xF}; + size_t _Portion_size = _Sse_size; + + if constexpr (_Traits::_Has_portion_max) { + constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; + if (_Portion_size > _Max_portion_size) { + _Portion_size = _Max_portion_size; + } + } + + _Sse_size -= _Portion_size; + + _Advance_bytes(_Stop_at, _Portion_size); + + const __m128i _Inc = _Traits::_Inc(); + __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); + __m128i _Cur_vals_min = _Cur_vals; + __m128i _Cur_idx_min = _mm_setzero_si128(); + __m128i _Cur_vals_max = _Cur_vals; + __m128i _Cur_idx_max = _mm_setzero_si128(); + __m128i _Cur_idx = _mm_setzero_si128(); + + for (;;) { + _Advance_bytes(_First, 16); + _Cur_idx = _mm_add_epi64(_Cur_idx, _Inc); + + if (_First == _Stop_at) { + + if constexpr (_Mode != _Min_max_mode::_Max_only) { + __m128i _H_min = _Traits::_H_min(_Cur_vals_min); + typename _Traits::_Signed_t _H_min_val = _Traits::_Get_any(_H_min); + if (_H_min_val < _Cur_min_val) { + _Cur_min_val = _H_min_val; + unsigned long _H_pos; + _BitScanForward(&_H_pos, _mm_movemask_epi8(_Traits::_Cmp_eq(_H_min, _Cur_vals_min))); + typename _Traits::_Unsigned_t _V_pos = _Traits::_Get_v_pos(_Cur_idx_min, _H_pos); + _Res._Min = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + } + } + + if constexpr (_Mode != _Min_max_mode::_Min_only) { + __m128i _H_max = _Traits::_H_max(_Cur_vals_min); + typename _Traits::_Signed_t _H_max_val = _Traits::_Get_any(_H_max); + if (_H_max_val < _Cur_max_val) { + _Cur_max_val = _H_max_val; + unsigned 
long _H_pos; + _BitScanForward(&_H_pos, _mm_movemask_epi8(_Traits::_Cmp_eq(_H_max, _Cur_vals_max))); + typename _Traits::_Unsigned_t _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); + _Res._Max = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + } + } + + if constexpr (_Traits::_Has_portion_max) { + size_t _Portion_size = _Sse_size; + if (_Portion_size == 0) { + break; + } + + constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; + if (_Portion_size > _Max_portion_size) { + _Portion_size = _Max_portion_size; + } + + _Advance_bytes(_Stop_at, _Portion_size); + _Sse_size -= _Portion_size; + + _Base = static_cast(_First); + + _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); + + if constexpr (_Mode != _Min_max_mode::_Max_only) { + _Cur_vals_min = _Cur_vals; + _Cur_idx_min = _mm_setzero_si128(); + } + if constexpr (_Mode != _Min_max_mode::_Min_only) { + _Cur_vals_max = _Cur_vals; + _Cur_idx_max = _mm_setzero_si128(); + } + continue; + } else { + break; + } + } + + __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); + + if constexpr (_Mode != _Min_max_mode::_Max_only) { + const __m128i _Is_less = _Traits::_Cmp_lt(_Cur_vals, _Cur_vals_min); + _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); + _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); + } + + if constexpr (_Mode != _Min_max_mode::_Min_only) { + const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); + _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); + } + }; + } + + return _Minmax_tail<_Mode, _Traits::_Signed_t, _Traits::_Unsigned_t>( + _First, _Last, _Res, _Sign, _Cur_min_val, _Cur_max_val); +} + + +extern "C" { + +const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Min_only, 
_Minmax_traits_1>(_First, _Last, _Signed); +} + +const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_2>(_First, _Last, _Signed); +} + +const void* __stdcall __std_min_element_4(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_4>(_First, _Last, _Signed); +} + +const void* __stdcall __std_min_element_8(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_8>(_First, _Last, _Signed); +} + +const void* __stdcall __std_max_element_1(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_1>(_First, _Last, _Signed); +} + +const void* __stdcall __std_max_element_2(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_2>(_First, _Last, _Signed); +} + +const void* __stdcall __std_max_element_4(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_4>(_First, _Last, _Signed); +} + +const void* __stdcall __std_max_element_8(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_8>(_First, _Last, _Signed); +} + +_Min_max_t __stdcall __std_minmax_element_1(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_1>(_First, _Last, _Signed); +} + +_Min_max_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_2>(_First, _Last, _Signed); +} + +_Min_max_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) 
noexcept { + return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_4>(_First, _Last, _Signed); +} + +_Min_max_t __stdcall __std_minmax_element_8(const void* _First, const void* _Last, bool _Signed) noexcept { + return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_8>(_First, _Last, _Signed); +} } // extern "C" From 19ba1d0e97f686f5eb4afe1c323e8b30162bf037 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 18:12:28 +0200 Subject: [PATCH 02/60] fix copypasta --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e2ce361ac5..825b1dfe5e 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -871,9 +871,9 @@ auto _Minmax_element(const void* _First, const void* _Last, bool _Sign) noexcept } if constexpr (_Mode != _Min_max_mode::_Min_only) { - __m128i _H_max = _Traits::_H_max(_Cur_vals_min); + __m128i _H_max = _Traits::_H_max(_Cur_vals_max); typename _Traits::_Signed_t _H_max_val = _Traits::_Get_any(_H_max); - if (_H_max_val < _Cur_max_val) { + if (_Cur_max_val < _H_max_val) { _Cur_max_val = _H_max_val; unsigned long _H_pos; _BitScanForward(&_H_pos, _mm_movemask_epi8(_Traits::_Cmp_eq(_H_max, _Cur_vals_max))); From 44b19c97702414c324be441402104de3d8feb088 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 18:24:17 +0200 Subject: [PATCH 03/60] Guard C++ features --- stl/inc/algorithm | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 0281754161..9ca93bfa71 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -21,7 +21,7 @@ _STL_DISABLE_CLANG_WARNINGS _EXTERN_C // The "noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms // won't be stored beyond the lifetime of the function, and that the function will only reference arrays denoted by -// those pointers. 
The optimizer also assumes in that case that a pointer parameter is not returned to the caller via +// those pointers. The optimizer also assurames in that case that a pointer parameter is not returned to the caller via // the return value, so functions using "noalias" must usually return void. This attribute is valuable because these // functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the // compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to @@ -9293,13 +9293,19 @@ template _INLINE_VAR constexpr bool _Is_min_max_optimization_safe = _Iterator_is_contiguous<_FwdIt>&& conjunction_v>, is_pointer<_Iter_value_t<_FwdIt>>>, - disjunction>, is_same<_Pr, _RANGES less>, + disjunction>, +#ifdef __cpp_lib_concepts + is_same<_Pr, _RANGES less>, +#endif // __cpp_lib_concepts is_same<_Pr, less<_Iter_value_t<_FwdIt>>>>> && !is_volatile_v>>; template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element #if _USE_STD_VECTOR_ALGORITHMS - if (!is_constant_evaluated()) { +#if _HAS_CXX20 + if (!is_constant_evaluated()) +#endif // _HAS_CXX20 + { if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { const auto _First_ptr = _To_address(_First); auto _Result = __std_max_element(_First_ptr, _To_address(_Last)); @@ -9404,7 +9410,10 @@ namespace ranges { template constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element #if _USE_STD_VECTOR_ALGORITHMS - if (!is_constant_evaluated()) { +#if _HAS_CXX20 + if (!is_constant_evaluated()) +#endif // _HAS_CXX20 + { if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { const auto _First_ptr = _To_address(_First); auto _Result = __std_min_element(_First_ptr, _To_address(_Last)); From 7c233f4ca98d31aabf4cf58518be9bc1a4f2eace Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 18:25:14 +0200 Subject: [PATCH 04/60] 
typo --- stl/inc/algorithm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 9ca93bfa71..f3331919d3 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -21,7 +21,7 @@ _STL_DISABLE_CLANG_WARNINGS _EXTERN_C // The "noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms // won't be stored beyond the lifetime of the function, and that the function will only reference arrays denoted by -// those pointers. The optimizer also assurames in that case that a pointer parameter is not returned to the caller via +// those pointers. The optimizer also assumes in that case that a pointer parameter is not returned to the caller via // the return value, so functions using "noalias" must usually return void. This attribute is valuable because these // functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the // compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to From e24501850e2fc8bc97aecbb077758658c42ff5c9 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 18:54:01 +0200 Subject: [PATCH 05/60] @miscco review --- stl/inc/algorithm | 18 ++--- stl/src/vector_algorithms.cpp | 140 +++++++++++++++++++--------------- 2 files changed, 87 insertions(+), 71 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index f3331919d3..51ca79123e 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9290,7 +9290,7 @@ namespace ranges { #endif // _HAS_CXX17 template -_INLINE_VAR constexpr bool _Is_min_max_optimization_safe = +_INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = _Iterator_is_contiguous<_FwdIt>&& conjunction_v>, is_pointer<_Iter_value_t<_FwdIt>>>, disjunction>, @@ -9302,13 +9302,13 @@ _INLINE_VAR constexpr bool _Is_min_max_optimization_safe = template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr 
_Pred) { // find largest element #if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { #if _HAS_CXX20 - if (!is_constant_evaluated()) + if (!_STD is_constant_evaluated()) #endif // _HAS_CXX20 - { - if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { + { const auto _First_ptr = _To_address(_First); - auto _Result = __std_max_element(_First_ptr, _To_address(_Last)); + const auto _Result = __std_max_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { return _Result; } else { @@ -9410,13 +9410,13 @@ namespace ranges { template constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element #if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { #if _HAS_CXX20 - if (!is_constant_evaluated()) + if (!_STD is_constant_evaluated()) #endif // _HAS_CXX20 - { - if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { + { const auto _First_ptr = _To_address(_First); - auto _Result = __std_min_element(_First_ptr, _To_address(_Last)); + const auto _Result = __std_min_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { return _Result; } else { diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 825b1dfe5e..0b7e4d23f6 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -474,6 +474,7 @@ const void* _Min_tail(const void* _First, const void* _Last, const void* _Res, _ _Cur = *_Ptr; } } + return _Res; } @@ -485,6 +486,7 @@ const void* _Max_tail(const void* _First, const void* _Last, const void* _Res, _ _Cur = *_Ptr; } } + return _Res; } @@ -495,11 +497,13 @@ _Min_max_t _Both_tail(const void* _First, const void* _Last, _Min_max_t& _Res, _ _Res._Min = _Ptr; _Cur_min = *_Ptr; } + if (_Cur_min < *_Ptr) { _Res._Max = _Ptr; _Cur_max = *_Ptr; } } + return _Res; } @@ -546,17 +550,17 @@ struct _Minmax_traits_1 { static constexpr uint8_t _Init_min_val = 
0x7F; static constexpr uint8_t _Init_max_val = 0x80; - static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr uint8_t _Sign_cors[2][16] = { {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, {}}; return _mm_sub_epi8(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() { + static __m128i _Inc() noexcept { return _mm_set1_epi8(1); } - static __m128i _H_min(__m128i _Cur) { + static __m128i _H_min(const __m128i _Cur) noexcept { const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); @@ -568,7 +572,7 @@ struct _Minmax_traits_1 { return _H_min; } - static __m128i _H_max(__m128i _Cur) { + static __m128i _H_max(const __m128i _Cur) noexcept { const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); @@ -580,31 +584,31 @@ struct _Minmax_traits_1 { return _H_max; } - static int8_t _Get_any(__m128i _Cur) { + static int8_t _Get_any(const __m128i _Cur) noexcept { return static_cast(_mm_cvtsi128_si32(_Cur)); } - static uint8_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + static uint8_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { return static_cast(_mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_H_pos)))); } - static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + static __m128i _Cmp_eq(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpeq_epi8(_First, _Second); } - static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_gt(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpgt_epi8(_First, _Second); } - static __m128i 
_Cmp_lt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmplt_epi8(_First, _Second); } - static __m128i _Min(__m128i _First, __m128i _Second, __m128i) { + static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi8(_First, _Second); } - static __m128i _Max(__m128i _First, __m128i _Second, __m128i) { + static __m128i _Max(const __m128i _First, __m128i _Second, __m128i) noexcept { return _mm_max_epi8(_First, _Second); } }; @@ -619,17 +623,17 @@ struct _Minmax_traits_2 { static constexpr uint16_t _Init_min_val = 0x7FFF; static constexpr uint16_t _Init_max_val = 0x8000; - static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr uint16_t _Sign_cors[2][8] = { 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, {}}; return _mm_sub_epi16(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() { + static __m128i _Inc() noexcept { return _mm_set1_epi16(1); } - static __m128i _H_min(__m128i _Cur) { + static __m128i _H_min(const __m128i _Cur) noexcept { const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); __m128i _H_min = _Cur; @@ -639,7 +643,7 @@ struct _Minmax_traits_2 { return _H_min; } - static __m128i _H_max(__m128i _Cur) { + static __m128i _H_max(const __m128i _Cur) noexcept { const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); __m128i _H_max = _Cur; @@ -649,33 +653,33 @@ struct _Minmax_traits_2 { return _H_max; } - static int16_t _Get_any(__m128i _Cur) { + static int16_t _Get_any(const __m128i _Cur) noexcept { return static_cast(_mm_cvtsi128_si32(_Cur)); } - static uint16_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + static uint16_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { uint16_t 
_Array[8]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); return _Array[_H_pos >> 1]; } - static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + static __m128i _Cmp_eq(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpeq_epi16(_First, _Second); } - static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_gt(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpgt_epi16(_First, _Second); } - static __m128i _Cmp_lt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmplt_epi16(_First, _Second); } - static __m128i _Min(__m128i _First, __m128i _Second, __m128i) { + static __m128i _Min(const __m128i _First, const __m128i _Second, const __m128i) noexcept { return _mm_min_epi16(_First, _Second); } - static __m128i _Max(__m128i _First, __m128i _Second, __m128i) { + static __m128i _Max(const __m128i _First, const __m128i _Second, const __m128i) noexcept { return _mm_max_epi16(_First, _Second); } }; @@ -695,57 +699,57 @@ struct _Minmax_traits_4 { static constexpr uint32_t _Init_min_val = 0x7FFF'FFFFUL; static constexpr uint32_t _Init_max_val = 0x8000'0000UL; - static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr uint32_t _Sign_cors[2][4] = { 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, {}}; return _mm_sub_epi32(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() { + static __m128i _Inc() noexcept { return _mm_set1_epi32(1); } - static __m128i _H_min(__m128i _Cur) { + static __m128i _H_min(const __m128i _Cur) noexcept { __m128i _H_min = _Cur; _H_min = _mm_min_epi32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); _H_min = _mm_min_epi32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); return _H_min; } - static __m128i _H_max(__m128i _Cur) { + static 
__m128i _H_max(const __m128i _Cur) noexcept { __m128i _H_max = _Cur; _H_max = _mm_max_epi32(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(1, 0, 3, 2))); _H_max = _mm_max_epi32(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(2, 3, 0, 1))); return _H_max; } - static int32_t _Get_any(__m128i _Cur) { + static int32_t _Get_any(const __m128i _Cur) noexcept { return static_cast(_mm_cvtsi128_si32(_Cur)); } - static uint32_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + static uint32_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { uint32_t _Array[4]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); return _Array[_H_pos >> 2]; } - static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + static __m128i _Cmp_eq(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpeq_epi32(_First, _Second); } - static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_gt(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpgt_epi32(_First, _Second); } - static __m128i _Cmp_lt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmplt_epi32(_First, _Second); } - static __m128i _Min(__m128i _First, __m128i _Second, __m128i) { + static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi32(_First, _Second); } - static __m128i _Max(__m128i _First, __m128i _Second, __m128i) { + static __m128i _Max(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_max_epi32(_First, _Second); } }; @@ -759,17 +763,17 @@ struct _Minmax_traits_8 { static constexpr uint64_t _Init_min_val = 0x7FFF'FFFF'FFFF'FFFFULL; static constexpr uint64_t _Init_max_val = 0x8000'0000'0000'0000ULL; - static __m128i _Sign_cor(__m128i _Val, bool _Sign) { + static __m128i _Sign_cor(__m128i _Val, const bool _Sign) { alignas(16) static constexpr uint64_t _Sign_cors[2][2] = { 0x8000'0000'0000'0000ULL, 
0x8000'0000'0000'0000ULL, {}}; return _mm_sub_epi64(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() { + static __m128i _Inc() noexcept { return _mm_set1_epi64x(1); } - static __m128i _H_min(__m128i _Cur) { + static __m128i _H_min(const __m128i _Cur) noexcept { int64_t _H_min_a = _mm_cvtsi128_si64(_Cur); int64_t _H_min_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); if (_H_min_b < _H_min_a) { @@ -778,7 +782,7 @@ struct _Minmax_traits_8 { return _mm_set1_epi64x(_H_min_a); } - static __m128i _H_max(__m128i _Cur) { + static __m128i _H_max(const __m128i _Cur) noexcept { int64_t _H_max_a = _mm_cvtsi128_si64(_Cur); int64_t _H_max_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); if (_H_max_b > _H_max_a) { @@ -787,41 +791,41 @@ struct _Minmax_traits_8 { return _mm_set1_epi64x(_H_max_a); } - static int64_t _Get_any(__m128i _Cur) { + static int64_t _Get_any(const __m128i _Cur) noexcept { return static_cast(_mm_cvtsi128_si64(_Cur)); } - static uint64_t _Get_v_pos(__m128i _Idx, unsigned long _H_pos) { + static uint64_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { uint64_t _Array[2]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); return _Array[_H_pos >> 3]; } - static __m128i _Cmp_eq(__m128i _First, __m128i _Second) { + static __m128i _Cmp_eq(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpeq_epi64(_First, _Second); } - static __m128i _Cmp_gt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_gt(const __m128i _First, const __m128i _Second) noexcept { return _mm_cmpgt_epi64(_First, _Second); } - static __m128i _Cmp_lt(__m128i _First, __m128i _Second) { + static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { __m128i _Gt = _mm_cmpgt_epi64(_Second, _First); // less or equal __m128i _Eq = _mm_cmpeq_epi64(_First, _Second); return _mm_andnot_si128(_Eq, _Gt); } - static __m128i _Min(__m128i _First, __m128i _Second, __m128i _Mask) { + static __m128i 
_Min(const __m128i _First, const __m128i _Second, const __m128i _Mask) noexcept { return _mm_blendv_epi8(_First, _Second, _Mask); } - static __m128i _Max(__m128i _First, __m128i _Second, __m128i _Mask) { + static __m128i _Max(const __m128i _First, const __m128i _Second, const __m128i _Mask) noexcept { return _mm_blendv_epi8(_First, _Second, _Mask); } }; template <_Min_max_mode _Mode, class _Traits> -auto _Minmax_element(const void* _First, const void* _Last, bool _Sign) noexcept { +auto _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { _Min_max_t _Res = {_First, _First}; auto _Base = static_cast(_First); typename _Traits::_Signed_t _Cur_min_val = _Traits::_Init_min_val; @@ -859,8 +863,8 @@ auto _Minmax_element(const void* _First, const void* _Last, bool _Sign) noexcept if (_First == _Stop_at) { if constexpr (_Mode != _Min_max_mode::_Max_only) { - __m128i _H_min = _Traits::_H_min(_Cur_vals_min); - typename _Traits::_Signed_t _H_min_val = _Traits::_Get_any(_H_min); + __m128i _H_min = _Traits::_H_min(_Cur_vals_min); + auto _H_min_val = _Traits::_Get_any(_H_min); if (_H_min_val < _Cur_min_val) { _Cur_min_val = _H_min_val; unsigned long _H_pos; @@ -871,8 +875,8 @@ auto _Minmax_element(const void* _First, const void* _Last, bool _Sign) noexcept } if constexpr (_Mode != _Min_max_mode::_Min_only) { - __m128i _H_max = _Traits::_H_max(_Cur_vals_max); - typename _Traits::_Signed_t _H_max_val = _Traits::_Get_any(_H_max); + __m128i _H_max = _Traits::_H_max(_Cur_vals_max); + auto _H_max_val = _Traits::_Get_any(_H_max); if (_Cur_max_val < _H_max_val) { _Cur_max_val = _H_max_val; unsigned long _H_pos; @@ -927,7 +931,7 @@ auto _Minmax_element(const void* _First, const void* _Last, bool _Sign) noexcept _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); } - }; + } } return _Minmax_tail<_Mode, _Traits::_Signed_t, _Traits::_Unsigned_t>( @@ -937,51 +941,63 @@ 
auto _Minmax_element(const void* _First, const void* _Last, bool _Sign) noexcept extern "C" { -const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_min_element_1( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_1>(_First, _Last, _Signed); } -const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_min_element_2( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_2>(_First, _Last, _Signed); } -const void* __stdcall __std_min_element_4(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_min_element_4( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_4>(_First, _Last, _Signed); } -const void* __stdcall __std_min_element_8(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_min_element_8( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_8>(_First, _Last, _Signed); } -const void* __stdcall __std_max_element_1(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_max_element_1( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_1>(_First, _Last, _Signed); } -const void* __stdcall __std_max_element_2(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_max_element_2( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return 
_Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_2>(_First, _Last, _Signed); } -const void* __stdcall __std_max_element_4(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_max_element_4( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_4>(_First, _Last, _Signed); } -const void* __stdcall __std_max_element_8(const void* _First, const void* _Last, bool _Signed) noexcept { +const void* __stdcall __std_max_element_8( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_8>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_1(const void* _First, const void* _Last, bool _Signed) noexcept { +_Min_max_t __stdcall __std_minmax_element_1( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_1>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept { +_Min_max_t __stdcall __std_minmax_element_2( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_2>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept { +_Min_max_t __stdcall __std_minmax_element_4( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_4>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_8(const void* _First, const void* _Last, bool _Signed) noexcept { +_Min_max_t __stdcall __std_minmax_element_8( + const void* const _First, const void* const _Last, const bool _Signed) noexcept { return 
_Minmax_element<_Min_max_mode::_Both, _Minmax_traits_8>(_First, _Last, _Signed); } From 3bf81a7f49df793ad5784f5a2aa96f8c73a77895 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 19:05:09 +0200 Subject: [PATCH 06/60] constexpr in C++14 / C++17 :-( Need to guard in C++20 to call _STD is_constant_evaluated() --- stl/inc/algorithm | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 51ca79123e..da111d4949 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9301,12 +9301,9 @@ _INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element -#if _USE_STD_VECTOR_ALGORITHMS +#if _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { -#if _HAS_CXX20 - if (!_STD is_constant_evaluated()) -#endif // _HAS_CXX20 - { + if (!_STD is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_max_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { @@ -9316,7 +9313,7 @@ constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS +#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 _FwdIt _Found = _First; if (_First != _Last) { @@ -9409,12 +9406,9 @@ namespace ranges { template constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element -#if _USE_STD_VECTOR_ALGORITHMS +#if _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { -#if _HAS_CXX20 - if (!_STD is_constant_evaluated()) -#endif // _HAS_CXX20 - { + if (!_STD is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_min_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { @@ -9424,7
+9418,7 @@ constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS +#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 _FwdIt _Found = _First; if (_First != _Last) { From f781a7008cbdff64342962891ef05282a8a20918 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 20:04:24 +0200 Subject: [PATCH 07/60] Fix algorithm --- stl/src/vector_algorithms.cpp | 154 +++++++++++++++++++++++----------- 1 file changed, 103 insertions(+), 51 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 0b7e4d23f6..87552c7bff 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -467,7 +467,7 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( } // extern "C" template -const void* _Min_tail(const void* _First, const void* _Last, const void* _Res, _Ty _Cur) noexcept { +const void* _Min_tail(const void* const _First, const void* const _Last, const void* _Res, _Ty _Cur) noexcept { for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { if (*_Ptr < _Cur) { _Res = _Ptr; @@ -479,7 +479,7 @@ const void* _Min_tail(const void* _First, const void* _Last, const void* _Res, _ } template -const void* _Max_tail(const void* _First, const void* _Last, const void* _Res, _Ty _Cur) noexcept { +const void* _Max_tail(const void* const _First, const void* const _Last, const void* _Res, _Ty _Cur) noexcept { for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { if (_Cur < *_Ptr) { _Res = _Ptr; @@ -491,7 +491,8 @@ const void* _Max_tail(const void* _First, const void* _Last, const void* _Res, _ } template -_Min_max_t _Both_tail(const void* _First, const void* _Last, _Min_max_t& _Res, _Ty _Cur_min, _Ty _Cur_max) noexcept { +_Min_max_t _Both_tail( + const void* const _First, const void* const _Last, _Min_max_t& _Res, _Ty _Cur_min, _Ty _Cur_max) noexcept { for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { if (*_Ptr < 
_Cur_min) { _Res._Min = _Ptr; @@ -547,11 +548,11 @@ struct _Minmax_traits_1 { static constexpr bool _Has_portion_max = true; static constexpr size_t _Portion_max = 256; - static constexpr uint8_t _Init_min_val = 0x7F; - static constexpr uint8_t _Init_max_val = 0x80; + static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7F); + static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x80); static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { - alignas(16) static constexpr uint8_t _Sign_cors[2][16] = { + alignas(16) static constexpr _Unsigned_t _Sign_cors[2][16] = { {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, {}}; return _mm_sub_epi8(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } @@ -584,11 +585,24 @@ struct _Minmax_traits_1 { return _H_max; } - static int8_t _Get_any(const __m128i _Cur) noexcept { - return static_cast(_mm_cvtsi128_si32(_Cur)); + + static __m128i _H_min_u(const __m128i _Cur) noexcept { + const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + __m128i _H_min = _Cur; + _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); + _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_bytes)); + return _H_min; + } + + static _Signed_t _Get_any(const __m128i _Cur) noexcept { + return static_cast<_Signed_t>(_mm_cvtsi128_si32(_Cur)); } - static uint8_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { + static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { return static_cast(_mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_H_pos)))); } @@ -620,11 +634,11 
@@ struct _Minmax_traits_2 { static constexpr bool _Has_portion_max = true; static constexpr size_t _Portion_max = 65536; - static constexpr uint16_t _Init_min_val = 0x7FFF; - static constexpr uint16_t _Init_max_val = 0x8000; + static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7FFF); + static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x8000); static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { - alignas(16) static constexpr uint16_t _Sign_cors[2][8] = { + alignas(16) static constexpr _Unsigned_t _Sign_cors[2][8] = { 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, {}}; return _mm_sub_epi16(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } @@ -653,12 +667,23 @@ struct _Minmax_traits_2 { return _H_max; } - static int16_t _Get_any(const __m128i _Cur) noexcept { - return static_cast(_mm_cvtsi128_si32(_Cur)); + static __m128i _H_min_u(const __m128i _Cur) noexcept { + const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + + __m128i _H_min = _Cur; + _H_min = _mm_min_epu16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _mm_min_epu16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min = _mm_min_epu16(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); + return _H_min; + } + + static _Signed_t _Get_any(const __m128i _Cur) noexcept { + return static_cast<_Signed_t>(_mm_cvtsi128_si32(_Cur)); } - static uint16_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { - uint16_t _Array[8]; + static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { + _Unsigned_t _Array[8]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); return _Array[_H_pos >> 1]; } @@ -696,11 +721,11 @@ struct _Minmax_traits_4 { static constexpr size_t _Portion_max = 0x1'0000'0000ULL; #endif // 
^^^ 64-bit ^^^ - static constexpr uint32_t _Init_min_val = 0x7FFF'FFFFUL; - static constexpr uint32_t _Init_max_val = 0x8000'0000UL; + static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7FFF'FFFFUL); + static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x8000'0000UL); static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { - alignas(16) static constexpr uint32_t _Sign_cors[2][4] = { + alignas(16) static constexpr _Unsigned_t _Sign_cors[2][4] = { 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, {}}; return _mm_sub_epi32(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } @@ -723,12 +748,19 @@ struct _Minmax_traits_4 { return _H_max; } - static int32_t _Get_any(const __m128i _Cur) noexcept { - return static_cast(_mm_cvtsi128_si32(_Cur)); + static __m128i _H_min_u(const __m128i _Cur) noexcept { + __m128i _H_min = _Cur; + _H_min = _mm_min_epu32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _mm_min_epu32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + return _H_min; } - static uint32_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { - uint32_t _Array[4]; + static _Signed_t _Get_any(const __m128i _Cur) noexcept { + return static_cast<_Signed_t>(_mm_cvtsi128_si32(_Cur)); + } + + static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { + _Unsigned_t _Array[4]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); return _Array[_H_pos >> 2]; } @@ -760,11 +792,11 @@ struct _Minmax_traits_8 { static constexpr bool _Has_portion_max = false; - static constexpr uint64_t _Init_min_val = 0x7FFF'FFFF'FFFF'FFFFULL; - static constexpr uint64_t _Init_max_val = 0x8000'0000'0000'0000ULL; + static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7FFF'FFFF'FFFF'FFFFULL); + static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x8000'0000'0000'0000ULL); static __m128i _Sign_cor(__m128i _Val, 
const bool _Sign) { - alignas(16) static constexpr uint64_t _Sign_cors[2][2] = { + alignas(16) static constexpr _Unsigned_t _Sign_cors[2][2] = { 0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL, {}}; return _mm_sub_epi64(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } @@ -774,8 +806,8 @@ struct _Minmax_traits_8 { } static __m128i _H_min(const __m128i _Cur) noexcept { - int64_t _H_min_a = _mm_cvtsi128_si64(_Cur); - int64_t _H_min_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + _Signed_t _H_min_a = _mm_cvtsi128_si64(_Cur); + _Signed_t _H_min_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); if (_H_min_b < _H_min_a) { _H_min_a = _H_min_b; } @@ -783,20 +815,30 @@ struct _Minmax_traits_8 { } static __m128i _H_max(const __m128i _Cur) noexcept { - int64_t _H_max_a = _mm_cvtsi128_si64(_Cur); - int64_t _H_max_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + _Signed_t _H_max_a = _mm_cvtsi128_si64(_Cur); + _Signed_t _H_max_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); if (_H_max_b > _H_max_a) { _H_max_a = _H_max_b; } return _mm_set1_epi64x(_H_max_a); } - static int64_t _Get_any(const __m128i _Cur) noexcept { - return static_cast(_mm_cvtsi128_si64(_Cur)); + static __m128i _H_min_u(const __m128i _Cur) noexcept { + _Unsigned_t _H_min_a = _mm_cvtsi128_si64(_Cur); + _Unsigned_t _H_min_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + if (_H_min_b < _H_min_a) { + _H_min_a = _H_min_b; + } + return _mm_set1_epi64x(_H_min_a); + } + + + static _Signed_t _Get_any(const __m128i _Cur) noexcept { + return static_cast<_Signed_t>(_mm_cvtsi128_si64(_Cur)); } - static uint64_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { - uint64_t _Array[2]; + static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { + _Unsigned_t _Array[2]; _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); return _Array[_H_pos >> 3]; } @@ -826,10 +868,10 @@ struct _Minmax_traits_8 { template <_Min_max_mode _Mode, class _Traits> auto 
_Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { - _Min_max_t _Res = {_First, _First}; - auto _Base = static_cast(_First); - typename _Traits::_Signed_t _Cur_min_val = _Traits::_Init_min_val; - typename _Traits::_Signed_t _Cur_max_val = _Traits::_Init_max_val; + _Min_max_t _Res = {_First, _First}; + auto _Base = static_cast(_First); + auto _Cur_min_val = _Traits::_Init_min_val; + auto _Cur_max_val = _Traits::_Init_max_val; if (_Byte_length(_First, _Last) >= 16 && _Use_sse42()) { const void* _Stop_at = _First; @@ -863,31 +905,41 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si if (_First == _Stop_at) { if constexpr (_Mode != _Min_max_mode::_Max_only) { - __m128i _H_min = _Traits::_H_min(_Cur_vals_min); - auto _H_min_val = _Traits::_Get_any(_H_min); + const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); + const auto _H_min_val = _Traits::_Get_any(_H_min); if (_H_min_val < _Cur_min_val) { - _Cur_min_val = _H_min_val; + _Cur_min_val = _H_min_val; + const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_min, _Cur_vals_min); + const __m128i _Idx_min_val = _mm_blendv_epi8(_mm_set1_epi32(-1), _Cur_idx_min, _Eq_mask); + __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); + unsigned long _H_pos; - _BitScanForward(&_H_pos, _mm_movemask_epi8(_Traits::_Cmp_eq(_H_min, _Cur_vals_min))); - typename _Traits::_Unsigned_t _V_pos = _Traits::_Get_v_pos(_Cur_idx_min, _H_pos); - _Res._Min = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + _BitScanForward(&_H_pos, + _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_min, _Idx_min_val)) & _mm_movemask_epi8(_Eq_mask)); + const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_min, _H_pos); + _Res._Min = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); } } if constexpr (_Mode != _Min_max_mode::_Min_only) { - __m128i _H_max = _Traits::_H_max(_Cur_vals_max); - auto _H_max_val = _Traits::_Get_any(_H_max); + const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); + const auto 
_H_max_val = _Traits::_Get_any(_H_max); if (_Cur_max_val < _H_max_val) { - _Cur_max_val = _H_max_val; + _Cur_max_val = _H_max_val; + const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_max, _Cur_vals_max); + const __m128i _Idx_max_val = _mm_blendv_epi8(_mm_set1_epi32(-1), _Cur_idx_max, _Eq_mask); + __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); + unsigned long _H_pos; - _BitScanForward(&_H_pos, _mm_movemask_epi8(_Traits::_Cmp_eq(_H_max, _Cur_vals_max))); - typename _Traits::_Unsigned_t _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); - _Res._Max = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + _BitScanForward(&_H_pos, + _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)) & _mm_movemask_epi8(_Eq_mask)); + const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); + _Res._Max = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); } } if constexpr (_Traits::_Has_portion_max) { - size_t _Portion_size = _Sse_size; + _Portion_size = _Sse_size; if (_Portion_size == 0) { break; } From 34b935d26be602259ee1f7b763b586de339d799f Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 20:44:46 +0200 Subject: [PATCH 08/60] x86 build --- stl/src/vector_algorithms.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 87552c7bff..59205135ce 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -806,8 +806,8 @@ struct _Minmax_traits_8 { } static __m128i _H_min(const __m128i _Cur) noexcept { - _Signed_t _H_min_a = _mm_cvtsi128_si64(_Cur); - _Signed_t _H_min_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + _Signed_t _H_min_a = _Get_any(_Cur); + _Signed_t _H_min_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); if (_H_min_b < _H_min_a) { _H_min_a = _H_min_b; } @@ -815,8 +815,8 @@ struct _Minmax_traits_8 { } static __m128i _H_max(const __m128i _Cur) noexcept { - _Signed_t _H_max_a = _mm_cvtsi128_si64(_Cur); - _Signed_t _H_max_b = 
_mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + _Signed_t _H_max_a = _Get_any(_Cur); + _Signed_t _H_max_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); if (_H_max_b > _H_max_a) { _H_max_a = _H_max_b; } @@ -824,8 +824,8 @@ struct _Minmax_traits_8 { } static __m128i _H_min_u(const __m128i _Cur) noexcept { - _Unsigned_t _H_min_a = _mm_cvtsi128_si64(_Cur); - _Unsigned_t _H_min_b = _mm_cvtsi128_si64(_mm_bsrli_si128(_Cur, 8)); + _Unsigned_t _H_min_a = _Get_any(_Cur); + _Unsigned_t _H_min_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); if (_H_min_b < _H_min_a) { _H_min_a = _H_min_b; } @@ -834,7 +834,11 @@ struct _Minmax_traits_8 { static _Signed_t _Get_any(const __m128i _Cur) noexcept { +#ifdef _M_IX86 + return static_cast<_Signed_t>(_mm_extract_epi32(_Cur, 1)) | static_cast<_Signed_t>(_mm_cvtsi128_si32(_Cur)); +#else return static_cast<_Signed_t>(_mm_cvtsi128_si64(_Cur)); +#endif } static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { From de0397a2904068b7ad9612232d3426fe3eea165d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 22:09:09 +0200 Subject: [PATCH 09/60] minmax_element --- stl/inc/algorithm | 43 +++++++++- stl/src/vector_algorithms.cpp | 152 ++++++++++++++++++++-------------- 2 files changed, 129 insertions(+), 66 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index da111d4949..2818bd7135 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -63,7 +63,7 @@ _Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return __std_min_element(_First, _Last); + return __std_min_element(reinterpret_cast(_First), reinterpret_cast(_Last)); } else if constexpr (sizeof(_Ty) == 1) { return static_cast<_Ty*>(const_cast(__std_min_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { @@ -83,7 +83,7 @@ _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool 
_Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return __std_max_element(_First, _Last); + return __std_max_element(reinterpret_cast(_First), reinterpret_cast(_Last)); } else if constexpr (sizeof(_Ty) == 1) { return static_cast<_Ty*>(const_cast(__std_max_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { @@ -96,6 +96,31 @@ _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { static_assert(_STD _Always_false<_Ty>, "Unexpected size"); } } + +template +_STD pair<_Ty*, _Ty*> __std_minmax_element(_Ty* _First, _Ty* _Last) noexcept { + using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; + constexpr bool _Signed = _STD is_signed_v<_Ty>; + + if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { + return __std_minmax_element( + reinterpret_cast(_First), reinterpret_cast(_Last)); + } else if constexpr (sizeof(_Ty) == 1) { + auto _Res = __std_minmax_element_1(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else if constexpr (sizeof(_Ty) == 2) { + auto _Res = __std_minmax_element_2(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else if constexpr (sizeof(_Ty) == 4) { + auto _Res = __std_minmax_element_4(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else if constexpr (sizeof(_Ty) == 8) { + auto _Res = __std_minmax_element_8(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else { + static_assert(_STD _Always_false<_Ty>, "Unexpected size"); + } +} #endif // _USE_STD_VECTOR_ALGORITHMS _STD_BEGIN @@ -9511,6 +9536,20 @@ namespace ranges { template constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { +#if 
_USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 + if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { + if (!_STD is_constant_evaluated()) { + const auto _First_ptr = _To_address(_First); + const auto _Result = __std_minmax_element(_First_ptr, _To_address(_Last)); + if constexpr (is_pointer_v<_FwdIt>) { + return {_Result.first, _Result.second}; + } else { + return {_First + (_Result.first - _First_ptr), _First + (_Result.second - _First_ptr)}; + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 + // find smallest and largest elements pair<_FwdIt, _FwdIt> _Found(_First, _First); diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 59205135ce..ffc9f7b5b4 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -497,9 +497,7 @@ _Min_max_t _Both_tail( if (*_Ptr < _Cur_min) { _Res._Min = _Ptr; _Cur_min = *_Ptr; - } - - if (_Cur_min < *_Ptr) { + } else if (_Cur_max <= *_Ptr) { _Res._Max = _Ptr; _Cur_max = *_Ptr; } @@ -561,41 +559,33 @@ struct _Minmax_traits_1 { return _mm_set1_epi8(1); } - static __m128i _H_min(const __m128i _Cur) noexcept { + template + static __m128i _H_func(const __m128i _Cur, _Fn _Funct) noexcept { const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); __m128i _H_min = _Cur; - _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); - _H_min = _mm_min_epi8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_bytes)); + _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min = _Funct(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); + _H_min = _Funct(_H_min, 
_mm_shuffle_epi8(_H_min, _Shuf_bytes)); return _H_min; } - static __m128i _H_max(const __m128i _Cur) noexcept { - const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); - const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - - __m128i _H_max = _Cur; - _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(1, 0, 3, 2))); - _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(2, 3, 0, 1))); - _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi8(_H_max, _Shuf_words)); - _H_max = _mm_max_epi8(_H_max, _mm_shuffle_epi8(_H_max, _Shuf_bytes)); - return _H_max; + static __m128i _H_min(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_min_epi8(_First, _Second); }); } + static __m128i _H_max(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_max_epi8(_First, _Second); }); + } static __m128i _H_min_u(const __m128i _Cur) noexcept { - const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); - const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_min_epu8(_First, _Second); }); + } - __m128i _H_min = _Cur; - _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); - _H_min = _mm_min_epu8(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_bytes)); - return _H_min; + static __m128i _H_max_u(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_max_epu8(_First, _Second); }); } static _Signed_t _Get_any(const __m128i _Cur) noexcept { @@ -647,35 +637,31 @@ struct _Minmax_traits_2 { return _mm_set1_epi16(1); } - 
static __m128i _H_min(const __m128i _Cur) noexcept { + template + static __m128i _H_func(const __m128i _Cur, _Fn _Funct) noexcept { const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); __m128i _H_min = _Cur; - _H_min = _mm_min_epi16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _mm_min_epi16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - _H_min = _mm_min_epi16(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); + _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min = _Funct(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); return _H_min; } - static __m128i _H_max(const __m128i _Cur) noexcept { - const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + static __m128i _H_min(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_min_epi16(_First, _Second); }); + } - __m128i _H_max = _Cur; - _H_max = _mm_max_epi16(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(1, 0, 3, 2))); - _H_max = _mm_max_epi16(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(2, 3, 0, 1))); - _H_max = _mm_max_epi16(_H_max, _mm_shuffle_epi8(_H_max, _Shuf_words)); - return _H_max; + static __m128i _H_max(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_max_epi16(_First, _Second); }); } static __m128i _H_min_u(const __m128i _Cur) noexcept { - const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); - const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_min_epu16(_First, _Second); }); + } - __m128i _H_min = _Cur; - _H_min = _mm_min_epu16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = 
_mm_min_epu16(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - _H_min = _mm_min_epu16(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); - return _H_min; + static __m128i _H_max_u(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_max_epu16(_First, _Second); }); } static _Signed_t _Get_any(const __m128i _Cur) noexcept { @@ -734,25 +720,30 @@ struct _Minmax_traits_4 { return _mm_set1_epi32(1); } - static __m128i _H_min(const __m128i _Cur) noexcept { + template + static __m128i _H_func(const __m128i _Cur, _Fn _Funct) noexcept { + const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + __m128i _H_min = _Cur; - _H_min = _mm_min_epi32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _mm_min_epi32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); return _H_min; } + static __m128i _H_min(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_min_epi32(_First, _Second); }); + } + static __m128i _H_max(const __m128i _Cur) noexcept { - __m128i _H_max = _Cur; - _H_max = _mm_max_epi32(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(1, 0, 3, 2))); - _H_max = _mm_max_epi32(_H_max, _mm_shuffle_epi32(_H_max, _MM_SHUFFLE(2, 3, 0, 1))); - return _H_max; + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_max_epi32(_First, _Second); }); } static __m128i _H_min_u(const __m128i _Cur) noexcept { - __m128i _H_min = _Cur; - _H_min = _mm_min_epu32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _mm_min_epu32(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - return _H_min; + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_min_epu32(_First, _Second); }); + } + + static __m128i 
_H_max_u(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](__m128i _First, __m128i _Second) { return _mm_max_epu32(_First, _Second); }); } static _Signed_t _Get_any(const __m128i _Cur) noexcept { @@ -833,6 +824,16 @@ struct _Minmax_traits_8 { } + static __m128i _H_max_u(const __m128i _Cur) noexcept { + _Unsigned_t _H_max_a = _Get_any(_Cur); + _Unsigned_t _H_max_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); + if (_H_max_b > _H_max_a) { + _H_max_a = _H_max_b; + } + return _mm_set1_epi64x(_H_max_a); + } + + static _Signed_t _Get_any(const __m128i _Cur) noexcept { #ifdef _M_IX86 return static_cast<_Signed_t>(_mm_extract_epi32(_Cur, 1)) | static_cast<_Signed_t>(_mm_cvtsi128_si32(_Cur)); @@ -911,10 +912,12 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si if constexpr (_Mode != _Min_max_mode::_Max_only) { const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); const auto _H_min_val = _Traits::_Get_any(_H_min); + if (_H_min_val < _Cur_min_val) { _Cur_min_val = _H_min_val; const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_min, _Cur_vals_min); - const __m128i _Idx_min_val = _mm_blendv_epi8(_mm_set1_epi32(-1), _Cur_idx_min, _Eq_mask); + const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); + const __m128i _Idx_min_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_min, _Eq_mask); __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); unsigned long _H_pos; @@ -928,15 +931,32 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si if constexpr (_Mode != _Min_max_mode::_Min_only) { const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); const auto _H_max_val = _Traits::_Get_any(_H_max); - if (_Cur_max_val < _H_max_val) { - _Cur_max_val = _H_max_val; - const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_max, _Cur_vals_max); - const __m128i _Idx_max_val = _mm_blendv_epi8(_mm_set1_epi32(-1), _Cur_idx_max, _Eq_mask); - __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); + + if (_Mode == _Min_max_mode::_Both ? 
_Cur_max_val <= _H_max_val : _Cur_max_val < _H_max_val) { + _Cur_max_val = _H_max_val; + const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_max, _Cur_vals_max); + + + int _Mask = _mm_movemask_epi8(_Eq_mask); + + if constexpr (_Mode == _Min_max_mode::_Both) { + const __m128i _Idx_max_val = _mm_blendv_epi8(_mm_setzero_si128(), _Cur_idx_max, _Eq_mask); + const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); + _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); + } else { + const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); + const __m128i _Idx_max_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_max, _Eq_mask); + const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); + _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); + } unsigned long _H_pos; - _BitScanForward(&_H_pos, - _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)) & _mm_movemask_epi8(_Eq_mask)); + if constexpr (_Mode == _Min_max_mode::_Both) { + _BitScanReverse(&_H_pos, _Mask); + } else { + _BitScanForward(&_H_pos, _Mask); + } + const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); _Res._Max = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); } @@ -982,10 +1002,14 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); } - if constexpr (_Mode != _Min_max_mode::_Min_only) { + if constexpr (_Mode == _Min_max_mode::_Max_only) { const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); + } else if constexpr (_Mode == _Min_max_mode::_Both) { + const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_less); + _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_less); } } } From 
f118d4c692f50c95bd9db67bf9746bd5c4520829 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 23:14:36 +0200 Subject: [PATCH 10/60] fix corner case; +comments --- stl/src/vector_algorithms.cpp | 135 +++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 42 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ffc9f7b5b4..ee7f91d579 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -497,7 +497,11 @@ _Min_max_t _Both_tail( if (*_Ptr < _Cur_min) { _Res._Min = _Ptr; _Cur_min = *_Ptr; - } else if (_Cur_max <= *_Ptr) { + } + // Not else! + // * Needed for correctness if start with maximum, as we don't handle specially the first element. + // * Promote branchless code generation. + if (_Cur_max <= *_Ptr) { _Res._Max = _Ptr; _Cur_max = *_Ptr; } @@ -555,8 +559,8 @@ struct _Minmax_traits_1 { return _mm_sub_epi8(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() noexcept { - return _mm_set1_epi8(1); + static __m128i _Inc(__m128i _Idx) noexcept { + return _mm_add_epi8(_Idx, _mm_set1_epi8(1)); } template @@ -633,8 +637,8 @@ struct _Minmax_traits_2 { return _mm_sub_epi16(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() noexcept { - return _mm_set1_epi16(1); + static __m128i _Inc(__m128i _Idx) noexcept { + return _mm_add_epi16(_Idx, _mm_set1_epi16(1)); } template @@ -716,8 +720,8 @@ struct _Minmax_traits_4 { return _mm_sub_epi32(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() noexcept { - return _mm_set1_epi32(1); + static __m128i _Inc(__m128i _Idx) noexcept { + return _mm_add_epi32(_Idx, _mm_set1_epi32(1)); } template @@ -792,8 +796,8 @@ struct _Minmax_traits_8 { return _mm_sub_epi64(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); } - static __m128i _Inc() noexcept { - return _mm_set1_epi64x(1); + static __m128i _Inc(__m128i _Idx) noexcept { + return 
_mm_add_epi64(_Idx, _mm_set1_epi64x(1)); } static __m128i _H_min(const __m128i _Cur) noexcept { @@ -885,6 +889,7 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si size_t _Portion_size = _Sse_size; if constexpr (_Traits::_Has_portion_max) { + // vector of indices will wrap around at this size constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; if (_Portion_size > _Max_portion_size) { _Portion_size = _Max_portion_size; @@ -895,89 +900,123 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si _Advance_bytes(_Stop_at, _Portion_size); - const __m128i _Inc = _Traits::_Inc(); + // Load values and if unsigned adjust them to be signed (for signed vector comparisons) __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - __m128i _Cur_vals_min = _Cur_vals; - __m128i _Cur_idx_min = _mm_setzero_si128(); - __m128i _Cur_vals_max = _Cur_vals; - __m128i _Cur_idx_max = _mm_setzero_si128(); - __m128i _Cur_idx = _mm_setzero_si128(); + __m128i _Cur_vals_min = _Cur_vals; // vector of vertical minimum values + __m128i _Cur_idx_min = _mm_setzero_si128(); // vector of vertical minimum indices + __m128i _Cur_vals_max = _Cur_vals; // vector of vertical minimum values + __m128i _Cur_idx_max = _mm_setzero_si128(); // vector of vertical minimum indices + __m128i _Cur_idx = _mm_setzero_si128(); // current vector of indices for (;;) { _Advance_bytes(_First, 16); - _Cur_idx = _mm_add_epi64(_Cur_idx, _Inc); + + // Increment vertical indices. Will stop at exactly wrap around, if not reach the end before + _Cur_idx = _Traits::_Inc(_Cur_idx); if (_First == _Stop_at) { + // Reached end or indices wrap around point. 
Compute horizontal min and/or max if constexpr (_Mode != _Min_max_mode::_Max_only) { - const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); + // Vector populated by the smallest element + const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); + // Get any element of it const auto _H_min_val = _Traits::_Get_any(_H_min); if (_H_min_val < _Cur_min_val) { - _Cur_min_val = _H_min_val; - const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_min, _Cur_vals_min); - const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); + // Current horizontal min is less than the old, update min + _Cur_min_val = _H_min_val; + // Mask of all elements equal to minimum + const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_min, _Cur_vals_min); + int _Mask = _mm_movemask_epi8(_Eq_mask); + const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); + // Indices of minimum elements or the greatest index if none const __m128i _Idx_min_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_min, _Eq_mask); - __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); - + // The smallest indexes + __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); + // Select the smallest vertical indexes from the smallest element mask + _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_min, _Idx_min_val)); + // Find the smallest horizontal index unsigned long _H_pos; - _BitScanForward(&_H_pos, - _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_min, _Idx_min_val)) & _mm_movemask_epi8(_Eq_mask)); + _BitScanForward(&_H_pos, _Mask); + // Extract its vertical index const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_min, _H_pos); - _Res._Min = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + // Finally, compute the pointer + _Res._Min = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); } } if constexpr (_Mode != _Min_max_mode::_Min_only) { - const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); + // Vector populated by the largest element + const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); + 
// Get any element of it const auto _H_max_val = _Traits::_Get_any(_H_max); if (_Mode == _Min_max_mode::_Both ? _Cur_max_val <= _H_max_val : _Cur_max_val < _H_max_val) { - _Cur_max_val = _H_max_val; + // max_element: current horizontal max is greater than the old, update max + // minmax_element: current horizontal max is not less than the old, update max + _Cur_max_val = _H_max_val; + // Mask of all elements equal to maximum const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_max, _Cur_vals_max); - - + // Mask of all elements equal to maximum int _Mask = _mm_movemask_epi8(_Eq_mask); if constexpr (_Mode == _Min_max_mode::_Both) { + // Indices of minimum elements or zero none const __m128i _Idx_max_val = _mm_blendv_epi8(_mm_setzero_si128(), _Cur_idx_max, _Eq_mask); - const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); + // The greatest indexes + const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); + // Select the greatest vertical indexes from the largest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); } else { - const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); + const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); + // Indices of minimum elements or the greatest index if none const __m128i _Idx_max_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_max, _Eq_mask); - const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); + // The smallest indexes + const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); + // Select the smallest vertical indexes from the largest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); } unsigned long _H_pos; if constexpr (_Mode == _Min_max_mode::_Both) { + // Find the largest horizontal index _BitScanReverse(&_H_pos, _Mask); } else { + // Find the smallest horizontal index _BitScanForward(&_H_pos, _Mask); } - + // Extract its vertical index const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); - 
_Res._Max = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + // Finally, compute the pointer + _Res._Max = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); } } if constexpr (_Traits::_Has_portion_max) { + // Last portion or wrapping, need to determine + _Portion_size = _Sse_size; if (_Portion_size == 0) { - break; + break; // Last portion } + // Handle the wrapping indexes. Assume _Cur_idx is zero + + // This portion size constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; if (_Portion_size > _Max_portion_size) { _Portion_size = _Max_portion_size; } _Advance_bytes(_Stop_at, _Portion_size); + // Size remaining after this _Sse_size -= _Portion_size; + // Indices will be relative to the new base _Base = static_cast(_First); + // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); if constexpr (_Mode != _Min_max_mode::_Max_only) { @@ -988,28 +1027,40 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si _Cur_vals_max = _Cur_vals; _Cur_idx_max = _mm_setzero_si128(); } + continue; } else { - break; + break; // No wrapping, so it was the only portion } } + // This is the main part, finding vertical minimum/maximum + // Load values and if unsigned adjust them to be signed (for signed vector comparisons) __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); if constexpr (_Mode != _Min_max_mode::_Max_only) { + // Mask for the values less than the current minimum const __m128i _Is_less = _Traits::_Cmp_lt(_Cur_vals, _Cur_vals_min); - _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); - _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); + // Remember their vertical indices + _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); + // Update the current minimum + _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); } if 
constexpr (_Mode == _Min_max_mode::_Max_only) { + // Mask for the values greater or equal to the current maximum const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); - _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); - _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); + // Remember their vertical indices + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); + // Update the current maximum + _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); } else if constexpr (_Mode == _Min_max_mode::_Both) { + // *Inverse* mask for the values greater or equal to the current maximum const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); - _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_less); - _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_less); + // Remember their vertical indices + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_less); + // Update the current maximum + _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_less); } } } From bc6a055e329fd0de83a6d95bff856cc6e8199ada Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 23:17:48 +0200 Subject: [PATCH 11/60] misleading comments --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ee7f91d579..0ffd174b94 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -904,8 +904,8 @@ auto _Minmax_element(const void* _First, const void* const _Last, const bool _Si __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); __m128i _Cur_vals_min = _Cur_vals; // vector of vertical minimum values __m128i _Cur_idx_min = _mm_setzero_si128(); // vector of vertical minimum indices - __m128i _Cur_vals_max = _Cur_vals; // vector of vertical minimum values - __m128i _Cur_idx_max = _mm_setzero_si128(); 
// vector of vertical minimum indices + __m128i _Cur_vals_max = _Cur_vals; // vector of vertical maximum values + __m128i _Cur_idx_max = _mm_setzero_si128(); // vector of vertical maximum indices __m128i _Cur_idx = _mm_setzero_si128(); // current vector of indices for (;;) { From 6a7708e6dfa7d474ffa28beea45cf7ad06744545 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 23:19:50 +0200 Subject: [PATCH 12/60] enable x86 function fusion optimization --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 0ffd174b94..802176b585 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -876,7 +876,7 @@ struct _Minmax_traits_8 { }; template <_Min_max_mode _Mode, class _Traits> -auto _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { +auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { _Min_max_t _Res = {_First, _First}; auto _Base = static_cast(_First); auto _Cur_min_val = _Traits::_Init_min_val; From 64b9e8098f3efc7b4b2473b20ae87a6c08d7e43c Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 23:35:54 +0200 Subject: [PATCH 13/60] Comment on __stdcall --- stl/src/vector_algorithms.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 802176b585..4cf9ee425f 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -875,8 +875,11 @@ struct _Minmax_traits_8 { } }; +// Exactly the same signature with __std_min_element_N / __std_max_element_N / __std_minmax_element_N, +// up to calling convention. This makes sure the template specialization is fused with the export function. +// In optimized build it avoids extra call, as this function is too large to inline.
template <_Min_max_mode _Mode, class _Traits> -auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { +auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign, int = 0) noexcept { _Min_max_t _Res = {_First, _First}; auto _Base = static_cast(_First); auto _Cur_min_val = _Traits::_Init_min_val; From 8e2bf94e6cfa640bd2847c2df6b7955d4379903e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 23:36:19 +0200 Subject: [PATCH 14/60] -stray def param --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 4cf9ee425f..04b7323604 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -879,7 +879,7 @@ struct _Minmax_traits_8 { // up to calling convention. This makes sure the template specialization is fused with the export function. // In optimized build it avoids extra call, as this function is too large to inline. 
template <_Min_max_mode _Mode, class _Traits> -auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign, int = 0) noexcept { +auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { _Min_max_t _Res = {_First, _First}; auto _Base = static_cast(_First); auto _Cur_min_val = _Traits::_Init_min_val; From 300b3b4265dd3b2288bd80fe63fa16d45cdb6250 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 23:48:32 +0200 Subject: [PATCH 15/60] literate --- stl/src/vector_algorithms.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 04b7323604..d880449531 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -1059,11 +1059,11 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); } else if constexpr (_Mode == _Min_max_mode::_Both) { // *Inverse* mask for the values greater or equal to the current maximum - const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); + const __m128i _Is_greater_or_equal_inv = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // Remember their vertical indices - _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_less); + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_greater_or_equal_inv); // Update the current maximum - _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_less); + _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_greater_or_equal_inv); } } } From 7597d6fd69b658927eed28964d870df4fab1287c Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 26 Dec 2021 23:56:25 +0200 Subject: [PATCH 16/60] Simplify _Base --- stl/src/vector_algorithms.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 
d880449531..2c0dfb660c 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -881,7 +881,7 @@ struct _Minmax_traits_8 { template <_Min_max_mode _Mode, class _Traits> auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { _Min_max_t _Res = {_First, _First}; - auto _Base = static_cast(_First); + auto _Base = static_cast(_First); auto _Cur_min_val = _Traits::_Init_min_val; auto _Cur_max_val = _Traits::_Init_max_val; @@ -945,7 +945,7 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons // Extract its vertical index const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_min, _H_pos); // Finally, compute the pointer - _Res._Min = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + _Res._Min = _Base + (_V_pos * 16 + _H_pos); } } @@ -992,7 +992,7 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons // Extract its vertical index const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); // Finally, compute the pointer - _Res._Max = _Base + (_V_pos * 16 + _H_pos) / sizeof(_Cur_min_val); + _Res._Max = _Base + (_V_pos * 16 + _H_pos); } } @@ -1017,7 +1017,7 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons _Sse_size -= _Portion_size; // Indices will be relative to the new base - _Base = static_cast(_First); + _Base = static_cast(_First); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); From 0519a3e6ee3dc9f31a5e4c0ed396deba609cac87 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 00:17:51 +0200 Subject: [PATCH 17/60] strange, tests locally pass for me --- stl/src/vector_algorithms.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 2c0dfb660c..4e4bfadeb0 100644 --- a/stl/src/vector_algorithms.cpp +++ 
b/stl/src/vector_algorithms.cpp @@ -1072,7 +1072,6 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons _First, _Last, _Res, _Sign, _Cur_min_val, _Cur_max_val); } - extern "C" { const void* __stdcall __std_min_element_1( From 424687445c8e150b0e2a8f67b1206f5feb35ead2 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 00:36:02 +0200 Subject: [PATCH 18/60] try to undo __std_minmax_element --- stl/inc/algorithm | 43 ++----------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 2818bd7135..da111d4949 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -63,7 +63,7 @@ _Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return __std_min_element(reinterpret_cast(_First), reinterpret_cast(_Last)); + return __std_min_element(_First, _Last); } else if constexpr (sizeof(_Ty) == 1) { return static_cast<_Ty*>(const_cast(__std_min_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { @@ -83,7 +83,7 @@ _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return __std_max_element(reinterpret_cast(_First), reinterpret_cast(_Last)); + return __std_max_element(_First, _Last); } else if constexpr (sizeof(_Ty) == 1) { return static_cast<_Ty*>(const_cast(__std_max_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { @@ -96,31 +96,6 @@ _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { static_assert(_STD _Always_false<_Ty>, "Unexpected size"); } } - -template -_STD pair<_Ty*, _Ty*> __std_minmax_element(_Ty* _First, _Ty* _Last) noexcept { - using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; - constexpr bool _Signed = 
_STD is_signed_v<_Ty>; - - if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return __std_minmax_element( - reinterpret_cast(_First), reinterpret_cast(_Last)); - } else if constexpr (sizeof(_Ty) == 1) { - auto _Res = __std_minmax_element_1(_First, _Last, _Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; - } else if constexpr (sizeof(_Ty) == 2) { - auto _Res = __std_minmax_element_2(_First, _Last, _Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; - } else if constexpr (sizeof(_Ty) == 4) { - auto _Res = __std_minmax_element_4(_First, _Last, _Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; - } else if constexpr (sizeof(_Ty) == 8) { - auto _Res = __std_minmax_element_8(_First, _Last, _Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; - } else { - static_assert(_STD _Always_false<_Ty>, "Unexpected size"); - } -} #endif // _USE_STD_VECTOR_ALGORITHMS _STD_BEGIN @@ -9536,20 +9511,6 @@ namespace ranges { template constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { -#if _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 - if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { - if (!_STD is_constant_evaluated()) { - const auto _First_ptr = _To_address(_First); - const auto _Result = __std_minmax_element(_First_ptr, _To_address(_Last)); - if constexpr (is_pointer_v<_FwdIt>) { - return {_Result.first, _Result.second}; - } else { - return {_First + (_Result.first - _First_ptr), _First + (_Result.second - _First_ptr)}; - } - } - } -#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 - // find smallest and largest elements pair<_FwdIt, _FwdIt> _Found(_First, _First); From 121ea2a6c2fd7aac6496283e6d814a37e2036c0e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 00:43:36 +0200 
Subject: [PATCH 19/60] ptrptr --- stl/inc/algorithm | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index da111d4949..95ae095f1e 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -63,7 +63,8 @@ _Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return __std_min_element(_First, _Last); + using cv_uintptr_ptr = _STD conditional_t<_STD is_const_v<_Ty>, const uintptr_t*, uintptr_t*>; + return __std_min_element(reinterpret_cast(_First), reinterpret_cast(_Last)); } else if constexpr (sizeof(_Ty) == 1) { return static_cast<_Ty*>(const_cast(__std_min_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { @@ -83,7 +84,8 @@ _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return __std_max_element(_First, _Last); + using cv_uintptr_ptr = _STD conditional_t<_STD is_const_v<_Ty>, const uintptr_t*, uintptr_t*>; + return __std_max_element(reinterpret_cast(_First), reinterpret_cast(_Last)); } else if constexpr (sizeof(_Ty) == 1) { return static_cast<_Ty*>(const_cast(__std_max_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { From 2a7b50ba5a89c3b603a200b4922dbc6d83c4d000 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 10:06:41 +0200 Subject: [PATCH 20/60] minmax element wrapper --- stl/inc/algorithm | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 95ae095f1e..40e74bc754 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -98,6 +98,31 @@ _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { static_assert(_STD _Always_false<_Ty>, "Unexpected size"); } } + +template +_STD pair<_Ty*, _Ty*> 
__std_minmax_element(_Ty* _First, _Ty* _Last) noexcept { + using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; + constexpr bool _Signed = _STD is_signed_v<_Ty>; + + if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { + using cv_uintptr_ptr = _STD conditional_t<_STD is_const_v<_Ty>, const uintptr_t*, uintptr_t*>; + return __std_minmax_element(reinterpret_cast(_First), reinterpret_cast(_Last)); + } else if constexpr (sizeof(_Ty) == 1) { + auto _Res = __std_minmax_element_1(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else if constexpr (sizeof(_Ty) == 2) { + auto _Res = __std_minmax_element_2(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else if constexpr (sizeof(_Ty) == 4) { + auto _Res = __std_minmax_element_4(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else if constexpr (sizeof(_Ty) == 8) { + auto _Res = __std_minmax_element_8(_First, _Last, _Signed); + return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + } else { + static_assert(_STD _Always_false<_Ty>, "Unexpected size"); + } +} #endif // _USE_STD_VECTOR_ALGORITHMS _STD_BEGIN From 666451926ac185acb110e0633a8cd0aaf9c37919 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 11:15:48 +0200 Subject: [PATCH 21/60] minmax --- stl/inc/algorithm | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 40e74bc754..8d90eccbb7 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9538,6 +9538,20 @@ namespace ranges { template constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { +#if _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 + if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { + if 
(!_STD is_constant_evaluated()) { + const auto _First_ptr = _To_address(_First); + const auto _Result = __std_minmax_element(_First_ptr, _To_address(_Last)); + if constexpr (is_pointer_v<_FwdIt>) { + return {_Result.first, _Result.second}; + } else { + return {_First + (_Result.first - _First_ptr), _First + (_Result.second - _First_ptr)}; + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 + // find smallest and largest elements pair<_FwdIt, _FwdIt> _Found(_First, _First); From d0ccf20b2b510c81e3bf8bb642eaaa6eca9841d0 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 12:25:45 +0200 Subject: [PATCH 22/60] fix minmax_element for larger value type --- stl/src/vector_algorithms.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 4e4bfadeb0..fea0848bbc 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -985,6 +985,8 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons if constexpr (_Mode == _Min_max_mode::_Both) { // Find the largest horizontal index _BitScanReverse(&_H_pos, _Mask); + // Correct from highest val bit to lowest + _H_pos -= sizeof(_Cur_max_val) - 1; } else { // Find the smallest horizontal index _BitScanForward(&_H_pos, _Mask); From 24dbd5726ca5fa275f5d07dd44fdc89105a76637 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 12:26:16 +0200 Subject: [PATCH 23/60] some test coverage --- .../VSO_0000000_vector_algorithms/test.cpp | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index b37274ecb1..e733e36303 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -21,6 +21,97 @@ void disable_instructions(ISA_AVAILABILITY isa) { constexpr size_t dataCount = 1024; +template +FwdIt 
last_good_known_min_element(FwdIt first, FwdIt last) { + FwdIt result = first; + + for (; first != last; ++first) { + if (*first < *result) { + result = first; + } + } + + return result; +} + +template +FwdIt last_good_known_max_element(FwdIt first, FwdIt last) { + FwdIt result = first; + + for (; first != last; ++first) { + if (*result < *first) { + result = first; + } + } + + return result; +} + +template +std::pair last_good_known_minmax_element(FwdIt first, FwdIt last) { + // find smallest and largest elements + pair found(first, first); + + if (first != last) { + while (++first != last) { // process one or two elements + FwdIt next = first; + if (++next == last) { // process last element + if (*first < *found.first) { + found.first = first; + } else if (!(*first < *found.second)) { + found.second = first; + } + } else { // process next two elements + if (*next < *first) { // test next for new smallest + if (*next < *found.first) { + found.first = next; + } + if (!(*first < *found.second)) { + found.second = first; + } + } else { // test first for new smallest + if (*first < *found.first) { + found.first = first; + } + if (!(*next < *found.second)) { + found.second = next; + } + } + first = next; + } + } + } + + return found; +} + +template +void test_case_min_max_element(const vector& input) { + auto expected_min = last_good_known_min_element(input.begin(), input.end()); + auto expected_max = last_good_known_max_element(input.begin(), input.end()); + auto expected_minmax = last_good_known_minmax_element(input.begin(), input.end()); + auto actual_min = min_element(input.begin(), input.end()); + auto actual_max = max_element(input.begin(), input.end()); + auto actual_minmax = minmax_element(input.begin(), input.end()); + assert(expected_min == actual_min); + assert(expected_max == actual_max); + assert(expected_minmax == actual_minmax); +} + +template +void test_min_max_element(mt19937_64& gen) { + auto dis = conditional_t, uniform_real_distribution, + 
conditional_t<(sizeof(T) > 1), uniform_int_distribution, uniform_int_distribution>>(1, 20); + + vector input; + input.reserve(dataCount); + test_case_min_max_element(input); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + input.push_back(static_cast(dis(gen))); + test_case_min_max_element(input); + } +} + template inline void last_known_good_reverse(BidIt first, BidIt last) { for (; first != last && first != --last; ++first) { @@ -107,6 +198,20 @@ void test_swap_ranges(mt19937_64& gen) { void test_vector_algorithms() { mt19937_64 gen(1729); + + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_min_max_element(gen); + test_reverse(gen); test_reverse(gen); test_reverse(gen); From 3ed6d99fa90a8eee12d2267baff2731282841ee8 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 13:38:16 +0200 Subject: [PATCH 24/60] Fix x86 64-bit handling --- stl/src/vector_algorithms.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index fea0848bbc..ff48f94b42 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -840,7 +840,8 @@ struct _Minmax_traits_8 { static _Signed_t _Get_any(const __m128i _Cur) noexcept { #ifdef _M_IX86 - return static_cast<_Signed_t>(_mm_extract_epi32(_Cur, 1)) | static_cast<_Signed_t>(_mm_cvtsi128_si32(_Cur)); + return static_cast<_Signed_t>((static_cast<_Unsigned_t>(_mm_extract_epi32(_Cur, 1)) << 32) + | static_cast<_Unsigned_t>(_mm_cvtsi128_si32(_Cur))); #else return static_cast<_Signed_t>(_mm_cvtsi128_si64(_Cur)); #endif From 2b43c89aae41bdb185864669590ce86586d3bf9d Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 14:44:19 +0200 
Subject: [PATCH 25/60] minor cleanup --- stl/inc/algorithm | 1 - stl/src/vector_algorithms.cpp | 111 +++++++++++++--------------------- 2 files changed, 41 insertions(+), 71 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 8d90eccbb7..93a9d3eacb 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -54,7 +54,6 @@ _Min_max_t __stdcall __std_minmax_element_1(const void* _First, const void* _Las _Min_max_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; _Min_max_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; _Min_max_t __stdcall __std_minmax_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; - _END_EXTERN_C template diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ff48f94b42..0a5d457196 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -842,9 +842,9 @@ struct _Minmax_traits_8 { #ifdef _M_IX86 return static_cast<_Signed_t>((static_cast<_Unsigned_t>(_mm_extract_epi32(_Cur, 1)) << 32) | static_cast<_Unsigned_t>(_mm_cvtsi128_si32(_Cur))); -#else +#else // ^^^ x86 ^^^ / // vvv x64 vvv return static_cast<_Signed_t>(_mm_cvtsi128_si64(_Cur)); -#endif +#endif // ^^^ x64 ^^^ } static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { @@ -919,83 +919,62 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons _Cur_idx = _Traits::_Inc(_Cur_idx); if (_First == _Stop_at) { - // Reached end or indices wrap around point. Compute horizontal min and/or max + // Reached end or indices wrap around point. + // Compute horizontal min and/or max. Determine horizontal and vertical position of it. 
if constexpr (_Mode != _Min_max_mode::_Max_only) { - // Vector populated by the smallest element - const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); - // Get any element of it - const auto _H_min_val = _Traits::_Get_any(_H_min); - - if (_H_min_val < _Cur_min_val) { - // Current horizontal min is less than the old, update min - _Cur_min_val = _H_min_val; - // Mask of all elements equal to minimum - const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_min, _Cur_vals_min); - int _Mask = _mm_movemask_epi8(_Eq_mask); - const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); + const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); // Vector populated by the smallest element + const auto _H_min_val = _Traits::_Get_any(_H_min); // Get any element of it + + if (_H_min_val < _Cur_min_val) { // Current horizontal min is less than the old + _Cur_min_val = _H_min_val; // update min + const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_min, _Cur_vals_min); // Mask of all elems eq to min + int _Mask = _mm_movemask_epi8(_Eq_mask); // Indices of minimum elements or the greatest index if none + const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); const __m128i _Idx_min_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_min, _Eq_mask); - // The smallest indexes - __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); + __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); // The smallest indexes // Select the smallest vertical indexes from the smallest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_min, _Idx_min_val)); - // Find the smallest horizontal index unsigned long _H_pos; - _BitScanForward(&_H_pos, _Mask); - // Extract its vertical index - const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_min, _H_pos); - // Finally, compute the pointer - _Res._Min = _Base + (_V_pos * 16 + _H_pos); + _BitScanForward(&_H_pos, _Mask); // Find the smallest horizontal index + const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_min, _H_pos); // 
Extract its vertical index + _Res._Min = _Base + _V_pos * 16 + _H_pos; // Finally, compute the pointer } } if constexpr (_Mode != _Min_max_mode::_Min_only) { - // Vector populated by the largest element - const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); - // Get any element of it - const auto _H_max_val = _Traits::_Get_any(_H_max); + const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element + const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it if (_Mode == _Min_max_mode::_Both ? _Cur_max_val <= _H_max_val : _Cur_max_val < _H_max_val) { // max_element: current horizontal max is greater than the old, update max // minmax_element: current horizontal max is not less than the old, update max - _Cur_max_val = _H_max_val; - // Mask of all elements equal to maximum - const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_max, _Cur_vals_max); - // Mask of all elements equal to maximum - int _Mask = _mm_movemask_epi8(_Eq_mask); + _Cur_max_val = _H_max_val; + const __m128i _Eq_mask = _Traits::_Cmp_eq(_H_max, _Cur_vals_max); // Mask of all elems eq to max + int _Mask = _mm_movemask_epi8(_Eq_mask); + unsigned long _H_pos; if constexpr (_Mode == _Min_max_mode::_Both) { - // Indices of minimum elements or zero none + // Indices of minimum elements or zero if none const __m128i _Idx_max_val = _mm_blendv_epi8(_mm_setzero_si128(), _Cur_idx_max, _Eq_mask); - // The greatest indexes - const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); + const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); // The greatest indexes // Select the greatest vertical indexes from the largest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); + _BitScanReverse(&_H_pos, _Mask); // Find the largest horizontal index + _H_pos -= sizeof(_Cur_max_val) - 1; // Correct from highest val bit to lowest } else { const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); // Indices of minimum 
elements or the greatest index if none const __m128i _Idx_max_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_max, _Eq_mask); - // The smallest indexes - const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); + const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); // The smallest indexes // Select the smallest vertical indexes from the largest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); + _BitScanForward(&_H_pos, _Mask); // Find the smallest horizontal index } - unsigned long _H_pos; - if constexpr (_Mode == _Min_max_mode::_Both) { - // Find the largest horizontal index - _BitScanReverse(&_H_pos, _Mask); - // Correct from highest val bit to lowest - _H_pos -= sizeof(_Cur_max_val) - 1; - } else { - // Find the smallest horizontal index - _BitScanForward(&_H_pos, _Mask); - } - // Extract its vertical index - const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); - // Finally, compute the pointer - _Res._Max = _Base + (_V_pos * 16 + _H_pos); + const auto _V_pos = _Traits::_Get_v_pos(_Cur_idx_max, _H_pos); // Extract its vertical index + _Res._Max = _Base + _V_pos * 16 + _H_pos; // Finally, compute the pointer } } @@ -1042,31 +1021,23 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons // This is the main part, finding vertical minimum/maximum // Load values and if unsigned adjust them to be signed (for signed vector comparisons) - __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); + _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); if constexpr (_Mode != _Min_max_mode::_Max_only) { - // Mask for the values less than the current minimum - const __m128i _Is_less = _Traits::_Cmp_lt(_Cur_vals, _Cur_vals_min); - // Remember their vertical indices - _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); - // Update the current minimum - _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); + const 
__m128i _Is_less = _Traits::_Cmp_lt(_Cur_vals, _Cur_vals_min); // _Cur_vals < _Cur_vals_min + _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices + _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } if constexpr (_Mode == _Min_max_mode::_Max_only) { - // Mask for the values greater or equal to the current maximum - const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); - // Remember their vertical indices - _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); - // Update the current maximum - _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); + const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); // Remember their vertical indices + _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); // Update the current maximum } else if constexpr (_Mode == _Min_max_mode::_Both) { - // *Inverse* mask for the values greater or equal to the current maximum - const __m128i _Is_greater_or_equal_inv = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); - // Remember their vertical indices - _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_greater_or_equal_inv); - // Update the current maximum - _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_greater_or_equal_inv); + const __m128i _Is_gt_eq_inv = + _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_gt_eq_inv); // Remember vertical indices + _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_gt_eq_inv); // Update the current maximum } } } From 137f4135684ab2fe5c737e11e797ef223a24f80e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 18:52:03 +0200 Subject: [PATCH 26/60] more text cleanup --- stl/src/vector_algorithms.cpp | 50 
+++++++++++++++-------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 0a5d457196..6e60500acc 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -887,21 +887,17 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons auto _Cur_max_val = _Traits::_Init_max_val; if (_Byte_length(_First, _Last) >= 16 && _Use_sse42()) { - const void* _Stop_at = _First; - - size_t _Sse_size = _Byte_length(_First, _Last) & ~size_t{0xF}; - size_t _Portion_size = _Sse_size; + size_t _Portion_size = _Byte_length(_First, _Last) & ~size_t{0xF}; if constexpr (_Traits::_Has_portion_max) { - // vector of indices will wrap around at this size + // vector of indices will wrap around at exactly this size constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; if (_Portion_size > _Max_portion_size) { _Portion_size = _Max_portion_size; } } - _Sse_size -= _Portion_size; - + const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Portion_size); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) @@ -933,8 +929,8 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons // Indices of minimum elements or the greatest index if none const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); const __m128i _Idx_min_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_min, _Eq_mask); - __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); // The smallest indexes - // Select the smallest vertical indexes from the smallest element mask + __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); // The smallest indices + // Select the smallest vertical indices from the smallest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_min, _Idx_min_val)); unsigned long _H_pos; _BitScanForward(&_H_pos, _Mask); // Find the smallest horizontal index @@ -956,19 +952,21 @@ auto 
__stdcall _Minmax_element(const void* _First, const void* const _Last, cons unsigned long _H_pos; if constexpr (_Mode == _Min_max_mode::_Both) { - // Indices of minimum elements or zero if none + // Looking for the last occurence of maximum + // Indices of maximum elements or zero if none const __m128i _Idx_max_val = _mm_blendv_epi8(_mm_setzero_si128(), _Cur_idx_max, _Eq_mask); - const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); // The greatest indexes - // Select the greatest vertical indexes from the largest element mask + const __m128i _Idx_max = _Traits::_H_max_u(_Idx_max_val); // The greatest indices + // Select the greatest vertical indices from the largest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); _BitScanReverse(&_H_pos, _Mask); // Find the largest horizontal index _H_pos -= sizeof(_Cur_max_val) - 1; // Correct from highest val bit to lowest } else { - const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); - // Indices of minimum elements or the greatest index if none + // Looking for the first occurence of maximum + // Indices of maximum elements or the greatest index if none + const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); const __m128i _Idx_max_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_max, _Eq_mask); - const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); // The smallest indexes - // Select the smallest vertical indexes from the largest element mask + const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); // The smallest indices + // Select the smallest vertical indices from the largest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); _BitScanForward(&_H_pos, _Mask); // Find the smallest horizontal index } @@ -977,30 +975,23 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons _Res._Max = _Base + _V_pos * 16 + _H_pos; // Finally, compute the pointer } } + // 
Horizontal part done, results are saved, now need to see if there is another portion to process if constexpr (_Traits::_Has_portion_max) { - // Last portion or wrapping, need to determine - - _Portion_size = _Sse_size; + // Either the last portion or wrapping point reached, need to determine + _Portion_size = _Byte_length(_First, _Last) & ~size_t{0xF}; if (_Portion_size == 0) { - break; // Last portion + break; // That was the last portion } - - // Handle the wrapping indexes. Assume _Cur_idx is zero - - // This portion size + // Start next portion to handle the wrapping indices. Assume _Cur_idx is zero constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; if (_Portion_size > _Max_portion_size) { _Portion_size = _Max_portion_size; } _Advance_bytes(_Stop_at, _Portion_size); - // Size remaining after this - _Sse_size -= _Portion_size; - // Indices will be relative to the new base _Base = static_cast(_First); - // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); @@ -1024,16 +1015,19 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); if constexpr (_Mode != _Min_max_mode::_Max_only) { + // Looking for the first occurence of minimum, don't overwrite with newly found occurences const __m128i _Is_less = _Traits::_Cmp_lt(_Cur_vals, _Cur_vals_min); // _Cur_vals < _Cur_vals_min _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } if constexpr (_Mode == _Min_max_mode::_Max_only) { + // Looking for the first occurence of maximum, don't overwrite with newly found occurences const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max _Cur_idx_max = 
_mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); // Remember their vertical indices _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); // Update the current maximum } else if constexpr (_Mode == _Min_max_mode::_Both) { + // Looking for the last occurence of maximum, do overwrite with newly found occurences const __m128i _Is_gt_eq_inv = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_gt_eq_inv); // Remember vertical indices From f01c3c61265654e3dddf5ce25f64480dd3c2acaf Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 27 Dec 2021 22:56:27 +0200 Subject: [PATCH 27/60] -stdlib dependency --- stl/src/vector_algorithms.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 6e60500acc..4597a02933 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -21,7 +21,6 @@ #endif // defined(_M_ARM64EC) #include #include -#include extern "C" long __isa_enabled; @@ -519,7 +518,7 @@ enum class _Min_max_mode { template <_Min_max_mode _Mode, class _STy, class _UTy> auto _Minmax_tail( const void* _First, const void* _Last, _Min_max_t& _Res, bool _Sign, _UTy _Cur_min, _UTy _Cur_max) noexcept { - constexpr _UTy _Cor = (_UTy{1} << (sizeof(_UTy) * CHAR_BIT - 1)); + constexpr _UTy _Cor = (_UTy{1} << (sizeof(_UTy) * 8 - 1)); if constexpr (_Mode == _Min_max_mode::_Min_only) { if (_Sign) { From d5ab6ac17f0b961be6eae8f3a1836be3ccd14b38 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 28 Dec 2021 15:30:40 +0200 Subject: [PATCH 28/60] superfluous --- stl/src/vector_algorithms.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 4597a02933..b5dba6ee89 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -861,9 +861,7 @@ struct _Minmax_traits_8 { } 
static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { - __m128i _Gt = _mm_cmpgt_epi64(_Second, _First); // less or equal - __m128i _Eq = _mm_cmpeq_epi64(_First, _Second); - return _mm_andnot_si128(_Eq, _Gt); + return _mm_cmpgt_epi64(_Second, _First); } static __m128i _Min(const __m128i _First, const __m128i _Second, const __m128i _Mask) noexcept { From 75642480efcc3ef857e4fdd6327a96a225caf377 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 28 Dec 2021 15:39:23 +0200 Subject: [PATCH 29/60] LT is superfluous --- stl/src/vector_algorithms.cpp | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index b5dba6ee89..e6c5f0c606 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -607,10 +607,6 @@ struct _Minmax_traits_1 { return _mm_cmpgt_epi8(_First, _Second); } - static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { - return _mm_cmplt_epi8(_First, _Second); - } - static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi8(_First, _Second); } @@ -685,10 +681,6 @@ struct _Minmax_traits_2 { return _mm_cmpgt_epi16(_First, _Second); } - static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { - return _mm_cmplt_epi16(_First, _Second); - } - static __m128i _Min(const __m128i _First, const __m128i _Second, const __m128i) noexcept { return _mm_min_epi16(_First, _Second); } @@ -767,10 +759,6 @@ struct _Minmax_traits_4 { return _mm_cmpgt_epi32(_First, _Second); } - static __m128i _Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { - return _mm_cmplt_epi32(_First, _Second); - } - static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi32(_First, _Second); } @@ -860,10 +848,6 @@ struct _Minmax_traits_8 { return _mm_cmpgt_epi64(_First, _Second); } - static __m128i 
_Cmp_lt(const __m128i _First, const __m128i _Second) noexcept { - return _mm_cmpgt_epi64(_Second, _First); - } - static __m128i _Min(const __m128i _First, const __m128i _Second, const __m128i _Mask) noexcept { return _mm_blendv_epi8(_First, _Second, _Mask); } @@ -1013,7 +997,7 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons if constexpr (_Mode != _Min_max_mode::_Max_only) { // Looking for the first occurence of minimum, don't overwrite with newly found occurences - const __m128i _Is_less = _Traits::_Cmp_lt(_Cur_vals, _Cur_vals_min); // _Cur_vals < _Cur_vals_min + const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } @@ -1025,10 +1009,9 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); // Update the current maximum } else if constexpr (_Mode == _Min_max_mode::_Both) { // Looking for the last occurence of maximum, do overwrite with newly found occurences - const __m128i _Is_gt_eq_inv = - _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) - _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_gt_eq_inv); // Remember vertical indices - _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_gt_eq_inv); // Update the current maximum + const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) + _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_less); // Remember their vertical indices + _Cur_vals_max = _Traits::_Max(_Cur_vals, _Cur_vals_max, _Is_less); // Update the current maximum } } } From 04b72fa5780873d7261e2c35e4e6ccb3204fdb0b Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 29 Dec 2021 
18:01:26 +0200 Subject: [PATCH 30/60] optimize 16-bit case --- stl/src/vector_algorithms.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e6c5f0c606..113f4740c9 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -596,7 +596,7 @@ struct _Minmax_traits_1 { } static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { - return static_cast(_mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_H_pos)))); + return static_cast<_Unsigned_t>(_mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_H_pos)))); } static __m128i _Cmp_eq(const __m128i _First, const __m128i _Second) noexcept { @@ -668,9 +668,9 @@ struct _Minmax_traits_2 { } static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { - _Unsigned_t _Array[8]; - _mm_storeu_si128(reinterpret_cast<__m128i*>(&_Array), _Idx); - return _Array[_H_pos >> 1]; + static constexpr const char _Shuf[] = "\x0\x1\x2\x3\x4\x5\x6\x7\x8\x9\xA\xB\xC\xD\xE\xF"; + + return static_cast<_Unsigned_t>(_mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_Shuf[_H_pos])))); } static __m128i _Cmp_eq(const __m128i _First, const __m128i _Second) noexcept { From f4220f3deb36b64445f47e447ceed08a34d9a298 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Wed, 29 Dec 2021 19:51:32 +0200 Subject: [PATCH 31/60] Fix for optimization of 16-bit --- stl/src/vector_algorithms.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 113f4740c9..b4de7f29ac 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -668,9 +668,10 @@ struct _Minmax_traits_2 { } static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { - static constexpr const char _Shuf[] = "\x0\x1\x2\x3\x4\x5\x6\x7\x8\x9\xA\xB\xC\xD\xE\xF"; + static 
constexpr const _Unsigned_t _Shuf[] = {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E}; - return static_cast<_Unsigned_t>(_mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_Shuf[_H_pos])))); + return static_cast<_Unsigned_t>( + _mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_Shuf[_H_pos >> 1])))); } static __m128i _Cmp_eq(const __m128i _First, const __m128i _Second) noexcept { From 28389bb7fb16b3929125a3582665b3bb0aa51f6f Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 2 Jan 2022 13:11:55 +0200 Subject: [PATCH 32/60] enable for any standard mode --- stl/inc/algorithm | 18 +++++++++--------- stl/inc/xtr1common | 4 ++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 93a9d3eacb..dba9a03094 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9327,9 +9327,9 @@ _INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element -#if _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 +#if _USE_STD_VECTOR_ALGORITHMS if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { - if (!_STD is_constant_evaluated()) { + if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_max_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { @@ -9339,7 +9339,7 @@ constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 +#endif // _USE_STD_VECTOR_ALGORITHMS _FwdIt _Found = _First; if (_First != _Last) { @@ -9432,9 +9432,9 @@ namespace ranges { template constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element -#if _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 +#if _USE_STD_VECTOR_ALGORITHMS if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { - if (!_STD 
is_constant_evaluated()) { + if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_min_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { @@ -9444,7 +9444,7 @@ constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 +#endif // _USE_STD_VECTOR_ALGORITHMS _FwdIt _Found = _First; if (_First != _Last) { @@ -9537,9 +9537,9 @@ namespace ranges { template constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { -#if _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 +#if _USE_STD_VECTOR_ALGORITHMS if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { - if (!_STD is_constant_evaluated()) { + if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_minmax_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { @@ -9549,7 +9549,7 @@ constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _ } } } -#endif // _USE_STD_VECTOR_ALGORITHMS && _HAS_CXX20 +#endif // _USE_STD_VECTOR_ALGORITHMS // find smallest and largest elements pair<_FwdIt, _FwdIt> _Found(_First, _First); diff --git a/stl/inc/xtr1common b/stl/inc/xtr1common index 93e7e7abd3..b92134c6c6 100644 --- a/stl/inc/xtr1common +++ b/stl/inc/xtr1common @@ -165,6 +165,10 @@ template _INLINE_VAR constexpr bool _Is_any_of_v = // true if and only if _Ty is in _Types disjunction_v...>; +_NODISCARD constexpr bool _Is_constant_evaluated() noexcept { // Internal function for any standard mode + return __builtin_is_constant_evaluated(); +} + #if _HAS_CXX20 _NODISCARD constexpr bool is_constant_evaluated() noexcept { return __builtin_is_constant_evaluated(); From 92fbbdafbb5ee45c35229827908b03bf78f5e2d7 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 2 Jan 2022 13:34:36 +0200 Subject: [PATCH 33/60] guard __CUDACC__ --- stl/inc/algorithm | 
12 ++++++------ stl/inc/xtr1common | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index dba9a03094..d786d5637a 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9327,7 +9327,7 @@ _INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element -#if _USE_STD_VECTOR_ALGORITHMS +#if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); @@ -9339,7 +9339,7 @@ constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS +#endif // _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) _FwdIt _Found = _First; if (_First != _Last) { @@ -9432,7 +9432,7 @@ namespace ranges { template constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element -#if _USE_STD_VECTOR_ALGORITHMS +#if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); @@ -9444,7 +9444,7 @@ constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS +#endif // _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) _FwdIt _Found = _First; if (_First != _Last) { @@ -9537,7 +9537,7 @@ namespace ranges { template constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { -#if _USE_STD_VECTOR_ALGORITHMS +#if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); @@ -9549,7 +9549,7 @@ constexpr pair<_FwdIt, _FwdIt> 
_Minmax_element_unchecked(_FwdIt _First, _FwdIt _ } } } -#endif // _USE_STD_VECTOR_ALGORITHMS +#endif // _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) // find smallest and largest elements pair<_FwdIt, _FwdIt> _Found(_First, _First); diff --git a/stl/inc/xtr1common b/stl/inc/xtr1common index b92134c6c6..4dce7467bd 100644 --- a/stl/inc/xtr1common +++ b/stl/inc/xtr1common @@ -165,9 +165,11 @@ template _INLINE_VAR constexpr bool _Is_any_of_v = // true if and only if _Ty is in _Types disjunction_v...>; +#ifndef __CUDACC__ _NODISCARD constexpr bool _Is_constant_evaluated() noexcept { // Internal function for any standard mode return __builtin_is_constant_evaluated(); } +#endif // __CUDACC__ #if _HAS_CXX20 _NODISCARD constexpr bool is_constant_evaluated() noexcept { From 53c6796369fcab013ab01010bcbae6d0b55352a1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 2 Jan 2022 14:20:59 +0200 Subject: [PATCH 34/60] reduce repeated code --- stl/src/vector_algorithms.cpp | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index b4de7f29ac..1031ce8638 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -30,6 +30,7 @@ static bool _Use_sse42() { return __isa_enabled & (1 << __ISA_AVAILABLE_SSE42); } +// Must be in sync with _Min_max_t in struct _Min_max_t { const void* _Min; const void* _Max; @@ -691,7 +692,6 @@ struct _Minmax_traits_2 { } }; - struct _Minmax_traits_4 { using _Signed_t = int32_t; using _Unsigned_t = uint32_t; @@ -788,44 +788,32 @@ struct _Minmax_traits_8 { return _mm_add_epi64(_Idx, _mm_set1_epi64x(1)); } - static __m128i _H_min(const __m128i _Cur) noexcept { + template + static __m128i _H_func(const __m128i _Cur, _Fn Funct) noexcept { _Signed_t _H_min_a = _Get_any(_Cur); _Signed_t _H_min_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); - if (_H_min_b < _H_min_a) { + if (Funct(_H_min_b, _H_min_a)) { _H_min_a = _H_min_b; } 
return _mm_set1_epi64x(_H_min_a); } + static __m128i _H_min(const __m128i _Cur) noexcept { + return _H_func(_Cur, [](_Signed_t _Lhs, _Signed_t _Rhs) { return _Lhs < _Rhs; }); + } + static __m128i _H_max(const __m128i _Cur) noexcept { - _Signed_t _H_max_a = _Get_any(_Cur); - _Signed_t _H_max_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); - if (_H_max_b > _H_max_a) { - _H_max_a = _H_max_b; - } - return _mm_set1_epi64x(_H_max_a); + return _H_func(_Cur, [](_Signed_t _Lhs, _Signed_t _Rhs) { return _Lhs > _Rhs; }); } static __m128i _H_min_u(const __m128i _Cur) noexcept { - _Unsigned_t _H_min_a = _Get_any(_Cur); - _Unsigned_t _H_min_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); - if (_H_min_b < _H_min_a) { - _H_min_a = _H_min_b; - } - return _mm_set1_epi64x(_H_min_a); + return _H_func(_Cur, [](_Unsigned_t _Lhs, _Unsigned_t _Rhs) { return _Lhs < _Rhs; }); } - static __m128i _H_max_u(const __m128i _Cur) noexcept { - _Unsigned_t _H_max_a = _Get_any(_Cur); - _Unsigned_t _H_max_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); - if (_H_max_b > _H_max_a) { - _H_max_a = _H_max_b; - } - return _mm_set1_epi64x(_H_max_a); + return _H_func(_Cur, [](_Unsigned_t _Lhs, _Unsigned_t _Rhs) { return _Lhs > _Rhs; }); } - static _Signed_t _Get_any(const __m128i _Cur) noexcept { #ifdef _M_IX86 return static_cast<_Signed_t>((static_cast<_Unsigned_t>(_mm_extract_epi32(_Cur, 1)) << 32) From fb7679d967d909adb11429c9f9675a4a46b51fd1 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 3 Jan 2022 16:13:39 +0200 Subject: [PATCH 35/60] Element in name --- stl/inc/algorithm | 18 +++++++++--------- stl/src/vector_algorithms.cpp | 26 +++++++++++++------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index d786d5637a..3e84046bde 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -18,6 +18,11 @@ _STL_DISABLE_CLANG_WARNINGS #undef new #if _USE_STD_VECTOR_ALGORITHMS +struct _Min_max_element_t { + const void* _Min; + const void* _Max; +}; + _EXTERN_C // The 
"noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms // won't be stored beyond the lifetime of the function, and that the function will only reference arrays denoted by @@ -35,11 +40,6 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4( __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( const void* _First, const void* _Last, void* _Dest) noexcept; -struct _Min_max_t { - const void* _Min; - const void* _Max; -}; - const void* __stdcall __std_min_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_min_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_min_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; @@ -50,10 +50,10 @@ const void* __stdcall __std_max_element_2(const void* _First, const void* _Last, const void* __stdcall __std_max_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; const void* __stdcall __std_max_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; -_Min_max_t __stdcall __std_minmax_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; -_Min_max_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; -_Min_max_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; -_Min_max_t __stdcall __std_minmax_element_8(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_element_t __stdcall __std_minmax_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_element_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_element_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; +_Min_max_element_t __stdcall __std_minmax_element_8(const void* _First, 
const void* _Last, bool _Signed) noexcept; _END_EXTERN_C template diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 1031ce8638..56a25ad751 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -31,7 +31,7 @@ static bool _Use_sse42() { } // Must be in sync with _Min_max_t in -struct _Min_max_t { +struct _Min_max_element_t { const void* _Min; const void* _Max; }; @@ -491,8 +491,8 @@ const void* _Max_tail(const void* const _First, const void* const _Last, const v } template -_Min_max_t _Both_tail( - const void* const _First, const void* const _Last, _Min_max_t& _Res, _Ty _Cur_min, _Ty _Cur_max) noexcept { +_Min_max_element_t _Both_tail( + const void* const _First, const void* const _Last, _Min_max_element_t& _Res, _Ty _Cur_min, _Ty _Cur_max) noexcept { for (auto _Ptr = static_cast(_First); _Ptr != _Last; ++_Ptr) { if (*_Ptr < _Cur_min) { _Res._Min = _Ptr; @@ -517,8 +517,8 @@ enum class _Min_max_mode { }; template <_Min_max_mode _Mode, class _STy, class _UTy> -auto _Minmax_tail( - const void* _First, const void* _Last, _Min_max_t& _Res, bool _Sign, _UTy _Cur_min, _UTy _Cur_max) noexcept { +auto _Minmax_tail(const void* _First, const void* _Last, _Min_max_element_t& _Res, bool _Sign, _UTy _Cur_min, + _UTy _Cur_max) noexcept { constexpr _UTy _Cor = (_UTy{1} << (sizeof(_UTy) * 8 - 1)); if constexpr (_Mode == _Min_max_mode::_Min_only) { @@ -851,10 +851,10 @@ struct _Minmax_traits_8 { // In optimized build it avoids extra call, as this function is too large to inline. 
template <_Min_max_mode _Mode, class _Traits> auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { - _Min_max_t _Res = {_First, _First}; - auto _Base = static_cast(_First); - auto _Cur_min_val = _Traits::_Init_min_val; - auto _Cur_max_val = _Traits::_Init_max_val; + _Min_max_element_t _Res = {_First, _First}; + auto _Base = static_cast(_First); + auto _Cur_min_val = _Traits::_Init_min_val; + auto _Cur_max_val = _Traits::_Init_max_val; if (_Byte_length(_First, _Last) >= 16 && _Use_sse42()) { size_t _Portion_size = _Byte_length(_First, _Last) & ~size_t{0xF}; @@ -1051,22 +1051,22 @@ const void* __stdcall __std_max_element_8( return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_8>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_1( +_Min_max_element_t __stdcall __std_minmax_element_1( const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_1>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_2( +_Min_max_element_t __stdcall __std_minmax_element_2( const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_2>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_4( +_Min_max_element_t __stdcall __std_minmax_element_4( const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_4>(_First, _Last, _Signed); } -_Min_max_t __stdcall __std_minmax_element_8( +_Min_max_element_t __stdcall __std_minmax_element_8( const void* const _First, const void* const _Last, const bool _Signed) noexcept { return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_8>(_First, _Last, _Signed); } From 8f91c0d69b400a78f701f6ae495ea63746c38360 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Mon, 3 Jan 2022 16:39:41 +0200 
Subject: [PATCH 36/60] optimization tuning for release only --- stl/src/vector_algorithms.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 56a25ad751..bb13edf541 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -24,7 +24,9 @@ extern "C" long __isa_enabled; +#ifndef _DEBUG #pragma optimize("t", on) // Override /Os with /Ot for this TU +#endif // !_DEBUG static bool _Use_sse42() { return __isa_enabled & (1 << __ISA_AVAILABLE_SSE42); From 661fd8650cbf433d328d33078d0f08a55a77f162 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 2 Apr 2022 11:22:27 +0300 Subject: [PATCH 37/60] Missing typename --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 451f82e889..c39ddbf3c2 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -1006,7 +1006,7 @@ auto __stdcall _Minmax_element(const void* _First, const void* const _Last, cons } } - return _Minmax_tail<_Mode, _Traits::_Signed_t, _Traits::_Unsigned_t>( + return _Minmax_tail<_Mode, typename _Traits::_Signed_t, typename _Traits::_Unsigned_t>( _First, _Last, _Res, _Sign, _Cur_min_val, _Cur_max_val); } From c451f69175e022bafc5cb334619a959d8c09a687 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 2 Apr 2022 16:21:45 +0300 Subject: [PATCH 38/60] pointer cast properly --- stl/inc/algorithm | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index cdfb479130..f594d01782 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -58,20 +58,19 @@ _END_EXTERN_C template _Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { - using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD 
is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - using cv_uintptr_ptr = _STD conditional_t<_STD is_const_v<_Ty>, const uintptr_t*, uintptr_t*>; - return __std_min_element(reinterpret_cast(_First), reinterpret_cast(_Last)); + return const_cast<_Ty*>(reinterpret_cast( + __std_min_element(reinterpret_cast(_First), reinterpret_cast(_Last)))); } else if constexpr (sizeof(_Ty) == 1) { - return static_cast<_Ty*>(const_cast(__std_min_element_1(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_min_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { - return static_cast<_Ty*>(const_cast(__std_min_element_2(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_min_element_2(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 4) { - return static_cast<_Ty*>(const_cast(__std_min_element_4(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_min_element_4(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 8) { - return static_cast<_Ty*>(const_cast(__std_min_element_8(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_min_element_8(_First, _Last, _Signed))); } else { static_assert(_STD _Always_false<_Ty>, "Unexpected size"); } @@ -79,20 +78,19 @@ _Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { template _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { - using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - using cv_uintptr_ptr = _STD conditional_t<_STD is_const_v<_Ty>, const uintptr_t*, uintptr_t*>; - return __std_max_element(reinterpret_cast(_First), reinterpret_cast(_Last)); + return const_cast<_Ty*>(reinterpret_cast( + __std_max_element(reinterpret_cast(_First), reinterpret_cast(_Last)))); } else if constexpr (sizeof(_Ty) == 1) { - return 
static_cast<_Ty*>(const_cast(__std_max_element_1(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_max_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { - return static_cast<_Ty*>(const_cast(__std_max_element_2(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_max_element_2(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 4) { - return static_cast<_Ty*>(const_cast(__std_max_element_4(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_max_element_4(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 8) { - return static_cast<_Ty*>(const_cast(__std_max_element_8(_First, _Last, _Signed))); + return const_cast<_Ty*>(static_cast(__std_max_element_8(_First, _Last, _Signed))); } else { static_assert(_STD _Always_false<_Ty>, "Unexpected size"); } @@ -100,24 +98,29 @@ _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { template _STD pair<_Ty*, _Ty*> __std_minmax_element(_Ty* _First, _Ty* _Last) noexcept { - using cv_pv = _STD conditional_t<_STD is_const_v<_Ty>, const void*, void*>; constexpr bool _Signed = _STD is_signed_v<_Ty>; if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - using cv_uintptr_ptr = _STD conditional_t<_STD is_const_v<_Ty>, const uintptr_t*, uintptr_t*>; - return __std_minmax_element(reinterpret_cast(_First), reinterpret_cast(_Last)); + auto _Res = + __std_minmax_element(reinterpret_cast(_First), reinterpret_cast(_Last)); + return {const_cast<_Ty*>(reinterpret_cast(_Res.first)), + const_cast<_Ty*>(reinterpret_cast(_Res.second))}; } else if constexpr (sizeof(_Ty) == 1) { auto _Res = __std_minmax_element_1(_First, _Last, _Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + return { + const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; } else if constexpr (sizeof(_Ty) == 2) { auto _Res = __std_minmax_element_2(_First, _Last, 
_Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + return { + const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; } else if constexpr (sizeof(_Ty) == 4) { auto _Res = __std_minmax_element_4(_First, _Last, _Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + return { + const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; } else if constexpr (sizeof(_Ty) == 8) { auto _Res = __std_minmax_element_8(_First, _Last, _Signed); - return {static_cast<_Ty*>(const_cast(_Res._Min)), static_cast<_Ty*>(const_cast(_Res._Max))}; + return { + const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; } else { static_assert(_STD _Always_false<_Ty>, "Unexpected size"); } From 87992c1f6b6d7275bb1ba625f5b9d9fda4dae36e Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 2 Apr 2022 17:24:25 +0300 Subject: [PATCH 39/60] narrative --- stl/inc/algorithm | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index f594d01782..f852e53ab0 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9163,15 +9163,16 @@ namespace ranges { #endif // __cpp_lib_concepts #endif // _HAS_CXX17 -template -_INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = - _Iterator_is_contiguous<_FwdIt>&& - conjunction_v>, is_pointer<_Iter_value_t<_FwdIt>>>, - disjunction>, +template > +_INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = // Activate the vector algorithms for min_/max_element? + _Iterator_is_contiguous<_Iter> // The iterator must be contiguous so we can get raw pointers. + && !_Iterator_is_volatile<_Iter> // The iterator must not be volatile. + && conjunction_v, is_pointer<_Elem>>, // Element is of integral or pointer type. 
+ disjunction< // And either of the following: #ifdef __cpp_lib_concepts - is_same<_Pr, _RANGES less>, + is_same<_Pr, _RANGES less>, // predicate is ranges::less #endif // __cpp_lib_concepts - is_same<_Pr, less<_Iter_value_t<_FwdIt>>>>> && !is_volatile_v>>; + is_same<_Pr, less<>>, is_same<_Pr, less<_Elem>>>>; // predicate is less template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element From 66943a860ac1c33c94c9af7cff7c6b5cafc62c44 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 5 Apr 2022 08:40:35 +0300 Subject: [PATCH 40/60] unnamed namespace --- stl/src/vector_algorithms.cpp | 52 ++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index c39ddbf3c2..667a073b54 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -28,43 +28,45 @@ extern "C" long __isa_enabled; #pragma optimize("t", on) // Override /Os with /Ot for this TU #endif // !_DEBUG -static bool _Use_sse42() { - return __isa_enabled & (1 << __ISA_AVAILABLE_SSE42); -} - // Must be in sync with _Min_max_t in struct _Min_max_element_t { const void* _Min; const void* _Max; }; -template -static void _Reverse_tail(_BidIt _First, _BidIt _Last) noexcept { - for (; _First != _Last && _First != --_Last; ++_First) { - const auto _Temp = *_First; - *_First = *_Last; - *_Last = _Temp; +namespace { + static bool _Use_sse42() { + return __isa_enabled & (1 << __ISA_AVAILABLE_SSE42); } -} -template -static void _Reverse_copy_tail(_BidIt _First, _BidIt _Last, _OutIt _Dest) noexcept { - while (_First != _Last) { - *_Dest++ = *--_Last; + template + static void _Reverse_tail(_BidIt _First, _BidIt _Last) noexcept { + for (; _First != _Last && _First != --_Last; ++_First) { + const auto _Temp = *_First; + *_First = *_Last; + *_Last = _Temp; + } } -} -static size_t _Byte_length(const void* _First, const void* _Last) noexcept { - 
return static_cast(_Last) - static_cast(_First); -} + template + static void _Reverse_copy_tail(_BidIt _First, _BidIt _Last, _OutIt _Dest) noexcept { + while (_First != _Last) { + *_Dest++ = *--_Last; + } + } -static void _Advance_bytes(void*& _Target, ptrdiff_t _Offset) noexcept { - _Target = static_cast(_Target) + _Offset; -} + static size_t _Byte_length(const void* _First, const void* _Last) noexcept { + return static_cast(_Last) - static_cast(_First); + } -static void _Advance_bytes(const void*& _Target, ptrdiff_t _Offset) noexcept { - _Target = static_cast(_Target) + _Offset; -} + static void _Advance_bytes(void*& _Target, ptrdiff_t _Offset) noexcept { + _Target = static_cast(_Target) + _Offset; + } + + static void _Advance_bytes(const void*& _Target, ptrdiff_t _Offset) noexcept { + _Target = static_cast(_Target) + _Offset; + } +} // unnamed namespace extern "C" { __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias( From d07751421af1a18f591f02330f25e59034440f03 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 5 Apr 2022 09:28:42 +0300 Subject: [PATCH 41/60] extern C --- stl/inc/algorithm | 3 ++- stl/src/vector_algorithms.cpp | 17 ++++++----------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 477f60f856..bc438d1fd8 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -18,12 +18,13 @@ _STL_DISABLE_CLANG_WARNINGS #undef new #if _USE_STD_VECTOR_ALGORITHMS + +_EXTERN_C struct _Min_max_element_t { const void* _Min; const void* _Max; }; -_EXTERN_C // The "noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms // won't be stored beyond the lifetime of the function, and that the function will only reference arrays denoted by // those pointers. 
The optimizer also assumes in that case that a pointer parameter is not returned to the caller via diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 4491d92d40..25287688eb 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -24,22 +24,11 @@ extern "C" long __isa_enabled; #pragma optimize("t", on) // Override /Os with /Ot for this TU #endif // !_DEBUG -// Must be in sync with _Min_max_t in -struct _Min_max_element_t { - const void* _Min; - const void* _Max; -}; - namespace { bool _Use_avx2() noexcept { return __isa_enabled & (1 << __ISA_AVAILABLE_AVX2); } - - static bool _Use_sse42() { - return __isa_enabled & (1 << __ISA_AVAILABLE_SSE42); - } - bool _Use_sse42() noexcept { return __isa_enabled & (1 << __ISA_AVAILABLE_SSE42); } @@ -82,6 +71,12 @@ namespace { } // unnamed namespace extern "C" { +// Must be in sync with _Min_max_t in +struct _Min_max_element_t { + const void* _Min; + const void* _Max; +}; + __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias( void* _First1, void* _Last1, void* _First2) noexcept { constexpr size_t _Mask_32 = ~((static_cast(1) << 5) - 1); From d5953f5765dc081bf5daedb95c6e813a7489b844 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 5 May 2022 18:41:44 +0300 Subject: [PATCH 42/60] more obvious all FFs --- stl/src/vector_algorithms.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 25287688eb..5f64e33d33 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -875,8 +875,8 @@ namespace { _Traits::_Cmp_eq(_H_min, _Cur_vals_min); // Mask of all elems eq to min int _Mask = _mm_movemask_epi8(_Eq_mask); // Indices of minimum elements or the greatest index if none - const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); - const __m128i _Idx_min_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_min, _Eq_mask); 
+ const __m128i _All_max = _mm_set1_epi8(static_cast(0xFF)); + const __m128i _Idx_min_val = _mm_blendv_epi8(_All_max, _Cur_idx_min, _Eq_mask); __m128i _Idx_min = _Traits::_H_min_u(_Idx_min_val); // The smallest indices // Select the smallest vertical indices from the smallest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_min, _Idx_min_val)); @@ -914,8 +914,8 @@ namespace { } else { // Looking for the first occurence of maximum // Indices of maximum elements or the greatest index if none - const __m128i _Minus_one = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); - const __m128i _Idx_max_val = _mm_blendv_epi8(_Minus_one, _Cur_idx_max, _Eq_mask); + const __m128i _All_max = _mm_set1_epi8(static_cast(0xFF)); + const __m128i _Idx_max_val = _mm_blendv_epi8(_All_max, _Cur_idx_max, _Eq_mask); const __m128i _Idx_max = _Traits::_H_min_u(_Idx_max_val); // The smallest indices // Select the smallest vertical indices from the largest element mask _Mask &= _mm_movemask_epi8(_Traits::_Cmp_eq(_Idx_max, _Idx_max_val)); From 881f1e25bc127322b84abd1b639c964e8ab7aa73 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 10 May 2022 09:15:18 +0300 Subject: [PATCH 43/60] bitwise flags, aligned loads also eliminated unused variable --- stl/src/vector_algorithms.cpp | 64 +++++++++++++++++------------------ 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 5f64e33d33..c1270a1095 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -485,10 +485,10 @@ namespace { return _Res; } - enum class _Min_max_mode { - _Min_only, - _Max_only, - _Both, + enum _Min_max_mode { + _Mode_min = 1, + _Mode_max = 2, + _Mode_both = 3, }; template <_Min_max_mode _Mode, class _STy, class _UTy> @@ -496,13 +496,13 @@ namespace { _UTy _Cur_max) noexcept { constexpr _UTy _Cor = (_UTy{1} << (sizeof(_UTy) * 8 - 1)); - if constexpr (_Mode == _Min_max_mode::_Min_only) { + if constexpr 
(_Mode == _Mode_min) { if (_Sign) { return _Min_tail(_First, _Last, _Res._Min, static_cast<_STy>(_Cur_min)); } else { return _Min_tail(_First, _Last, _Res._Min, static_cast<_UTy>(_Cur_min + _Cor)); } - } else if constexpr (_Mode == _Min_max_mode::_Max_only) { + } else if constexpr (_Mode == _Mode_max) { if (_Sign) { return _Max_tail(_First, _Last, _Res._Max, static_cast<_STy>(_Cur_max)); } else { @@ -531,7 +531,7 @@ namespace { static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr _Unsigned_t _Sign_cors[2][16] = { {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, {}}; - return _mm_sub_epi8(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + return _mm_sub_epi8(_Val, _mm_load_si128(reinterpret_cast(_Sign_cors[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -605,7 +605,7 @@ namespace { static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr _Unsigned_t _Sign_cors[2][8] = { 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, {}}; - return _mm_sub_epi16(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + return _mm_sub_epi16(_Val, _mm_load_si128(reinterpret_cast(_Sign_cors[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -685,7 +685,7 @@ namespace { static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { alignas(16) static constexpr _Unsigned_t _Sign_cors[2][4] = { 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, {}}; - return _mm_sub_epi32(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + return _mm_sub_epi32(_Val, _mm_load_si128(reinterpret_cast(_Sign_cors[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -694,8 +694,6 @@ namespace { template static __m128i _H_func(const __m128i _Cur, _Fn _Funct) noexcept { - const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - __m128i 
_H_min = _Cur; _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); @@ -757,7 +755,7 @@ namespace { static __m128i _Sign_cor(__m128i _Val, const bool _Sign) { alignas(16) static constexpr _Unsigned_t _Sign_cors[2][2] = { 0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL, {}}; - return _mm_sub_epi64(_Val, _mm_loadu_si128(reinterpret_cast(_Sign_cors[_Sign]))); + return _mm_sub_epi64(_Val, _mm_load_si128(reinterpret_cast(_Sign_cors[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -864,7 +862,7 @@ namespace { // Reached end or indices wrap around point. // Compute horizontal min and/or max. Determine horizontal and vertical position of it. - if constexpr (_Mode != _Min_max_mode::_Max_only) { + if constexpr ((_Mode & _Mode_min) != 0) { const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); // Vector populated by the smallest element const auto _H_min_val = _Traits::_Get_any(_H_min); // Get any element of it @@ -887,12 +885,12 @@ namespace { } } - if constexpr (_Mode != _Min_max_mode::_Min_only) { + if constexpr (_Mode & _Mode_max != 0) { const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it - if (_Mode == _Min_max_mode::_Both ? _Cur_max_val <= _H_max_val : _Cur_max_val < _H_max_val) { + if (_Mode == _Mode_both ? 
_Cur_max_val <= _H_max_val : _Cur_max_val < _H_max_val) { // max_element: current horizontal max is greater than the old, update max // minmax_element: current horizontal max is not less than the old, update max _Cur_max_val = _H_max_val; @@ -901,7 +899,7 @@ namespace { int _Mask = _mm_movemask_epi8(_Eq_mask); unsigned long _H_pos; - if constexpr (_Mode == _Min_max_mode::_Both) { + if constexpr (_Mode == _Mode_both) { // Looking for the last occurence of maximum // Indices of maximum elements or zero if none const __m128i _Idx_max_val = @@ -947,11 +945,11 @@ namespace { _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - if constexpr (_Mode != _Min_max_mode::_Max_only) { + if constexpr (( _Mode & _Mode_min) != 0) { _Cur_vals_min = _Cur_vals; _Cur_idx_min = _mm_setzero_si128(); } - if constexpr (_Mode != _Min_max_mode::_Min_only) { + if constexpr ((_Mode & _Mode_max) != 0) { _Cur_vals_max = _Cur_vals; _Cur_idx_max = _mm_setzero_si128(); } @@ -966,20 +964,20 @@ namespace { // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - if constexpr (_Mode != _Min_max_mode::_Max_only) { + if constexpr ((_Mode & _Mode_min) != 0) { // Looking for the first occurence of minimum, don't overwrite with newly found occurences const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } - if constexpr (_Mode == _Min_max_mode::_Max_only) { + if constexpr ((_Mode == _Mode_max) != 0) { // Looking for the first occurence of maximum, don't overwrite with newly found occurences const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max _Cur_idx_max = 
_mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); // Remember their vertical indices _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); // Update the current maximum - } else if constexpr (_Mode == _Min_max_mode::_Both) { + } else if constexpr (_Mode == _Mode_both) { // Looking for the last occurence of maximum, do overwrite with newly found occurences const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) @@ -999,62 +997,62 @@ extern "C" { const void* __stdcall __std_min_element_1( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_1>(_First, _Last, _Signed); + return _Minmax_element<_Mode_min, _Minmax_traits_1>(_First, _Last, _Signed); } const void* __stdcall __std_min_element_2( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_2>(_First, _Last, _Signed); + return _Minmax_element<_Mode_min, _Minmax_traits_2>(_First, _Last, _Signed); } const void* __stdcall __std_min_element_4( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_4>(_First, _Last, _Signed); + return _Minmax_element<_Mode_min, _Minmax_traits_4>(_First, _Last, _Signed); } const void* __stdcall __std_min_element_8( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Min_only, _Minmax_traits_8>(_First, _Last, _Signed); + return _Minmax_element<_Mode_min, _Minmax_traits_8>(_First, _Last, _Signed); } const void* __stdcall __std_max_element_1( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_1>(_First, _Last, _Signed); + return _Minmax_element<_Mode_max, _Minmax_traits_1>(_First, 
_Last, _Signed); } const void* __stdcall __std_max_element_2( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_2>(_First, _Last, _Signed); + return _Minmax_element<_Mode_max, _Minmax_traits_2>(_First, _Last, _Signed); } const void* __stdcall __std_max_element_4( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_4>(_First, _Last, _Signed); + return _Minmax_element<_Mode_max, _Minmax_traits_4>(_First, _Last, _Signed); } const void* __stdcall __std_max_element_8( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Max_only, _Minmax_traits_8>(_First, _Last, _Signed); + return _Minmax_element<_Mode_max, _Minmax_traits_8>(_First, _Last, _Signed); } _Min_max_element_t __stdcall __std_minmax_element_1( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_1>(_First, _Last, _Signed); + return _Minmax_element<_Mode_both, _Minmax_traits_1>(_First, _Last, _Signed); } _Min_max_element_t __stdcall __std_minmax_element_2( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_2>(_First, _Last, _Signed); + return _Minmax_element<_Mode_both, _Minmax_traits_2>(_First, _Last, _Signed); } _Min_max_element_t __stdcall __std_minmax_element_4( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return _Minmax_element<_Min_max_mode::_Both, _Minmax_traits_4>(_First, _Last, _Signed); + return _Minmax_element<_Mode_both, _Minmax_traits_4>(_First, _Last, _Signed); } _Min_max_element_t __stdcall __std_minmax_element_8( const void* const _First, const void* const _Last, const bool _Signed) noexcept { - return 
_Minmax_element<_Min_max_mode::_Both, _Minmax_traits_8>(_First, _Last, _Signed); + return _Minmax_element<_Mode_both, _Minmax_traits_8>(_First, _Last, _Signed); } } // extern "C" From 8912c707e07180457de9487919746f4c87d5fbeb Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 10 May 2022 10:40:44 +0300 Subject: [PATCH 44/60] format --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index c1270a1095..5a736056f2 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -945,7 +945,7 @@ namespace { _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - if constexpr (( _Mode & _Mode_min) != 0) { + if constexpr ((_Mode & _Mode_min) != 0) { _Cur_vals_min = _Cur_vals; _Cur_idx_min = _mm_setzero_si128(); } From c02cb635e470f1d062e8dd85e3d10bf540286764 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Tue, 10 May 2022 10:55:40 +0300 Subject: [PATCH 45/60] brace --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 5a736056f2..f8bd000c2d 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -885,7 +885,7 @@ namespace { } } - if constexpr (_Mode & _Mode_max != 0) { + if constexpr ((_Mode & _Mode_max) != 0) { const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it From ef213be7bf5e2f3fe408c967fc2f4f2f1f1609e2 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 12 May 2022 09:10:38 +0300 Subject: [PATCH 46/60] unternary --- stl/src/vector_algorithms.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index f8bd000c2d..04e6d88e0b 100644 --- a/stl/src/vector_algorithms.cpp 
+++ b/stl/src/vector_algorithms.cpp @@ -890,7 +890,8 @@ namespace { _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it - if (_Mode == _Mode_both ? _Cur_max_val <= _H_max_val : _Cur_max_val < _H_max_val) { + if (_Mode == _Mode_both && _Cur_max_val <= _H_max_val + || _Mode == _Mode_max && _Cur_max_val < _H_max_val) { // max_element: current horizontal max is greater than the old, update max // minmax_element: current horizontal max is not less than the old, update max _Cur_max_val = _H_max_val; From 40c323f7244c0054ed6a6ee7ca5d10c3b0616e35 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 12 May 2022 15:45:49 +0300 Subject: [PATCH 47/60] Update vector_algorithms.cpp --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 04e6d88e0b..140628019e 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -972,7 +972,7 @@ namespace { _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } - if constexpr ((_Mode == _Mode_max) != 0) { + if constexpr (_Mode == _Mode_max != 0) { // Looking for the first occurence of maximum, don't overwrite with newly found occurences const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max _Cur_idx_max = From 856da66b84cea3bb4ea153452769bf135717a866 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Thu, 12 May 2022 15:55:12 +0300 Subject: [PATCH 48/60] _Ugly --- stl/src/vector_algorithms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 140628019e..dd2b10271b 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -763,10 +763,10 @@ namespace { } template - static __m128i _H_func(const __m128i _Cur, 
_Fn Funct) noexcept { + static __m128i _H_func(const __m128i _Cur, _Fn _Funct) noexcept { _Signed_t _H_min_a = _Get_any(_Cur); _Signed_t _H_min_b = _Get_any(_mm_bsrli_si128(_Cur, 8)); - if (Funct(_H_min_b, _H_min_a)) { + if (_Funct(_H_min_b, _H_min_a)) { _H_min_a = _H_min_b; } return _mm_set1_epi64x(_H_min_a); From 7e7683fd92a19b32418df18a96e09288f42b79c8 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Fri, 13 May 2022 19:44:07 +0300 Subject: [PATCH 49/60] review comments --- stl/inc/algorithm | 8 ++++---- stl/src/vector_algorithms.cpp | 35 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index bc438d1fd8..bf7de92be8 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9181,7 +9181,7 @@ namespace ranges { #endif // _HAS_CXX17 template > -_INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = // Activate the vector algorithms for min_/max_element? +_INLINE_VAR constexpr bool _Is_min_max_optimization_safe = // Activate the vector algorithms for min_/max_element? _Iterator_is_contiguous<_Iter> // The iterator must be contiguous so we can get raw pointers. && !_Iterator_is_volatile<_Iter> // The iterator must not be volatile. && conjunction_v, is_pointer<_Elem>>, // Element is of integral or pointer type. 
@@ -9194,7 +9194,7 @@ _INLINE_VAR constexpr bool _Is_min_max_optimization_safe_v = // Activate the vec template constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element #if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) - if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { + if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_max_element(_First_ptr, _To_address(_Last)); @@ -9299,7 +9299,7 @@ namespace ranges { template constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element #if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) - if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { + if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_min_element(_First_ptr, _To_address(_Last)); @@ -9404,7 +9404,7 @@ namespace ranges { template constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { #if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) - if constexpr (_Is_min_max_optimization_safe_v<_FwdIt, _Pr>) { + if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); const auto _Result = __std_minmax_element(_First_ptr, _To_address(_Last)); diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index dd2b10271b..216741f665 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -831,18 +831,18 @@ namespace { auto _Cur_max_val = _Traits::_Init_max_val; if (_Byte_length(_First, _Last) >= 16 && _Use_sse42()) { - size_t _Portion_size = _Byte_length(_First, _Last) & ~size_t{0xF}; + size_t _Portion_byte_size = _Byte_length(_First, _Last) & ~size_t{0xF}; if constexpr 
(_Traits::_Has_portion_max) { // vector of indices will wrap around at exactly this size - constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; - if (_Portion_size > _Max_portion_size) { - _Portion_size = _Max_portion_size; + constexpr size_t _Max_portion_byte_size = _Traits::_Portion_max * 16; + if (_Portion_byte_size > _Max_portion_byte_size) { + _Portion_byte_size = _Max_portion_byte_size; } } const void* _Stop_at = _First; - _Advance_bytes(_Stop_at, _Portion_size); + _Advance_bytes(_Stop_at, _Portion_byte_size); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); @@ -862,7 +862,7 @@ namespace { // Reached end or indices wrap around point. // Compute horizontal min and/or max. Determine horizontal and vertical position of it. - if constexpr ((_Mode & _Mode_min) != 0) { + if constexpr ((_Mode & _Mode_min) != 0) { // TRANSITION, 17.3 Preview 2 const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); // Vector populated by the smallest element const auto _H_min_val = _Traits::_Get_any(_H_min); // Get any element of it @@ -885,7 +885,7 @@ namespace { } } - if constexpr ((_Mode & _Mode_max) != 0) { + if constexpr ((_Mode & _Mode_max) != 0) { // TRANSITION, 17.3 Preview 2 const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it @@ -929,28 +929,28 @@ namespace { if constexpr (_Traits::_Has_portion_max) { // Either the last portion or wrapping point reached, need to determine - _Portion_size = _Byte_length(_First, _Last) & ~size_t{0xF}; - if (_Portion_size == 0) { + _Portion_byte_size = _Byte_length(_First, _Last) & ~size_t{0xF}; + if (_Portion_byte_size == 0) { break; // That was the last portion } // Start next portion to handle the wrapping indices. 
Assume _Cur_idx is zero - constexpr size_t _Max_portion_size = _Traits::_Portion_max * 16; - if (_Portion_size > _Max_portion_size) { - _Portion_size = _Max_portion_size; + constexpr size_t _Max_portion_byte_size = _Traits::_Portion_max * 16; + if (_Portion_byte_size > _Max_portion_byte_size) { + _Portion_byte_size = _Max_portion_byte_size; } - _Advance_bytes(_Stop_at, _Portion_size); + _Advance_bytes(_Stop_at, _Portion_byte_size); // Indices will be relative to the new base _Base = static_cast(_First); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - if constexpr ((_Mode & _Mode_min) != 0) { + if constexpr ((_Mode & _Mode_min) != 0) { // TRANSITION, 17.3 Preview 2 _Cur_vals_min = _Cur_vals; _Cur_idx_min = _mm_setzero_si128(); } - if constexpr ((_Mode & _Mode_max) != 0) { + if constexpr ((_Mode & _Mode_max) != 0) { // TRANSITION, 17.3 Preview 2 _Cur_vals_max = _Cur_vals; _Cur_idx_max = _mm_setzero_si128(); } @@ -965,14 +965,15 @@ namespace { // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - if constexpr ((_Mode & _Mode_min) != 0) { + if constexpr ((_Mode & _Mode_min) != 0) { // TRANSITION, 17.3 Preview 2 + // Looking for the first occurence of minimum, don't overwrite with newly found occurences const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } - if constexpr (_Mode == _Mode_max != 0) { + if constexpr (_Mode == _Mode_max) { // TRANSITION, 17.3 Preview 2 // Looking for the first occurence of maximum, don't overwrite with newly found occurences const __m128i _Is_greater = 
_Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max _Cur_idx_max = From 4f332e356a579bc176b0d4d20607e65f74c489de Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 May 2022 06:10:23 +0300 Subject: [PATCH 50/60] +coverage --- .../VSO_0000000_vector_algorithms/test.cpp | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index f142a86456..ac748287d6 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include +#include #include #include #include @@ -174,6 +175,50 @@ void test_min_max_element(mt19937_64& gen) { } } +void test_min_max_element_special_cases() { + // multi portion and same vector cases tested explicitly + // made sure valid for vector sizes 128,256,512 + + array test; + + test.fill(1); + test.at(65) = 0; + test.at(66) = 0; + test.at(68) = 2; + test.at(69) = 2; + + assert(min_element(test.begin(), test.end()) == test.begin() + 65); + assert(max_element(test.begin(), test.end()) == test.begin() + 68); + assert(minmax_element(test.begin(), test.end()).first == test.begin() + 65); + assert(minmax_element(test.begin(), test.end()).second == test.begin() + 69); + + test.fill(1); + test.at(65 + 4096) = 0; + test.at(66 + 4096) = 0; + test.at(68 + 4096) = 2; + test.at(69 + 4096) = 2; + + assert(min_element(test.begin(), test.end()) == test.begin() + 65 + 4096); + assert(max_element(test.begin(), test.end()) == test.begin() + 68 + 4096); + assert(minmax_element(test.begin(), test.end()).first == test.begin() + 65 + 4096); + assert(minmax_element(test.begin(), test.end()).second == test.begin() + 69 + 4096); + + test.fill(1); + test.at(65) = 0; + test.at(66) = 0; + test.at(68) = 2; + test.at(69) = 2; + test.at(65 + 4096) = 0; + test.at(66 + 4096) = 0; + 
test.at(68 + 4096) = 2; + test.at(69 + 4096) = 2; + + assert(min_element(test.begin(), test.end()) == test.begin() + 65); + assert(max_element(test.begin(), test.end()) == test.begin() + 68); + assert(minmax_element(test.begin(), test.end()).first == test.begin() + 65); + assert(minmax_element(test.begin(), test.end()).second == test.begin() + 69 + 4096); +} + template inline void last_known_good_reverse(BidIt first, BidIt last) { for (; first != last && first != --last; ++first) { @@ -294,6 +339,8 @@ void test_vector_algorithms() { test_min_max_element(gen); test_min_max_element(gen); + test_min_max_element_special_cases(); + test_reverse(gen); test_reverse(gen); test_reverse(gen); From bdf7d8289847544834038bb455af36fe809de671 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 May 2022 06:23:23 +0300 Subject: [PATCH 51/60] +coverage --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index ac748287d6..d216468409 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -181,6 +181,12 @@ void test_min_max_element_special_cases() { array test; + test.fill(1); + assert(min_element(test.begin(), test.end()) == test.begin() ); + assert(max_element(test.begin(), test.end()) == test.begin()); + assert(minmax_element(test.begin(), test.end()).first == test.begin()); + assert(minmax_element(test.begin(), test.end()).second == test.begin() + 8191); + test.fill(1); test.at(65) = 0; test.at(66) = 0; From b79b19452f830438b36b120ba9b482ff32336433 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 14 May 2022 06:25:10 +0300 Subject: [PATCH 52/60] format --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index d216468409..66f18ab4b4 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -182,7 +182,7 @@ void test_min_max_element_special_cases() { array test; test.fill(1); - assert(min_element(test.begin(), test.end()) == test.begin() ); + assert(min_element(test.begin(), test.end()) == test.begin()); assert(max_element(test.begin(), test.end()) == test.begin()); assert(minmax_element(test.begin(), test.end()).first == test.begin()); assert(minmax_element(test.begin(), test.end()).second == test.begin() + 8191); From a6ffe632f92a0ba96a4a3ac65044c393b002954f Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 May 2022 16:49:16 +0300 Subject: [PATCH 53/60] enhance special cases --- .../VSO_0000000_vector_algorithms/test.cpp | 98 +++++++++---------- 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 66f18ab4b4..4c3d7d3e6f 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -#include #include #include #include @@ -10,7 +9,6 @@ #include #include #include -#include using namespace std; @@ -175,54 +173,52 @@ void test_min_max_element(mt19937_64& gen) { } } +template void test_min_max_element_special_cases() { - // multi portion and same vector cases tested explicitly - // made sure valid for vector sizes 128,256,512 - - array test; - - test.fill(1); - assert(min_element(test.begin(), test.end()) == test.begin()); - assert(max_element(test.begin(), test.end()) == test.begin()); - assert(minmax_element(test.begin(), test.end()).first == test.begin()); - assert(minmax_element(test.begin(), 
test.end()).second == test.begin() + 8191); - - test.fill(1); - test.at(65) = 0; - test.at(66) = 0; - test.at(68) = 2; - test.at(69) = 2; - - assert(min_element(test.begin(), test.end()) == test.begin() + 65); - assert(max_element(test.begin(), test.end()) == test.begin() + 68); - assert(minmax_element(test.begin(), test.end()).first == test.begin() + 65); - assert(minmax_element(test.begin(), test.end()).second == test.begin() + 69); - - test.fill(1); - test.at(65 + 4096) = 0; - test.at(66 + 4096) = 0; - test.at(68 + 4096) = 2; - test.at(69 + 4096) = 2; - - assert(min_element(test.begin(), test.end()) == test.begin() + 65 + 4096); - assert(max_element(test.begin(), test.end()) == test.begin() + 68 + 4096); - assert(minmax_element(test.begin(), test.end()).first == test.begin() + 65 + 4096); - assert(minmax_element(test.begin(), test.end()).second == test.begin() + 69 + 4096); - - test.fill(1); - test.at(65) = 0; - test.at(66) = 0; - test.at(68) = 2; - test.at(69) = 2; - test.at(65 + 4096) = 0; - test.at(66 + 4096) = 0; - test.at(68 + 4096) = 2; - test.at(69 + 4096) = 2; - - assert(min_element(test.begin(), test.end()) == test.begin() + 65); - assert(max_element(test.begin(), test.end()) == test.begin() + 68); - assert(minmax_element(test.begin(), test.end()).first == test.begin() + 65); - assert(minmax_element(test.begin(), test.end()).second == test.begin() + 69 + 4096); + constexpr size_t block_size_in_vectors = 1 << (sizeof(ElementType) * CHAR_BIT); + constexpr size_t block_size_in_elements = block_size_in_vectors * VectorSize; + constexpr size_t num_blocks = 4; + constexpr size_t tail_size = 13; + constexpr size_t array_size = num_blocks * block_size_in_elements + tail_size; + constexpr size_t last_block_first_elem = (num_blocks - 1) * block_size_in_elements; + constexpr size_t last_vector_first_elem = (block_size_in_vectors - 1) * VectorSize; + + vector v(array_size); // not array to avoid large data on stack + + // all equal + fill(v.begin(), v.end(), 
ElementType{1}); + assert(min_element(v.begin(), v.end()) == v.begin()); + assert(max_element(v.begin(), v.end()) == v.begin()); + assert(minmax_element(v.begin(), v.end()).first == v.begin()); + assert(minmax_element(v.begin(), v.end()).second == v.end() - 1); + + // same position in different blocks + fill(v.begin(), v.end(), ElementType{1}); + for (size_t block_pos = 0; block_pos != num_blocks; ++block_pos) { + v.at(block_pos * block_size_in_elements + 20 * VectorSize + 2) = 0; + v.at(block_pos * block_size_in_elements + 20 * VectorSize + 5) = 0; + v.at(block_pos * block_size_in_elements + 25 * VectorSize + 6) = 2; + v.at(block_pos * block_size_in_elements + 25 * VectorSize + 9) = 2; + } + assert(min_element(v.begin(), v.end()) == v.begin() + 20 * VectorSize + 2); + assert(max_element(v.begin(), v.end()) == v.begin() + 25 * VectorSize + 6); + assert(minmax_element(v.begin(), v.end()).first == v.begin() + 20 * VectorSize + 2); + assert(minmax_element(v.begin(), v.end()).second == v.begin() + last_block_first_elem + 25 * VectorSize + 9); + + + // same block in different vectors + fill(v.begin(), v.end(), ElementType{1}); + for (size_t vector_pos = 0; vector_pos != block_size_in_vectors; ++vector_pos) { + v.at(2 * block_size_in_elements + vector_pos * VectorSize + 2) = 0; + v.at(2 * block_size_in_elements + vector_pos * VectorSize + 5) = 0; + v.at(2 * block_size_in_elements + vector_pos * VectorSize + 6) = 2; + v.at(2 * block_size_in_elements + vector_pos * VectorSize + 9) = 2; + } + assert(min_element(v.begin(), v.end()) == v.begin() + 2 * block_size_in_elements + 2); + assert(max_element(v.begin(), v.end()) == v.begin() + 2 * block_size_in_elements + 6); + assert(minmax_element(v.begin(), v.end()).first == v.begin() + 2 * block_size_in_elements + 2); + assert(minmax_element(v.begin(), v.end()).second + == v.begin() + 2 * block_size_in_elements + last_vector_first_elem + 9); } template @@ -345,7 +341,9 @@ void test_vector_algorithms() { test_min_max_element(gen); 
test_min_max_element(gen); - test_min_max_element_special_cases(); + test_min_max_element_special_cases(); // SSE2 vectors + test_min_max_element_special_cases(); // AVX2 vectors + test_min_max_element_special_cases(); // AVX512 vectors test_reverse(gen); test_reverse(gen); From 9c6ba048a27aaa92222253f518ef6a53b1610b29 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 May 2022 17:07:56 +0300 Subject: [PATCH 54/60] format --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 4c3d7d3e6f..ddb5745934 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -182,8 +182,8 @@ void test_min_max_element_special_cases() { constexpr size_t array_size = num_blocks * block_size_in_elements + tail_size; constexpr size_t last_block_first_elem = (num_blocks - 1) * block_size_in_elements; constexpr size_t last_vector_first_elem = (block_size_in_vectors - 1) * VectorSize; - - vector v(array_size); // not array to avoid large data on stack + + vector v(array_size); // not array to avoid large data on stack // all equal fill(v.begin(), v.end(), ElementType{1}); From 17e3b251d36674806d8a51275d7fbe37eaf609a2 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sat, 28 May 2022 17:25:01 +0300 Subject: [PATCH 55/60] bring back a needed include --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index ddb5745934..3189e41da5 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -9,6 +9,7 @@ #include #include #include +#include using namespace std; From 
56799e8738846cfc4fae4b3813e2229dfa4c60dd Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 17 Jun 2022 22:24:01 -0700 Subject: [PATCH 56/60] Code review feedback. --- stl/inc/algorithm | 39 ++++------ stl/src/vector_algorithms.cpp | 72 +++++++++---------- .../VSO_0000000_vector_algorithms/test.cpp | 52 +++++++++----- 3 files changed, 84 insertions(+), 79 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index d2f6ccbf0f..c0cfe0e3ca 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -61,10 +61,7 @@ template _Ty* __std_min_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; - if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return const_cast<_Ty*>(reinterpret_cast( - __std_min_element(reinterpret_cast(_First), reinterpret_cast(_Last)))); - } else if constexpr (sizeof(_Ty) == 1) { + if constexpr (sizeof(_Ty) == 1) { return const_cast<_Ty*>(static_cast(__std_min_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { return const_cast<_Ty*>(static_cast(__std_min_element_2(_First, _Last, _Signed))); @@ -81,10 +78,7 @@ template _Ty* __std_max_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; - if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - return const_cast<_Ty*>(reinterpret_cast( - __std_max_element(reinterpret_cast(_First), reinterpret_cast(_Last)))); - } else if constexpr (sizeof(_Ty) == 1) { + if constexpr (sizeof(_Ty) == 1) { return const_cast<_Ty*>(static_cast(__std_max_element_1(_First, _Last, _Signed))); } else if constexpr (sizeof(_Ty) == 2) { return const_cast<_Ty*>(static_cast(__std_max_element_2(_First, _Last, _Signed))); @@ -101,30 +95,21 @@ template _STD pair<_Ty*, _Ty*> __std_minmax_element(_Ty* _First, _Ty* _Last) noexcept { constexpr bool _Signed = _STD is_signed_v<_Ty>; - if constexpr (_STD is_pointer_v<_Ty> || _STD is_null_pointer_v<_Ty>) { - auto _Res = - 
__std_minmax_element(reinterpret_cast(_First), reinterpret_cast(_Last)); - return {const_cast<_Ty*>(reinterpret_cast(_Res.first)), - const_cast<_Ty*>(reinterpret_cast(_Res.second))}; - } else if constexpr (sizeof(_Ty) == 1) { - auto _Res = __std_minmax_element_1(_First, _Last, _Signed); - return { - const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; + _Min_max_element_t _Res; + + if constexpr (sizeof(_Ty) == 1) { + _Res = __std_minmax_element_1(_First, _Last, _Signed); } else if constexpr (sizeof(_Ty) == 2) { - auto _Res = __std_minmax_element_2(_First, _Last, _Signed); - return { - const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; + _Res = __std_minmax_element_2(_First, _Last, _Signed); } else if constexpr (sizeof(_Ty) == 4) { - auto _Res = __std_minmax_element_4(_First, _Last, _Signed); - return { - const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; + _Res = __std_minmax_element_4(_First, _Last, _Signed); } else if constexpr (sizeof(_Ty) == 8) { - auto _Res = __std_minmax_element_8(_First, _Last, _Signed); - return { - const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; + _Res = __std_minmax_element_8(_First, _Last, _Signed); } else { static_assert(_STD _Always_false<_Ty>, "Unexpected size"); } + + return {const_cast<_Ty*>(static_cast(_Res._Min)), const_cast<_Ty*>(static_cast(_Res._Max))}; } #endif // _USE_STD_VECTOR_ALGORITHMS @@ -9617,7 +9602,7 @@ constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _ const auto _First_ptr = _To_address(_First); const auto _Result = __std_minmax_element(_First_ptr, _To_address(_Last)); if constexpr (is_pointer_v<_FwdIt>) { - return {_Result.first, _Result.second}; + return _Result; } else { return {_First + (_Result.first - _First_ptr), _First + (_Result.second - _First_ptr)}; } diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 
216741f665..9e20798ba8 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -71,7 +71,7 @@ namespace { } // unnamed namespace extern "C" { -// Must be in sync with _Min_max_t in +// Must be in sync with _Min_max_element_t in struct _Min_max_element_t { const void* _Min; const void* _Max; @@ -486,15 +486,15 @@ namespace { } enum _Min_max_mode { - _Mode_min = 1, - _Mode_max = 2, - _Mode_both = 3, + _Mode_min = 1 << 0, + _Mode_max = 1 << 1, + _Mode_both = _Mode_min | _Mode_max, }; template <_Min_max_mode _Mode, class _STy, class _UTy> auto _Minmax_tail(const void* _First, const void* _Last, _Min_max_element_t& _Res, bool _Sign, _UTy _Cur_min, _UTy _Cur_max) noexcept { - constexpr _UTy _Cor = (_UTy{1} << (sizeof(_UTy) * 8 - 1)); + constexpr _UTy _Cor = _UTy{1} << (sizeof(_UTy) * 8 - 1); if constexpr (_Mode == _Mode_min) { if (_Sign) { @@ -543,12 +543,12 @@ namespace { const __m128i _Shuf_bytes = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - __m128i _H_min = _Cur; - _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - _H_min = _Funct(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); - _H_min = _Funct(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_bytes)); - return _H_min; + __m128i _H_min_val = _Cur; + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi8(_H_min_val, _Shuf_words)); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi8(_H_min_val, _Shuf_bytes)); + return _H_min_val; } static __m128i _H_min(const __m128i _Cur) noexcept { @@ -587,7 +587,7 @@ namespace { return _mm_min_epi8(_First, _Second); } - static __m128i _Max(const __m128i _First, __m128i 
_Second, __m128i) noexcept { + static __m128i _Max(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_max_epi8(_First, _Second); } }; @@ -616,11 +616,11 @@ namespace { static __m128i _H_func(const __m128i _Cur, _Fn _Funct) noexcept { const __m128i _Shuf_words = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - __m128i _H_min = _Cur; - _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - _H_min = _Funct(_H_min, _mm_shuffle_epi8(_H_min, _Shuf_words)); - return _H_min; + __m128i _H_min_val = _Cur; + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(2, 3, 0, 1))); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi8(_H_min_val, _Shuf_words)); + return _H_min_val; } static __m128i _H_min(const __m128i _Cur) noexcept { @@ -644,8 +644,7 @@ namespace { } static _Unsigned_t _Get_v_pos(const __m128i _Idx, const unsigned long _H_pos) noexcept { - static constexpr const _Unsigned_t _Shuf[] = { - 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E}; + static constexpr _Unsigned_t _Shuf[] = {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E}; return static_cast<_Unsigned_t>( _mm_cvtsi128_si32(_mm_shuffle_epi8(_Idx, _mm_cvtsi32_si128(_Shuf[_H_pos >> 1])))); @@ -659,11 +658,11 @@ namespace { return _mm_cmpgt_epi16(_First, _Second); } - static __m128i _Min(const __m128i _First, const __m128i _Second, const __m128i) noexcept { + static __m128i _Min(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_min_epi16(_First, _Second); } - static __m128i _Max(const __m128i _First, const __m128i _Second, const __m128i) noexcept { + static __m128i _Max(const __m128i _First, const __m128i _Second, __m128i) noexcept { return _mm_max_epi16(_First, _Second); } }; @@ -694,10 +693,10 @@ 
namespace { template static __m128i _H_func(const __m128i _Cur, _Fn _Funct) noexcept { - __m128i _H_min = _Cur; - _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(1, 0, 3, 2))); - _H_min = _Funct(_H_min, _mm_shuffle_epi32(_H_min, _MM_SHUFFLE(2, 3, 0, 1))); - return _H_min; + __m128i _H_min_val = _Cur; + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(1, 0, 3, 2))); + _H_min_val = _Funct(_H_min_val, _mm_shuffle_epi32(_H_min_val, _MM_SHUFFLE(2, 3, 0, 1))); + return _H_min_val; } static __m128i _H_min(const __m128i _Cur) noexcept { @@ -752,7 +751,7 @@ namespace { static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7FFF'FFFF'FFFF'FFFFULL); static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x8000'0000'0000'0000ULL); - static __m128i _Sign_cor(__m128i _Val, const bool _Sign) { + static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) { alignas(16) static constexpr _Unsigned_t _Sign_cors[2][2] = { 0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL, {}}; return _mm_sub_epi64(_Val, _mm_load_si128(reinterpret_cast(_Sign_cors[_Sign]))); @@ -792,7 +791,7 @@ namespace { #ifdef _M_IX86 return static_cast<_Signed_t>((static_cast<_Unsigned_t>(_mm_extract_epi32(_Cur, 1)) << 32) | static_cast<_Unsigned_t>(_mm_cvtsi128_si32(_Cur))); -#else // ^^^ x86 ^^^ / // vvv x64 vvv +#else // ^^^ x86 ^^^ / vvv x64 vvv return static_cast<_Signed_t>(_mm_cvtsi128_si64(_Cur)); #endif // ^^^ x64 ^^^ } @@ -820,9 +819,10 @@ namespace { } }; - // Exactly the same signature with __std_min_element_N / __std_max_element_N / __std_minmax_element_N, - // up to calling convention. This makes sure the template specialization is fused with the export function. - // In optimized build it avoids extra call, as this function is too large to inline. + // _Minmax_element has exactly the same signature as the extern "C" functions + // (__std_min_element_N, __std_max_element_N, __std_minmax_element_N), up to calling convention. 
+ // This makes sure the template specialization is fused with the extern "C" function. + // In optimized builds it avoids an extra call, as this function is too large to inline. template <_Min_max_mode _Mode, class _Traits> auto __stdcall _Minmax_element(const void* _First, const void* const _Last, const bool _Sign) noexcept { _Min_max_element_t _Res = {_First, _First}; @@ -901,7 +901,7 @@ namespace { unsigned long _H_pos; if constexpr (_Mode == _Mode_both) { - // Looking for the last occurence of maximum + // Looking for the last occurrence of maximum // Indices of maximum elements or zero if none const __m128i _Idx_max_val = _mm_blendv_epi8(_mm_setzero_si128(), _Cur_idx_max, _Eq_mask); @@ -911,7 +911,7 @@ namespace { _BitScanReverse(&_H_pos, _Mask); // Find the largest horizontal index _H_pos -= sizeof(_Cur_max_val) - 1; // Correct from highest val bit to lowest } else { - // Looking for the first occurence of maximum + // Looking for the first occurrence of maximum // Indices of maximum elements or the greatest index if none const __m128i _All_max = _mm_set1_epi8(static_cast(0xFF)); const __m128i _Idx_max_val = _mm_blendv_epi8(_All_max, _Cur_idx_max, _Eq_mask); @@ -950,6 +950,7 @@ namespace { _Cur_vals_min = _Cur_vals; _Cur_idx_min = _mm_setzero_si128(); } + if constexpr ((_Mode & _Mode_max) != 0) { // TRANSITION, 17.3 Preview 2 _Cur_vals_max = _Cur_vals; _Cur_idx_max = _mm_setzero_si128(); @@ -966,21 +967,20 @@ namespace { _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); if constexpr ((_Mode & _Mode_min) != 0) { // TRANSITION, 17.3 Preview 2 - - // Looking for the first occurence of minimum, don't overwrite with newly found occurences + // Looking for the first occurrence of minimum, don't overwrite with newly found occurrences const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices 
_Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } if constexpr (_Mode == _Mode_max) { // TRANSITION, 17.3 Preview 2 - // Looking for the first occurence of maximum, don't overwrite with newly found occurences + // Looking for the first occurrence of maximum, don't overwrite with newly found occurrences const __m128i _Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max _Cur_idx_max = _mm_blendv_epi8(_Cur_idx_max, _Cur_idx, _Is_greater); // Remember their vertical indices _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals, _Is_greater); // Update the current maximum } else if constexpr (_Mode == _Mode_both) { - // Looking for the last occurence of maximum, do overwrite with newly found occurences + // Looking for the last occurrence of maximum, do overwrite with newly found occurrences const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_max, _Cur_vals); // !(_Cur_vals >= _Cur_vals_max) _Cur_idx_max = _mm_blendv_epi8(_Cur_idx, _Cur_idx_max, _Is_less); // Remember their vertical indices diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 3189e41da5..aaaee36733 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -84,7 +84,7 @@ void test_find(mt19937_64& gen) { } template -FwdIt last_good_known_min_element(FwdIt first, FwdIt last) { +FwdIt last_known_good_min_element(FwdIt first, FwdIt last) { FwdIt result = first; for (; first != last; ++first) { @@ -97,7 +97,7 @@ FwdIt last_good_known_min_element(FwdIt first, FwdIt last) { } template -FwdIt last_good_known_max_element(FwdIt first, FwdIt last) { +FwdIt last_known_good_max_element(FwdIt first, FwdIt last) { FwdIt result = first; for (; first != last; ++first) { @@ -110,7 +110,7 @@ FwdIt last_good_known_max_element(FwdIt first, FwdIt last) { } template -std::pair 
last_good_known_minmax_element(FwdIt first, FwdIt last) { +pair last_known_good_minmax_element(FwdIt first, FwdIt last) { // find smallest and largest elements pair found(first, first); @@ -128,6 +128,7 @@ std::pair last_good_known_minmax_element(FwdIt first, FwdIt last) if (*next < *found.first) { found.first = next; } + if (!(*first < *found.second)) { found.second = first; } @@ -135,6 +136,7 @@ std::pair last_good_known_minmax_element(FwdIt first, FwdIt last) if (*first < *found.first) { found.first = first; } + if (!(*next < *found.second)) { found.second = next; } @@ -149,9 +151,9 @@ std::pair last_good_known_minmax_element(FwdIt first, FwdIt last) template void test_case_min_max_element(const vector& input) { - auto expected_min = last_good_known_min_element(input.begin(), input.end()); - auto expected_max = last_good_known_max_element(input.begin(), input.end()); - auto expected_minmax = last_good_known_minmax_element(input.begin(), input.end()); + auto expected_min = last_known_good_min_element(input.begin(), input.end()); + auto expected_max = last_known_good_max_element(input.begin(), input.end()); + auto expected_minmax = last_known_good_minmax_element(input.begin(), input.end()); auto actual_min = min_element(input.begin(), input.end()); auto actual_max = max_element(input.begin(), input.end()); auto actual_minmax = minmax_element(input.begin(), input.end()); @@ -162,8 +164,10 @@ void test_case_min_max_element(const vector& input) { template void test_min_max_element(mt19937_64& gen) { - auto dis = conditional_t, uniform_real_distribution, - conditional_t<(sizeof(T) > 1), uniform_int_distribution, uniform_int_distribution>>(1, 20); + using Distribution = conditional_t, uniform_real_distribution, + conditional_t<(sizeof(T) > 1), uniform_int_distribution, uniform_int_distribution>>; + + Distribution dis(1, 20); vector input; input.reserve(dataCount); @@ -174,6 +178,20 @@ void test_min_max_element(mt19937_64& gen) { } } +void 
test_min_max_element_pointers(mt19937_64& gen) { + const short arr[20]{}; + + uniform_int_distribution dis(0, size(arr) - 1); + + vector input; + input.reserve(dataCount); + test_case_min_max_element(input); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + input.push_back(arr + dis(gen)); + test_case_min_max_element(input); + } +} + template void test_min_max_element_special_cases() { constexpr size_t block_size_in_vectors = 1 << (sizeof(ElementType) * CHAR_BIT); @@ -196,10 +214,10 @@ void test_min_max_element_special_cases() { // same position in different blocks fill(v.begin(), v.end(), ElementType{1}); for (size_t block_pos = 0; block_pos != num_blocks; ++block_pos) { - v.at(block_pos * block_size_in_elements + 20 * VectorSize + 2) = 0; - v.at(block_pos * block_size_in_elements + 20 * VectorSize + 5) = 0; - v.at(block_pos * block_size_in_elements + 25 * VectorSize + 6) = 2; - v.at(block_pos * block_size_in_elements + 25 * VectorSize + 9) = 2; + v[block_pos * block_size_in_elements + 20 * VectorSize + 2] = 0; + v[block_pos * block_size_in_elements + 20 * VectorSize + 5] = 0; + v[block_pos * block_size_in_elements + 25 * VectorSize + 6] = 2; + v[block_pos * block_size_in_elements + 25 * VectorSize + 9] = 2; } assert(min_element(v.begin(), v.end()) == v.begin() + 20 * VectorSize + 2); assert(max_element(v.begin(), v.end()) == v.begin() + 25 * VectorSize + 6); @@ -210,10 +228,10 @@ void test_min_max_element_special_cases() { // same block in different vectors fill(v.begin(), v.end(), ElementType{1}); for (size_t vector_pos = 0; vector_pos != block_size_in_vectors; ++vector_pos) { - v.at(2 * block_size_in_elements + vector_pos * VectorSize + 2) = 0; - v.at(2 * block_size_in_elements + vector_pos * VectorSize + 5) = 0; - v.at(2 * block_size_in_elements + vector_pos * VectorSize + 6) = 2; - v.at(2 * block_size_in_elements + vector_pos * VectorSize + 9) = 2; + v[2 * block_size_in_elements + vector_pos * VectorSize + 2] = 0; + v[2 * 
block_size_in_elements + vector_pos * VectorSize + 5] = 0; + v[2 * block_size_in_elements + vector_pos * VectorSize + 6] = 2; + v[2 * block_size_in_elements + vector_pos * VectorSize + 9] = 2; } assert(min_element(v.begin(), v.end()) == v.begin() + 2 * block_size_in_elements + 2); assert(max_element(v.begin(), v.end()) == v.begin() + 2 * block_size_in_elements + 6); @@ -342,6 +360,8 @@ void test_vector_algorithms() { test_min_max_element(gen); test_min_max_element(gen); + test_min_max_element_pointers(gen); + test_min_max_element_special_cases(); // SSE2 vectors test_min_max_element_special_cases(); // AVX2 vectors test_min_max_element_special_cases(); // AVX512 vectors From f538bbc103b766bb2860d2670edf45f8fa1b2dfd Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 18 Jun 2022 02:20:08 -0700 Subject: [PATCH 57/60] Remove workarounds now that we have 17.3 Preview 2. --- stl/src/vector_algorithms.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 9e20798ba8..c0f01d481d 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -862,7 +862,7 @@ namespace { // Reached end or indices wrap around point. // Compute horizontal min and/or max. Determine horizontal and vertical position of it. 
- if constexpr ((_Mode & _Mode_min) != 0) { // TRANSITION, 17.3 Preview 2 + if constexpr (_Mode & _Mode_min) { const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); // Vector populated by the smallest element const auto _H_min_val = _Traits::_Get_any(_H_min); // Get any element of it @@ -885,7 +885,7 @@ namespace { } } - if constexpr ((_Mode & _Mode_max) != 0) { // TRANSITION, 17.3 Preview 2 + if constexpr (_Mode & _Mode_max) { const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it @@ -946,12 +946,12 @@ namespace { _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - if constexpr ((_Mode & _Mode_min) != 0) { // TRANSITION, 17.3 Preview 2 + if constexpr (_Mode & _Mode_min) { _Cur_vals_min = _Cur_vals; _Cur_idx_min = _mm_setzero_si128(); } - if constexpr ((_Mode & _Mode_max) != 0) { // TRANSITION, 17.3 Preview 2 + if constexpr (_Mode & _Mode_max) { _Cur_vals_max = _Cur_vals; _Cur_idx_max = _mm_setzero_si128(); } @@ -966,14 +966,14 @@ namespace { // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast(_First)), _Sign); - if constexpr ((_Mode & _Mode_min) != 0) { // TRANSITION, 17.3 Preview 2 + if constexpr (_Mode & _Mode_min) { // Looking for the first occurrence of minimum, don't overwrite with newly found occurrences const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals, _Is_less); // Update the current minimum } - if constexpr (_Mode == _Mode_max) { // TRANSITION, 17.3 Preview 2 + if constexpr (_Mode == _Mode_max) { // Looking for the first occurrence of maximum, don't overwrite with newly found occurrences const __m128i 
_Is_greater = _Traits::_Cmp_gt(_Cur_vals, _Cur_vals_max); // _Cur_vals > _Cur_vals_max _Cur_idx_max = From 11724832d428e7fbe5025a123036bc4ada6befec Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 18 Jun 2022 16:03:50 -0700 Subject: [PATCH 58/60] Rename "cor" to "correction". --- stl/src/vector_algorithms.cpp | 41 ++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index c0f01d481d..3666fe39da 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -494,26 +494,26 @@ namespace { template <_Min_max_mode _Mode, class _STy, class _UTy> auto _Minmax_tail(const void* _First, const void* _Last, _Min_max_element_t& _Res, bool _Sign, _UTy _Cur_min, _UTy _Cur_max) noexcept { - constexpr _UTy _Cor = _UTy{1} << (sizeof(_UTy) * 8 - 1); + constexpr _UTy _Correction = _UTy{1} << (sizeof(_UTy) * 8 - 1); if constexpr (_Mode == _Mode_min) { if (_Sign) { return _Min_tail(_First, _Last, _Res._Min, static_cast<_STy>(_Cur_min)); } else { - return _Min_tail(_First, _Last, _Res._Min, static_cast<_UTy>(_Cur_min + _Cor)); + return _Min_tail(_First, _Last, _Res._Min, static_cast<_UTy>(_Cur_min + _Correction)); } } else if constexpr (_Mode == _Mode_max) { if (_Sign) { return _Max_tail(_First, _Last, _Res._Max, static_cast<_STy>(_Cur_max)); } else { - return _Max_tail(_First, _Last, _Res._Max, static_cast<_UTy>(_Cur_max + _Cor)); + return _Max_tail(_First, _Last, _Res._Max, static_cast<_UTy>(_Cur_max + _Correction)); } } else { if (_Sign) { return _Both_tail(_First, _Last, _Res, static_cast<_STy>(_Cur_min), static_cast<_STy>(_Cur_max)); } else { - return _Both_tail( - _First, _Last, _Res, static_cast<_UTy>(_Cur_min + _Cor), static_cast<_UTy>(_Cur_max + _Cor)); + return _Both_tail(_First, _Last, _Res, static_cast<_UTy>(_Cur_min + _Correction), + static_cast<_UTy>(_Cur_max + _Correction)); } } } @@ -528,10 +528,10 @@ namespace { static constexpr 
_Signed_t _Init_min_val = static_cast<_Signed_t>(0x7F); static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x80); - static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { - alignas(16) static constexpr _Unsigned_t _Sign_cors[2][16] = { + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) noexcept { + alignas(16) static constexpr _Unsigned_t _Sign_corrections[2][16] = { {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, {}}; - return _mm_sub_epi8(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_cors[_Sign]))); + return _mm_sub_epi8(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_corrections[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -602,10 +602,10 @@ namespace { static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7FFF); static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x8000); - static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { - alignas(16) static constexpr _Unsigned_t _Sign_cors[2][8] = { + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) noexcept { + alignas(16) static constexpr _Unsigned_t _Sign_corrections[2][8] = { 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, {}}; - return _mm_sub_epi16(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_cors[_Sign]))); + return _mm_sub_epi16(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_corrections[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -681,10 +681,10 @@ namespace { static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7FFF'FFFFUL); static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x8000'0000UL); - static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) noexcept { - alignas(16) static constexpr _Unsigned_t _Sign_cors[2][4] = { + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) noexcept { + alignas(16) static constexpr _Unsigned_t
_Sign_corrections[2][4] = { 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, 0x8000'0000UL, {}}; - return _mm_sub_epi32(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_cors[_Sign]))); + return _mm_sub_epi32(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_corrections[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -751,10 +751,10 @@ namespace { static constexpr _Signed_t _Init_min_val = static_cast<_Signed_t>(0x7FFF'FFFF'FFFF'FFFFULL); static constexpr _Signed_t _Init_max_val = static_cast<_Signed_t>(0x8000'0000'0000'0000ULL); - static __m128i _Sign_cor(const __m128i _Val, const bool _Sign) { - alignas(16) static constexpr _Unsigned_t _Sign_cors[2][2] = { + static __m128i _Sign_correction(const __m128i _Val, const bool _Sign) { + alignas(16) static constexpr _Unsigned_t _Sign_corrections[2][2] = { 0x8000'0000'0000'0000ULL, 0x8000'0000'0000'0000ULL, {}}; - return _mm_sub_epi64(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_cors[_Sign]))); + return _mm_sub_epi64(_Val, _mm_load_si128(reinterpret_cast<const __m128i*>(_Sign_corrections[_Sign]))); } static __m128i _Inc(__m128i _Idx) noexcept { @@ -845,7 +845,8 @@ namespace { _Advance_bytes(_Stop_at, _Portion_byte_size); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) - __m128i _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); + __m128i _Cur_vals = + _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); __m128i _Cur_vals_min = _Cur_vals; // vector of vertical minimum values __m128i _Cur_idx_min = _mm_setzero_si128(); // vector of vertical minimum indices __m128i _Cur_vals_max = _Cur_vals; // vector of vertical maximum values @@ -944,7 +945,7 @@ namespace { _Base = static_cast<const char*>(_First); // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = - _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); + _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); if constexpr
(_Mode & _Mode_min) { _Cur_vals_min = _Cur_vals; @@ -964,7 +965,7 @@ namespace { // This is the main part, finding vertical minimum/maximum // Load values and if unsigned adjust them to be signed (for signed vector comparisons) - _Cur_vals = _Traits::_Sign_cor(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); + _Cur_vals = _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); if constexpr (_Mode & _Mode_min) { // Looking for the first occurrence of minimum, don't overwrite with newly found occurrences From f56642679b7bdeca8edcaeae91c43993216783c9 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 18 Jun 2022 16:11:15 -0700 Subject: [PATCH 59/60] Remove CUDA guards. We believe that CUDA 11.6 supports `__builtin_is_constant_evaluated`. --- stl/inc/algorithm | 12 ++++++------ stl/inc/xtr1common | 2 -- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index c0cfe0e3ca..3b650e89b6 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -9386,7 +9386,7 @@ _INLINE_VAR constexpr bool _Is_min_max_optimization_safe = // Activate the vecto template <class _FwdIt, class _Pr> constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element -#if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) +#if _USE_STD_VECTOR_ALGORITHMS if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); @@ -9398,7 +9398,7 @@ constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) +#endif // _USE_STD_VECTOR_ALGORITHMS _FwdIt _Found = _First; if (_First != _Last) { @@ -9491,7 +9491,7 @@ namespace ranges { template <class _FwdIt, class _Pr> constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find smallest element -#if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) +#if _USE_STD_VECTOR_ALGORITHMS if constexpr
(_Is_min_max_optimization_safe<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); @@ -9503,7 +9503,7 @@ constexpr _FwdIt _Min_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) } } } -#endif // _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) +#endif // _USE_STD_VECTOR_ALGORITHMS _FwdIt _Found = _First; if (_First != _Last) { @@ -9596,7 +9596,7 @@ namespace ranges { template <class _FwdIt, class _Pr> constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { -#if _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) +#if _USE_STD_VECTOR_ALGORITHMS if constexpr (_Is_min_max_optimization_safe<_FwdIt, _Pr>) { if (!_Is_constant_evaluated()) { const auto _First_ptr = _To_address(_First); @@ -9608,7 +9608,7 @@ constexpr pair<_FwdIt, _FwdIt> _Minmax_element_unchecked(_FwdIt _First, _FwdIt _ } } } -#endif // _USE_STD_VECTOR_ALGORITHMS && !defined(__CUDACC__) +#endif // _USE_STD_VECTOR_ALGORITHMS // find smallest and largest elements pair<_FwdIt, _FwdIt> _Found(_First, _First); diff --git a/stl/inc/xtr1common b/stl/inc/xtr1common index 4dce7467bd..b92134c6c6 100644 --- a/stl/inc/xtr1common +++ b/stl/inc/xtr1common @@ -165,11 +165,9 @@ template <class _Ty, class... _Types> _INLINE_VAR constexpr bool _Is_any_of_v = // true if and only if _Ty is in _Types disjunction_v<is_same<_Ty, _Types>...>; -#ifndef __CUDACC__ _NODISCARD constexpr bool _Is_constant_evaluated() noexcept { // Internal function for any standard mode return __builtin_is_constant_evaluated(); } -#endif // __CUDACC__ #if _HAS_CXX20 _NODISCARD constexpr bool is_constant_evaluated() noexcept { From d1783033e6780403e110cc38e652eebd13394f00 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 18 Jun 2022 19:58:05 -0700 Subject: [PATCH 60/60] Restore perma-workarounds.
--- stl/src/vector_algorithms.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 3666fe39da..65855e9aba 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -863,7 +863,7 @@ namespace { // Reached end or indices wrap around point. // Compute horizontal min and/or max. Determine horizontal and vertical position of it. - if constexpr (_Mode & _Mode_min) { + if constexpr ((_Mode & _Mode_min) != 0) { const __m128i _H_min = _Traits::_H_min(_Cur_vals_min); // Vector populated by the smallest element const auto _H_min_val = _Traits::_Get_any(_H_min); // Get any element of it @@ -886,7 +886,7 @@ namespace { } } - if constexpr (_Mode & _Mode_max) { + if constexpr ((_Mode & _Mode_max) != 0) { const __m128i _H_max = _Traits::_H_max(_Cur_vals_max); // Vector populated by the largest element const auto _H_max_val = _Traits::_Get_any(_H_max); // Get any element of it @@ -947,12 +947,12 @@ namespace { _Cur_vals = _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); - if constexpr (_Mode & _Mode_min) { + if constexpr ((_Mode & _Mode_min) != 0) { _Cur_vals_min = _Cur_vals; _Cur_idx_min = _mm_setzero_si128(); } - if constexpr (_Mode & _Mode_max) { + if constexpr ((_Mode & _Mode_max) != 0) { _Cur_vals_max = _Cur_vals; _Cur_idx_max = _mm_setzero_si128(); } @@ -967,7 +967,7 @@ namespace { // Load values and if unsigned adjust them to be signed (for signed vector comparisons) _Cur_vals = _Traits::_Sign_correction(_mm_loadu_si128(reinterpret_cast<const __m128i*>(_First)), _Sign); - if constexpr (_Mode & _Mode_min) { + if constexpr ((_Mode & _Mode_min) != 0) { // Looking for the first occurrence of minimum, don't overwrite with newly found occurrences const __m128i _Is_less = _Traits::_Cmp_gt(_Cur_vals_min, _Cur_vals); // _Cur_vals < _Cur_vals_min _Cur_idx_min = _mm_blendv_epi8(_Cur_idx_min, _Cur_idx, _Is_less); // Remember their vertical indices