From 30683827382d2f75c0cdbfd5c8fe26bac7bf50ac Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 18 May 2025 13:49:46 +0300 Subject: [PATCH 1/8] test deeper! --- .../tests/VSO_0000000_vector_algorithms/test.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 1232bde41a..0e5025cabd 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -778,17 +778,17 @@ void test_case_rotate( } template -void test_rotate(mt19937_64& gen) { +void test_rotate(mt19937_64& gen, size_t data_count = dataCount) { vector actual; vector actual_r; vector expected; vector tmp; - actual.reserve(dataCount); - actual_r.reserve(dataCount); - expected.reserve(dataCount); - tmp.reserve(dataCount); + actual.reserve(data_count); + actual_r.reserve(data_count); + expected.reserve(data_count); + tmp.reserve(data_count); test_case_rotate(actual, actual_r, expected, 0, tmp); - for (size_t attempts = 0; attempts < dataCount; ++attempts) { + for (size_t attempts = 0; attempts < data_count; ++attempts) { const T val = static_cast(gen()); // intentionally narrows actual.push_back(val); actual_r.push_back(val); @@ -1241,7 +1241,7 @@ void test_vector_algorithms(mt19937_64& gen) { test_reverse_copy(gen); test_reverse_copy(gen); - test_rotate(gen); + test_rotate(gen, 20000); // one real long rotate run, as for smaller arrays some strategies aren't executed test_rotate(gen); test_rotate(gen); test_rotate(gen); From 7b482be6eff87bcadec5e24d13bbcb27e7696727 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 18 May 2025 11:54:22 +0300 Subject: [PATCH 2/8] move a little closer --- stl/src/vector_algorithms.cpp | 202 +++++++++++++++++----------------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index fb20d8d71c..68267b9e4a 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -163,6 +163,107 @@ void* __cdecl __std_swap_ranges_trivially_swappable( } // extern "C" +namespace { + namespace _Rotating { + // TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs": + // As a workaround, the following code calls memmove() for 8 KB portions. + constexpr size_t _Portion_size = 8192; + constexpr size_t _Portion_mask = _Portion_size - 1; + static_assert((_Portion_size & _Portion_mask) == 0); + + void _Move_to_lower_address(void* _Dest, const void* _Src, const size_t _Size) noexcept { + const size_t _Whole_portions_size = _Size & ~_Portion_mask; + + void* _Dest_end = _Dest; + _Advance_bytes(_Dest_end, _Whole_portions_size); + + while (_Dest != _Dest_end) { + memmove(_Dest, _Src, _Portion_size); + _Advance_bytes(_Dest, _Portion_size); + _Advance_bytes(_Src, _Portion_size); + } + + if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) { + memmove(_Dest, _Src, _Tail); + } + } + + void _Move_to_higher_address(void* const _Dest, const void* const _Src, const size_t _Size) noexcept { + const size_t _Whole_portions_size = _Size & ~_Portion_mask; + + void* _Dest_end = _Dest; + _Advance_bytes(_Dest_end, _Whole_portions_size); + const void* _Src_end = _Src; + _Advance_bytes(_Src_end, _Whole_portions_size); + + if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) { + memmove(_Dest_end, _Src_end, _Tail); + } + + while (_Dest_end != _Dest) { + _Rewind_bytes(_Dest_end, _Portion_size); + _Rewind_bytes(_Src_end, _Portion_size); + memmove(_Dest_end, _Src_end, _Portion_size); + } + } + + constexpr size_t _Buf_size = 512; + + bool _Use_buffer(const size_t _Smaller, const size_t _Larger) noexcept { + return _Smaller <= _Buf_size && (_Smaller <= 128 || _Larger >= _Smaller * 2); + } + } // namespace _Rotating +} // unnamed namespace + +extern "C" { + +__declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, void* _Last) noexcept { + unsigned char _Buf[_Rotating::_Buf_size]; + + for (;;) { + const size_t _Left = _Byte_length(_First, _Mid); + const size_t _Right = _Byte_length(_Mid, _Last); + + if (_Left <= _Right) { + if (_Left == 0) { + break; + } + + if (_Rotating::_Use_buffer(_Left, _Right)) { + memcpy(_Buf, _First, _Left); + _Rotating::_Move_to_lower_address(_First, _Mid, _Right); + _Advance_bytes(_First, _Right); + memcpy(_First, _Buf, _Left); + break; + } + + void* _Mid2 = _Last; + _Rewind_bytes(_Mid2, _Left); + __std_swap_ranges_trivially_swappable_noalias(_Mid2, _Last, _First); + _Last = _Mid2; + } else { + if (_Right == 0) { + break; + } + + if (_Rotating::_Use_buffer(_Right, _Left)) { + _Rewind_bytes(_Last, _Right); + memcpy(_Buf, _Last, _Right); + void* _Mid2 = _First; + _Advance_bytes(_Mid2, _Right); + _Rotating::_Move_to_higher_address(_Mid2, _First, _Left); + memcpy(_First, _Buf, _Right); + break; + } + + __std_swap_ranges_trivially_swappable_noalias(_Mid, _Last, _First); + _Advance_bytes(_First, _Right); + } + } +} + +} // extern "C" + namespace { namespace _Reversing { #ifdef _M_ARM64EC @@ -382,107 +483,6 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( } // extern "C" -namespace { - namespace _Rotating { - // TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs": - // As a workaround, the following code calls memmove() for 8 KB portions. - constexpr size_t _Portion_size = 8192; - constexpr size_t _Portion_mask = _Portion_size - 1; - static_assert((_Portion_size & _Portion_mask) == 0); - - void _Move_to_lower_address(void* _Dest, const void* _Src, const size_t _Size) noexcept { - const size_t _Whole_portions_size = _Size & ~_Portion_mask; - - void* _Dest_end = _Dest; - _Advance_bytes(_Dest_end, _Whole_portions_size); - - while (_Dest != _Dest_end) { - memmove(_Dest, _Src, _Portion_size); - _Advance_bytes(_Dest, _Portion_size); - _Advance_bytes(_Src, _Portion_size); - } - - if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) { - memmove(_Dest, _Src, _Tail); - } - } - - void _Move_to_higher_address(void* const _Dest, const void* const _Src, const size_t _Size) noexcept { - const size_t _Whole_portions_size = _Size & ~_Portion_mask; - - void* _Dest_end = _Dest; - _Advance_bytes(_Dest_end, _Whole_portions_size); - const void* _Src_end = _Src; - _Advance_bytes(_Src_end, _Whole_portions_size); - - if (const size_t _Tail = _Size - _Whole_portions_size; _Tail != 0) { - memmove(_Dest_end, _Src_end, _Tail); - } - - while (_Dest_end != _Dest) { - _Rewind_bytes(_Dest_end, _Portion_size); - _Rewind_bytes(_Src_end, _Portion_size); - memmove(_Dest_end, _Src_end, _Portion_size); - } - } - - constexpr size_t _Buf_size = 512; - - bool _Use_buffer(const size_t _Smaller, const size_t _Larger) noexcept { - return _Smaller <= _Buf_size && (_Smaller <= 128 || _Larger >= _Smaller * 2); - } - } // namespace _Rotating -} // unnamed namespace - -extern "C" { - -__declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, void* _Last) noexcept { - unsigned char _Buf[_Rotating::_Buf_size]; - - for (;;) { - const size_t _Left = _Byte_length(_First, _Mid); - const size_t _Right = _Byte_length(_Mid, _Last); - - if (_Left <= _Right) { - if (_Left == 0) { - break; - } - - if (_Rotating::_Use_buffer(_Left, _Right)) { - memcpy(_Buf, _First, _Left); - _Rotating::_Move_to_lower_address(_First, _Mid, _Right); - _Advance_bytes(_First, _Right); - memcpy(_First, _Buf, _Left); - break; - } - - void* _Mid2 = _Last; - _Rewind_bytes(_Mid2, _Left); - __std_swap_ranges_trivially_swappable_noalias(_Mid2, _Last, _First); - _Last = _Mid2; - } else { - if (_Right == 0) { - break; - } - - if (_Rotating::_Use_buffer(_Right, _Left)) { - _Rewind_bytes(_Last, _Right); - memcpy(_Buf, _Last, _Right); - void* _Mid2 = _First; - _Advance_bytes(_Mid2, _Right); - _Rotating::_Move_to_higher_address(_Mid2, _First, _Left); - memcpy(_First, _Buf, _Right); - break; - } - - __std_swap_ranges_trivially_swappable_noalias(_Mid, _Last, _First); - _Advance_bytes(_First, _Right); - } - } -} - -} // extern "C" - namespace { namespace _Sorting { enum _Min_max_mode { From 99cafac9595d868cb5e7e27987925c4423a406bb Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 18 May 2025 14:04:25 +0300 Subject: [PATCH 3/8] Three-way --- stl/src/vector_algorithms.cpp | 117 ++++++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 7 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 68267b9e4a..9f4e179950 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -148,9 +148,9 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias( const auto _Last1c = static_cast(_Last1); auto _First2c = static_cast(_First2); for (; _First1c != _Last1c; ++_First1c, ++_First2c) { - unsigned char _Ch = *_First1c; - *_First1c = *_First2c; - *_First2c = _Ch; + const unsigned char _Ch = *_First1c; + *_First1c = *_First2c; + *_First2c = _Ch; } } @@ -165,6 +165,95 @@ void* __cdecl __std_swap_ranges_trivially_swappable( namespace { namespace _Rotating { + void __cdecl _Swap_ranges_3_way(void* _First1, void* const _Last1, void* _First2, void* _First3) noexcept { +#ifndef _M_ARM64EC + constexpr size_t _Mask_32 = ~((static_cast(1) << 5) - 1); + if (_Byte_length(_First1, _Last1) >= 32 && _Use_avx2()) { + const void* _Stop_at = _First1; + _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_32); + do { + const __m256i _Val1 = _mm256_loadu_si256(static_cast<__m256i*>(_First1)); + const __m256i _Val2 = _mm256_loadu_si256(static_cast<__m256i*>(_First2)); + const __m256i _Val3 = _mm256_loadu_si256(static_cast<__m256i*>(_First3)); + _mm256_storeu_si256(static_cast<__m256i*>(_First1), _Val2); + _mm256_storeu_si256(static_cast<__m256i*>(_First2), _Val3); + _mm256_storeu_si256(static_cast<__m256i*>(_First3), _Val1); + _Advance_bytes(_First1, 32); + _Advance_bytes(_First2, 32); + _Advance_bytes(_First3, 32); + } while (_First1 != _Stop_at); + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } + + constexpr size_t _Mask_16 = ~((static_cast(1) << 4) - 1); + if (_Byte_length(_First1, _Last1) >= 16 && _Use_sse42()) { + const void* _Stop_at = _First1; + _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_16); + do { + const __m128i _Val1 = _mm_loadu_si128(static_cast<__m128i*>(_First1)); + const __m128i _Val2 = _mm_loadu_si128(static_cast<__m128i*>(_First2)); + const __m128i _Val3 = _mm_loadu_si128(static_cast<__m128i*>(_First3)); + _mm_storeu_si128(static_cast<__m128i*>(_First1), _Val2); + _mm_storeu_si128(static_cast<__m128i*>(_First2), _Val3); + _mm_storeu_si128(static_cast<__m128i*>(_First3), _Val1); + _Advance_bytes(_First1, 16); + _Advance_bytes(_First2, 16); + _Advance_bytes(_First3, 16); + } while (_First1 != _Stop_at); + } + +#if defined(_M_X64) // NOTE: UNALIGNED MEMORY ACCESSES + constexpr size_t _Mask_8 = ~((static_cast(1) << 3) - 1); + if (_Byte_length(_First1, _Last1) >= 8) { + const void* _Stop_at = _First1; + _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_8); + do { + const unsigned long long _Val1 = *static_cast(_First1); + const unsigned long long _Val2 = *static_cast(_First2); + const unsigned long long _Val3 = *static_cast(_First3); + *static_cast(_First1) = _Val2; + *static_cast(_First2) = _Val3; + *static_cast(_First3) = _Val1; + _Advance_bytes(_First1, 8); + _Advance_bytes(_First2, 8); + _Advance_bytes(_First3, 8); + } while (_First1 != _Stop_at); + } +#elif defined(_M_IX86) // NOTE: UNALIGNED MEMORY ACCESSES + constexpr size_t _Mask_4 = ~((static_cast(1) << 2) - 1); + if (_Byte_length(_First1, _Last1) >= 4) { + const void* _Stop_at = _First1; + _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_4); + do { + const unsigned long _Val1 = *static_cast(_First1); + const unsigned long _Val2 = *static_cast(_First2); + const unsigned long _Val3 = *static_cast(_First3); + *static_cast(_First1) = _Val2; + *static_cast(_First2) = _Val3; + *static_cast(_First3) = _Val1; + _Advance_bytes(_First1, 4); + _Advance_bytes(_First2, 4); + _Advance_bytes(_First3, 4); + } while (_First1 != _Stop_at); + } +#else +#error Unsupported architecture +#endif +#endif // ^^^ !defined(_M_ARM64EC) ^^^ + + auto _First1c = static_cast(_First1); + auto _First2c = static_cast(_First2); + auto _First3c = static_cast(_First3); + for (; _First1c != _Last1; ++_First1c, ++_First2c, ++_First3c) { + const unsigned char _Ch = *_First1c; + *_First1c = *_First2c; + *_First2c = *_First3c; + *_First3c = _Ch; + } + } + + // TRANSITION, GH-5506 "VCRuntime: memmove() is surprisingly slow for more than 8 KB on certain CPUs": // As a workaround, the following code calls memmove() for 8 KB portions. constexpr size_t _Portion_size = 8192; @@ -239,8 +328,15 @@ __declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, void* _Mid2 = _Last; _Rewind_bytes(_Mid2, _Left); - __std_swap_ranges_trivially_swappable_noalias(_Mid2, _Last, _First); - _Last = _Mid2; + if (_Left * 2 > _Right) { + __std_swap_ranges_trivially_swappable_noalias(_Mid2, _Last, _First); + _Last = _Mid2; + } else { + void* _Mid3 = _Mid2; + _Rewind_bytes(_Mid3, _Left); + _Rotating::_Swap_ranges_3_way(_Mid2, _Last, _First, _Mid3); + _Last = _Mid3; + } } else { if (_Right == 0) { break; @@ -256,8 +352,15 @@ __declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, break; } - __std_swap_ranges_trivially_swappable_noalias(_Mid, _Last, _First); - _Advance_bytes(_First, _Right); + if (_Right * 2 > _Left) { + __std_swap_ranges_trivially_swappable_noalias(_Mid, _Last, _First); + _Advance_bytes(_First, _Right); + } else { + void* _Mid2 = _First; + _Advance_bytes(_Mid2, _Right); + _Rotating::_Swap_ranges_3_way(_Mid, _Last, _Mid2, _First); + _Advance_bytes(_First, _Right * 2); + } } } } From c89f5bae9d33cac133ba473f21960cf7b6f81492 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 18 May 2025 17:10:37 +0300 Subject: [PATCH 4/8] benchmark more --- benchmarks/src/rotate.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/src/rotate.cpp b/benchmarks/src/rotate.cpp index 9ba7c4b898..c0780aaf6e 100644 --- a/benchmarks/src/rotate.cpp +++ b/benchmarks/src/rotate.cpp @@ -55,4 +55,6 @@ BENCHMARK(bm_rotate)->Apply(common_args); BENCHMARK(bm_rotate)->Apply(common_args); BENCHMARK(bm_rotate)->Apply(common_args); +BENCHMARK(bm_rotate)->Args({35000, 520})->Args({35000, 3000}); + BENCHMARK_MAIN(); From bcf926884c8577aeb540636f7ff2336136d151a2 Mon Sep 17 00:00:00 2001 From: Alex Guteniev Date: Sun, 18 May 2025 18:31:29 +0300 Subject: [PATCH 5/8] this is not a spaceship --- stl/src/vector_algorithms.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 9f4e179950..8a8252c257 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -165,7 +165,7 @@ void* __cdecl __std_swap_ranges_trivially_swappable( namespace { namespace _Rotating { - void __cdecl _Swap_ranges_3_way(void* _First1, void* const _Last1, void* _First2, void* _First3) noexcept { + void __cdecl _Swap_3_ranges(void* _First1, void* const _Last1, void* _First2, void* _First3) noexcept { #ifndef _M_ARM64EC constexpr size_t _Mask_32 = ~((static_cast(1) << 5) - 1); if (_Byte_length(_First1, _Last1) >= 32 && _Use_avx2()) { @@ -334,7 +334,7 @@ __declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, } else { void* _Mid3 = _Mid2; _Rewind_bytes(_Mid3, _Left); - _Rotating::_Swap_ranges_3_way(_Mid2, _Last, _First, _Mid3); + _Rotating::_Swap_3_ranges(_Mid2, _Last, _First, _Mid3); _Last = _Mid3; } } else { @@ -358,7 +358,7 @@ __declspec(noalias) void __stdcall __std_rotate(void* _First, void* const _Mid, } else { void* _Mid2 = _First; _Advance_bytes(_Mid2, _Right); - _Rotating::_Swap_ranges_3_way(_Mid, _Last, _Mid2, _First); + _Rotating::_Swap_3_ranges(_Mid, _Last, _Mid2, _First); _Advance_bytes(_First, _Right * 2); } } From 553d27ddce2b8f21a6e58159b1838d6ab4d961da Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 22 May 2025 01:45:10 -0700 Subject: [PATCH 6/8] Add const. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 0e5025cabd..ed1ca6a12c 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -778,7 +778,7 @@ void test_case_rotate( } template -void test_rotate(mt19937_64& gen, size_t data_count = dataCount) { +void test_rotate(mt19937_64& gen, const size_t data_count = dataCount) { vector actual; vector actual_r; vector expected; From 0821f56f32990c6d3dadb6d3a79f435b1921cb75 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 22 May 2025 01:48:27 -0700 Subject: [PATCH 7/8] Drop `__cdecl`. --- stl/src/vector_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 8a8252c257..7622221aa2 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -165,7 +165,7 @@ void* __cdecl __std_swap_ranges_trivially_swappable( namespace { namespace _Rotating { - void __cdecl _Swap_3_ranges(void* _First1, void* const _Last1, void* _First2, void* _First3) noexcept { + void _Swap_3_ranges(void* _First1, void* const _Last1, void* _First2, void* _First3) noexcept { #ifndef _M_ARM64EC constexpr size_t _Mask_32 = ~((static_cast(1) << 5) - 1); if (_Byte_length(_First1, _Last1) >= 32 && _Use_avx2()) { From 9656a7f8ab5a0f3f712339dbbd553651e93f1cdd Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 22 May 2025 02:08:01 -0700 Subject: [PATCH 8/8] Backport direct comparison to `_Last1`. --- stl/src/vector_algorithms.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 7622221aa2..5f1f0df9cd 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -144,10 +144,9 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias( #endif #endif // ^^^ !defined(_M_ARM64EC) ^^^ - auto _First1c = static_cast(_First1); - const auto _Last1c = static_cast(_Last1); - auto _First2c = static_cast(_First2); - for (; _First1c != _Last1c; ++_First1c, ++_First2c) { + auto _First1c = static_cast(_First1); + auto _First2c = static_cast(_First2); + for (; _First1c != _Last1; ++_First1c, ++_First2c) { const unsigned char _Ch = *_First1c; *_First1c = *_First2c; *_First2c = _Ch;