diff --git a/benchmarks/src/remove.cpp b/benchmarks/src/remove.cpp index f0d28f6d73..92ca87534a 100644 --- a/benchmarks/src/remove.cpp +++ b/benchmarks/src/remove.cpp @@ -26,6 +26,21 @@ void r(benchmark::State& state) { } } +template +void rc(benchmark::State& state) { + std::vector src(lorem_ipsum.begin(), lorem_ipsum.end()); + std::vector v(lorem_ipsum.size()); + for (auto _ : state) { + benchmark::DoNotOptimize(src); + benchmark::DoNotOptimize(v); + if constexpr (Type == alg_type::std_fn) { + benchmark::DoNotOptimize(std::remove_copy(src.begin(), src.end(), v.begin(), T{'l'})); + } else { + benchmark::DoNotOptimize(std::ranges::remove_copy(src, v.begin(), T{'l'})); + } + } +} + BENCHMARK(r); BENCHMARK(r); BENCHMARK(r); @@ -36,4 +51,14 @@ BENCHMARK(r); BENCHMARK(r); BENCHMARK(r); +BENCHMARK(rc); +BENCHMARK(rc); +BENCHMARK(rc); +BENCHMARK(rc); + +BENCHMARK(rc); +BENCHMARK(rc); +BENCHMARK(rc); +BENCHMARK(rc); + BENCHMARK_MAIN(); diff --git a/benchmarks/src/unique.cpp b/benchmarks/src/unique.cpp index 2ff40a902f..4baef63aa7 100644 --- a/benchmarks/src/unique.cpp +++ b/benchmarks/src/unique.cpp @@ -37,6 +37,27 @@ void u(benchmark::State& state) { } } +template +void uc(benchmark::State& state) { + std::mt19937_64 gen(22033); + using TD = std::conditional_t; + std::binomial_distribution dis(5); + + std::vector> src(2552); + std::generate(src.begin(), src.end(), [&] { return static_cast(dis(gen)); }); + + std::vector> v(src.size()); + for (auto _ : state) { + benchmark::DoNotOptimize(src); + benchmark::DoNotOptimize(v); + if constexpr (Type == alg_type::std_fn) { + benchmark::DoNotOptimize(std::unique_copy(src.begin(), src.end(), v.begin())); + } else { + benchmark::DoNotOptimize(std::ranges::unique_copy(src, v.begin())); + } + } +} + BENCHMARK(u); BENCHMARK(u); BENCHMARK(u); @@ -47,4 +68,14 @@ BENCHMARK(u); BENCHMARK(u); BENCHMARK(u); +BENCHMARK(uc); +BENCHMARK(uc); +BENCHMARK(uc); +BENCHMARK(uc); + +BENCHMARK(uc); +BENCHMARK(uc); +BENCHMARK(uc); +BENCHMARK(uc); + BENCHMARK_MAIN(); diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 566a097aba..5da863e79a 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -85,10 +85,20 @@ const void* __stdcall __std_search_n_2(const void* _First, const void* _Last, si const void* __stdcall __std_search_n_4(const void* _First, const void* _Last, size_t _Count, uint32_t _Value) noexcept; const void* __stdcall __std_search_n_8(const void* _First, const void* _Last, size_t _Count, uint64_t _Value) noexcept; +void* __stdcall __std_remove_copy_1(const void* _First, const void* _Last, void* _Out, uint8_t _Val) noexcept; +void* __stdcall __std_remove_copy_2(const void* _First, const void* _Last, void* _Out, uint16_t _Val) noexcept; +void* __stdcall __std_remove_copy_4(const void* _First, const void* _Last, void* _Out, uint32_t _Val) noexcept; +void* __stdcall __std_remove_copy_8(const void* _First, const void* _Last, void* _Out, uint64_t _Val) noexcept; + void* __stdcall __std_unique_1(void* _First, void* _Last) noexcept; void* __stdcall __std_unique_2(void* _First, void* _Last) noexcept; void* __stdcall __std_unique_4(void* _First, void* _Last) noexcept; void* __stdcall __std_unique_8(void* _First, void* _Last) noexcept; + +void* __stdcall __std_unique_copy_1(const void* _First, const void* _Last, void* _Dest) noexcept; +void* __stdcall __std_unique_copy_2(const void* _First, const void* _Last, void* _Dest) noexcept; +void* __stdcall __std_unique_copy_4(const void* _First, const void* _Last, void* _Dest) noexcept; +void* __stdcall __std_unique_copy_8(const void* _First, const void* _Last, void* _Dest) noexcept; } // extern "C" _STD_BEGIN @@ -257,6 +267,43 @@ _Ty* _Unique_vectorized(_Ty* const _First, _Ty* const _Last) noexcept { } } +template +_Ty* _Remove_copy_vectorized( + const _Ty* const _First, const _Ty* const _Last, _Ty* const _Dest, const _TVal _Val) noexcept { + if constexpr (is_pointer_v<_Ty>) { +#ifdef _WIN64 + return reinterpret_cast<_Ty*>(::__std_remove_copy_8(_First, _Last, _Dest, reinterpret_cast(_Val))); +#else // ^^^ defined(_WIN64) / !defined(_WIN64) vvv + return reinterpret_cast<_Ty*>(::__std_remove_copy_4(_First, _Last, _Dest, reinterpret_cast(_Val))); +#endif // ^^^ !defined(_WIN64) ^^^ + } else if constexpr (sizeof(_Ty) == 1) { + return reinterpret_cast<_Ty*>(::__std_remove_copy_1(_First, _Last, _Dest, static_cast(_Val))); + } else if constexpr (sizeof(_Ty) == 2) { + return reinterpret_cast<_Ty*>(::__std_remove_copy_2(_First, _Last, _Dest, static_cast(_Val))); + } else if constexpr (sizeof(_Ty) == 4) { + return reinterpret_cast<_Ty*>(::__std_remove_copy_4(_First, _Last, _Dest, static_cast(_Val))); + } else if constexpr (sizeof(_Ty) == 8) { + return reinterpret_cast<_Ty*>(::__std_remove_copy_8(_First, _Last, _Dest, static_cast(_Val))); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size + } +} + +template +_Ty* _Unique_copy_vectorized(const _Ty* const _First, const _Ty* const _Last, _Ty* const _Dest) noexcept { + if constexpr (sizeof(_Ty) == 1) { + return reinterpret_cast<_Ty*>(::__std_unique_copy_1(_First, _Last, _Dest)); + } else if constexpr (sizeof(_Ty) == 2) { + return reinterpret_cast<_Ty*>(::__std_unique_copy_2(_First, _Last, _Dest)); + } else if constexpr (sizeof(_Ty) == 4) { + return reinterpret_cast<_Ty*>(::__std_unique_copy_4(_First, _Last, _Dest)); + } else if constexpr (sizeof(_Ty) == 8) { + return reinterpret_cast<_Ty*>(::__std_unique_copy_8(_First, _Last, _Dest)); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size + } +} + // Can we activate the vector algorithms for find_first_of? template constexpr bool _Vector_alg_in_find_first_of_is_safe = _Equal_memcmp_is_safe<_It1, _It2, _Pr>; @@ -282,6 +329,17 @@ constexpr bool _Vector_alg_in_search_n_is_safe = _Vector_alg_in_find_is_safe<_It // Can we activate the vector algorithms for unique? template constexpr bool _Vector_alg_in_unique_is_safe = _Equal_memcmp_is_safe<_Iter, _Iter, _Pr>; + +// Can we use this output iterator for remove_copy or unique_copy? +template +constexpr bool _Output_iterator_for_vector_alg_is_safe() { + if constexpr (_Iterator_is_contiguous<_Out>) { + return is_same_v<_Iter_value_t<_Out>, remove_const_t<_Iter_value_t<_In>>>; + } else { + return false; + } +} + _STD_END #endif // _USE_STD_VECTOR_ALGORITHMS @@ -4718,6 +4776,33 @@ _CONSTEXPR20 _OutIt remove_copy(_InIt _First, _InIt _Last, _OutIt _Dest, const _ auto _UFirst = _STD _Get_unwrapped(_First); const auto _ULast = _STD _Get_unwrapped(_Last); auto _UDest = _STD _Get_unwrapped_unverified(_Dest); + +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Vector_alg_in_find_is_safe + && _Output_iterator_for_vector_alg_is_safe()) { + if (!_STD _Is_constant_evaluated()) { + if (!_STD _Could_compare_equal_to_value_type(_Val)) { + _UDest = _STD _Copy_unchecked(_UFirst, _ULast, _UDest); + _STD _Seek_wrapped(_Dest, _UDest); + return _Dest; + } + + const auto _Dest_ptr = _STD _To_address(_UDest); + const auto _Result = + _STD _Remove_copy_vectorized(_STD _To_address(_UFirst), _STD _To_address(_ULast), _Dest_ptr, _Val); + + if constexpr (is_pointer_v) { + _UDest = _Result; + } else { + _UDest += _Result - _Dest_ptr; + } + + _STD _Seek_wrapped(_Dest, _UDest); + return _Dest; + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + for (; _UFirst != _ULast; ++_UFirst) { if (!(*_UFirst == _Val)) { *_UDest = *_UFirst; @@ -4943,6 +5028,31 @@ namespace ranges { _STL_INTERNAL_STATIC_ASSERT(indirectly_copyable<_It, _Out>); _STL_INTERNAL_STATIC_ASSERT(indirect_binary_predicate, const _Ty*>); +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Vector_alg_in_find_is_safe<_It, _Ty> && _Output_iterator_for_vector_alg_is_safe<_Out, _It>() + && sized_sentinel_for<_Se, _It> && is_same_v<_Pj, identity>) { + if (!_STD is_constant_evaluated()) { + const auto _Size = _Last - _First; + auto _End = _First + _Size; + + if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) { + _Output = _STD _Copy_unchecked(_First, _Last, _Output); + return {_STD move(_End), _STD move(_Output)}; + } + + const auto _Dest_ptr = _STD to_address(_Output); + const auto _Result = + _STD _Remove_copy_vectorized(_STD to_address(_First), _STD to_address(_End), _Dest_ptr, _Val); + + if constexpr (is_pointer_v<_Out>) { + return {_STD move(_End), _Result}; + } else { + return {_STD move(_End), _STD move(_Output) + (_Result - _Dest_ptr)}; + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + for (; _First != _Last; ++_First) { if (_STD invoke(_Proj, *_First) != _Val) { *_Output = *_First; @@ -5190,6 +5300,26 @@ _CONSTEXPR20 _OutIt unique_copy(_InIt _First, _InIt _Last, _OutIt _Dest, _Pr _Pr auto _UDest = _STD _Get_unwrapped_unverified(_Dest); +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Vector_alg_in_unique_is_safe + && _Output_iterator_for_vector_alg_is_safe()) { + if (!_STD _Is_constant_evaluated()) { + const auto _First_ptr = _STD _To_address(_UFirst); + const auto _Dest_ptr = _STD _To_address(_UDest); + const auto _Result = _STD _Unique_copy_vectorized(_First_ptr, _STD _To_address(_ULast), _Dest_ptr); + + if constexpr (is_pointer_v) { + _UDest = _Result; + } else { + _UDest += _Result - _Dest_ptr; + } + + _STD _Seek_wrapped(_Dest, _UDest); + return _Dest; + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Is_ranges_fwd_iter_v<_InIt>) { // can reread the source for comparison auto _Firstb = _UFirst; @@ -5317,6 +5447,26 @@ namespace ranges { return {_STD move(_First), _STD move(_Output)}; } +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (is_same_v<_Pj, identity> && sized_sentinel_for<_Se, _It> + && _Vector_alg_in_unique_is_safe<_It, _Pr> + && _Output_iterator_for_vector_alg_is_safe<_Out, _It>()) { + if (!_STD is_constant_evaluated()) { + const auto _Size = _Last - _First; + const auto _First_ptr = _STD to_address(_First); + const auto _Last_ptr = _First_ptr + static_cast(_Size); + const auto _Output_ptr = _STD to_address(_Output); + const auto _Result = _STD _Unique_copy_vectorized(_First_ptr, _Last_ptr, _Output_ptr); + + if constexpr (is_pointer_v<_It> && is_pointer_v<_Out>) { + return {_Last_ptr, _Result}; + } else { + return {_STD move(_First) + _Size, _STD move(_Output) + (_Result - _Output_ptr)}; + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Is_input_with_value_type<_Out, iter_value_t<_It>>) { // Can reread _Output *_Output = *_First; diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 623e49632b..3a79345a1a 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -5257,9 +5257,10 @@ __declspec(noalias) void __stdcall __std_replace_8( namespace { template - void* _Remove_fallback(void* const _First, void* const _Last, void* const _Out, const _Ty _Val) noexcept { - _Ty* _Src = reinterpret_cast<_Ty*>(_First); - _Ty* _Dest = reinterpret_cast<_Ty*>(_Out); + void* _Remove_fallback( + const void* const _First, const void* const _Last, void* const _Out, const _Ty _Val) noexcept { + const _Ty* _Src = reinterpret_cast(_First); + _Ty* _Dest = reinterpret_cast<_Ty*>(_Out); while (_Src != _Last) { if (*_Src != _Val) { @@ -5274,9 +5275,9 @@ namespace { } template - void* _Unique_fallback(void* const _First, void* const _Last, void* const _Dest) noexcept { - _Ty* _Out = reinterpret_cast<_Ty*>(_Dest); - _Ty* _Src = reinterpret_cast<_Ty*>(_First); + void* _Unique_fallback(const void* const _First, const void* const _Last, void* const _Dest) noexcept { + _Ty* _Out = reinterpret_cast<_Ty*>(_Dest); + const _Ty* _Src = reinterpret_cast(_First); while (_Src != _Last) { if (*_Src != *_Out) { @@ -5499,6 +5500,98 @@ namespace { } }; + constexpr size_t _Remove_copy_buffer_size = 512; + + template + void* _Remove_impl(void* _First, void* const _Stop, const _Ty _Val) noexcept { + void* _Out = _First; + const auto _Match = _Traits::_Set(_Val); + + do { + const auto _Src = _Traits::_Load(_First); + const uint32_t _Bingo = _Traits::_Mask(_Src, _Match); + _Out = _Traits::_Store_masked(_Out, _Src, _Bingo); + _Advance_bytes(_First, _Traits::_Step); + } while (_First != _Stop); + + return _Out; + } + + template + void* _Remove_copy_impl(const void* _First, const void* const _Stop, void* _Out, const _Ty _Val) noexcept { + unsigned char _Buffer[_Remove_copy_buffer_size]; + void* _Buffer_out = _Buffer; + void* const _Buffer_stop = _Buffer + _Remove_copy_buffer_size - _Traits::_Step; + + const auto _Match = _Traits::_Set(_Val); + + do { + const auto _Src = _Traits::_Load(_First); + const uint32_t _Bingo = _Traits::_Mask(_Src, _Match); + _Buffer_out = _Traits::_Store_masked(_Buffer_out, _Src, _Bingo); + _Advance_bytes(_First, _Traits::_Step); + + if (_Buffer_out >= _Buffer_stop) { + const size_t _Fill = _Byte_length(_Buffer, _Buffer_out); + memcpy(_Out, _Buffer, _Fill); + _Advance_bytes(_Out, _Fill); + _Buffer_out = _Buffer; + } + } while (_First != _Stop); + + const size_t _Fill = _Byte_length(_Buffer, _Buffer_out); + memcpy(_Out, _Buffer, _Fill); + _Advance_bytes(_Out, _Fill); + return _Out; + } + + template + void* _Unique_impl(void* _First, void* const _Stop) noexcept { + void* _Out = _First; + + do { + const auto _Src = _Traits::_Load(_First); + void* _First_d = _First; + _Rewind_bytes(_First_d, _Traits::_Elem_size); + const auto _Match = _Traits::_Load(_First_d); + const uint32_t _Bingo = _Traits::_Mask(_Src, _Match); + _Out = _Traits::_Store_masked(_Out, _Src, _Bingo); + _Advance_bytes(_First, _Traits::_Step); + } while (_First != _Stop); + + _Rewind_bytes(_Out, _Traits::_Elem_size); + return _Out; + } + + template + void* _Unique_copy_impl(const void* _First, const void* const _Stop, void* _Out) noexcept { + unsigned char _Buffer[_Remove_copy_buffer_size]; + void* _Buffer_out = _Buffer; + void* const _Buffer_stop = _Buffer + _Remove_copy_buffer_size - _Traits::_Step; + + do { + const auto _Src = _Traits::_Load(_First); + const void* _First_d = _First; + _Rewind_bytes(_First_d, _Traits::_Elem_size); + const auto _Match = _Traits::_Load(_First_d); + const uint32_t _Bingo = _Traits::_Mask(_Src, _Match); + _Buffer_out = _Traits::_Store_masked(_Buffer_out, _Src, _Bingo); + _Advance_bytes(_First, _Traits::_Step); + + if (_Buffer_out >= _Buffer_stop) { + const size_t _Fill = _Byte_length(_Buffer, _Buffer_out); + memcpy(static_cast(_Out) + _Traits::_Elem_size, _Buffer, _Fill); + _Advance_bytes(_Out, _Fill); + _Buffer_out = _Buffer; + } + } while (_First != _Stop); + + const size_t _Fill = _Byte_length(_Buffer, _Buffer_out); + memcpy(static_cast(_Out) + _Traits::_Elem_size, _Buffer, _Fill); + _Advance_bytes(_Out, _Fill); + return _Out; + } + template void* _Remove_impl(void* _First, const void* _Stop, const _Ty _Val) noexcept { void* _Out = _First; @@ -5611,7 +5704,77 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ return _Remove_fallback(_First, _Last, _Out, _Val); } -void* __stdcall __std_unique_1(void* _First, void* _Last) noexcept { +void* __stdcall __std_remove_copy_1( + const void* _First, const void* const _Last, void* _Out, const uint8_t _Val) noexcept { +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{7}); + _Out = _Remove_copy_impl<_Remove_sse_1>(_First, _Stop, _Out, _Val); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +void* __stdcall __std_remove_copy_2( + const void* _First, const void* const _Last, void* _Out, const uint16_t _Val) noexcept { +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + _Out = _Remove_copy_impl<_Remove_sse_2>(_First, _Stop, _Out, _Val); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +void* __stdcall __std_remove_copy_4( + const void* _First, const void* const _Last, void* _Out, const uint32_t _Val) noexcept { +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + _Out = _Remove_copy_impl<_Remove_avx_4>(_First, _Stop, _Out, _Val); + _First = _Stop; + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } else if (_Use_sse42() && _Size_bytes >= 16) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + _Out = _Remove_copy_impl<_Remove_sse_4>(_First, _Stop, _Out, _Val); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +void* __stdcall __std_remove_copy_8( + const void* _First, const void* const _Last, void* _Out, const uint64_t _Val) noexcept { +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + _Out = _Remove_copy_impl<_Remove_avx_8>(_First, _Stop, _Out, _Val); + _First = _Stop; + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } else if (_Use_sse42() && _Size_bytes >= 16) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + _Out = _Remove_copy_impl<_Remove_sse_8>(_First, _Stop, _Out, _Val); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Remove_fallback(_First, _Last, _Out, _Val); +} + +void* __stdcall __std_unique_1(void* _First, void* const _Last) noexcept { _First = const_cast(__std_adjacent_find_1(_First, _Last)); if (_First == _Last) { @@ -5633,7 +5796,7 @@ void* __stdcall __std_unique_1(void* _First, void* _Last) noexcept { return _Unique_fallback(_First, _Last, _Dest); } -void* __stdcall __std_unique_2(void* _First, void* _Last) noexcept { +void* __stdcall __std_unique_2(void* _First, void* const _Last) noexcept { _First = const_cast(__std_adjacent_find_2(_First, _Last)); if (_First == _Last) { @@ -5655,7 +5818,7 @@ void* __stdcall __std_unique_2(void* _First, void* _Last) noexcept { return _Unique_fallback(_First, _Last, _Dest); } -void* __stdcall __std_unique_4(void* _First, void* _Last) noexcept { +void* __stdcall __std_unique_4(void* _First, void* const _Last) noexcept { _First = const_cast(__std_adjacent_find_4(_First, _Last)); if (_First == _Last) { @@ -5684,7 +5847,7 @@ void* __stdcall __std_unique_4(void* _First, void* _Last) noexcept { return _Unique_fallback(_First, _Last, _Dest); } -void* __stdcall __std_unique_8(void* _First, void* _Last) noexcept { +void* __stdcall __std_unique_8(void* _First, void* const _Last) noexcept { _First = const_cast(__std_adjacent_find_8(_First, _Last)); if (_First == _Last) { @@ -5713,6 +5876,100 @@ void* __stdcall __std_unique_8(void* _First, void* _Last) noexcept { return _Unique_fallback(_First, _Last, _Dest); } +void* __stdcall __std_unique_copy_1(const void* _First, const void* const _Last, void* _Dest) noexcept { + if (_First == _Last) { + return _Dest; + } + + memcpy(_Dest, _First, 1); + _Advance_bytes(_First, 1); + +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{7}); + _Dest = _Unique_copy_impl<_Remove_sse_1>(_First, _Stop, _Dest); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Unique_fallback(_First, _Last, _Dest); +} + +void* __stdcall __std_unique_copy_2(const void* _First, const void* const _Last, void* _Dest) noexcept { + if (_First == _Last) { + return _Dest; + } + + memcpy(_Dest, _First, 2); + _Advance_bytes(_First, 2); + +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + _Dest = _Unique_copy_impl<_Remove_sse_2>(_First, _Stop, _Dest); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Unique_fallback(_First, _Last, _Dest); +} + +void* __stdcall __std_unique_copy_4(const void* _First, const void* const _Last, void* _Dest) noexcept { + if (_First == _Last) { + return _Dest; + } + + memcpy(_Dest, _First, 4); + _Advance_bytes(_First, 4); + +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + _Dest = _Unique_copy_impl<_Remove_avx_4>(_First, _Stop, _Dest); + _First = _Stop; + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } else if (_Use_sse42() && _Size_bytes >= 16) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + _Dest = _Unique_copy_impl<_Remove_sse_4>(_First, _Stop, _Dest); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Unique_fallback(_First, _Last, _Dest); +} + +void* __stdcall __std_unique_copy_8(const void* _First, const void* const _Last, void* _Dest) noexcept { + if (_First == _Last) { + return _Dest; + } + + memcpy(_Dest, _First, 8); + _Advance_bytes(_First, 8); + +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + _Dest = _Unique_copy_impl<_Remove_avx_8>(_First, _Stop, _Dest); + _First = _Stop; + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } else if (_Use_sse42() && _Size_bytes >= 16) { + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0xF}); + _Dest = _Unique_copy_impl<_Remove_sse_8>(_First, _Stop, _Dest); + _First = _Stop; + } +#endif // !defined(_M_ARM64EC) + + return _Unique_fallback(_First, _Last, _Dest); +} + } // extern "C" namespace { diff --git a/tests/std/tests/Dev11_0316853_find_memchr_optimization/test.cpp b/tests/std/tests/Dev11_0316853_find_memchr_optimization/test.cpp index 4a7eb9526b..29e680c2ba 100644 --- a/tests/std/tests/Dev11_0316853_find_memchr_optimization/test.cpp +++ b/tests/std/tests/Dev11_0316853_find_memchr_optimization/test.cpp @@ -4,6 +4,7 @@ // DevDiv-316853 ": find()'s memchr() optimization is incorrect" // DevDiv-468500 ": find()'s memchr() optimization is insufficiently aggressive" +#pragma warning(disable : 4244) // '=': conversion from 'const _Ty' to 'unsigned int', possible loss of data #pragma warning(disable : 4389) // signed/unsigned mismatch #pragma warning(disable : 4805) // '==': unsafe mix of type '_Ty' and type 'const _Ty' in operation // This test intentionally triggers that warning when one of the inputs to find is bool @@ -16,6 +17,7 @@ #include #include +#include #include #include #include @@ -763,4 +765,94 @@ int main() { static_assert(!_Vector_alg_in_find_is_safe, "should not optimize"); static_assert(!_Vector_alg_in_find_is_safe, "should not optimize"); } + + { // quick checks to exercise more codepaths with _Could_compare_equal_to_value_type() + const vector v{200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215}; + const uint32_t u32{205}; + const uint64_t u64{0x1234'5678'0000'00CDull}; + + assert(u32 != u64); // u64 is out-of-range for uint32_t, so it can never compare equal... + assert(u32 == static_cast(u64)); // ... unless an algorithm performs an improper cast + + assert(count(v.begin(), v.end(), u32) == 1); + assert(count(v.begin(), v.end(), u64) == 0); + +#if _HAS_CXX20 + assert(ranges::count(v, u32) == 1); + assert(ranges::count(v, u64) == 0); +#endif // _HAS_CXX20 + + assert(find(v.begin(), v.end(), u32) == v.begin() + 5); + assert(find(v.begin(), v.end(), u64) == v.end()); + +#if _HAS_CXX20 + assert(ranges::find(v, u32) == v.begin() + 5); + assert(ranges::find(v, u64) == v.end()); +#endif // _HAS_CXX20 + +#if _HAS_CXX23 + { + const auto result = ranges::find_last(v, u32); + assert(result.begin() == v.begin() + 5); + assert(result.end() == v.end()); + } + { + const auto result = ranges::find_last(v, u64); + assert(result.begin() == v.end()); + assert(result.end() == v.end()); + } +#endif // _HAS_CXX23 + + { + const vector rem{200, 201, 202, 203, 204, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 0}; + + vector dst(v.size(), 0); + assert(remove_copy(v.begin(), v.end(), dst.begin(), u32) == dst.end() - 1); + assert(dst == rem); + + dst.assign(v.size(), 0); + assert(remove_copy(v.begin(), v.end(), dst.begin(), u64) == dst.end()); + assert(dst == v); + +#if _HAS_CXX20 + { + dst.assign(v.size(), 0); + const auto result = ranges::remove_copy(v, dst.begin(), u32); + assert(result.in == v.end()); + assert(result.out == dst.end() - 1); + assert(dst == rem); + } + { + dst.assign(v.size(), 0); + const auto result = ranges::remove_copy(v, dst.begin(), u64); + assert(result.in == v.end()); + assert(result.out == dst.end()); + assert(dst == v); + } +#endif // _HAS_CXX20 + } + + { + const vector rep{200, 201, 202, 203, 204, 333, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215}; + const uint32_t val{333}; + + vector dst = v; + replace(dst.begin(), dst.end(), u32, val); + assert(dst == rep); + + dst = v; + replace(dst.begin(), dst.end(), u64, uint64_t{val}); + assert(dst == v); + +#if _HAS_CXX20 + dst = v; + assert(ranges::replace(dst, u32, val) == dst.end()); + assert(dst == rep); + + dst = v; + assert(ranges::replace(dst, u64, val) == dst.end()); + assert(dst == v); +#endif // _HAS_CXX20 + } + } } diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 13134bdd86..3bbe63d8b7 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -720,6 +720,20 @@ FwdIt last_known_good_remove(FwdIt first, FwdIt last, T val) { return dest; } +template +OutIt last_known_good_remove_copy(InIt first, InIt last, OutIt dest, T val) { + while (first != last) { + if (*first != val) { + *dest = *first; + ++dest; + } + + ++first; + } + + return dest; +} + template void test_case_remove(vector& in_out_expected, vector& in_out_actual, vector& in_out_actual_r, const T val) { auto rem_expected = last_known_good_remove(in_out_expected.begin(), in_out_expected.end(), val); @@ -729,26 +743,51 @@ void test_case_remove(vector& in_out_expected, vector& in_out_actual, vect #if _HAS_CXX20 auto rem_actual_r = ranges::remove(in_out_actual_r, val); assert(equal(in_out_expected.begin(), rem_expected, begin(in_out_actual_r), begin(rem_actual_r))); + assert(end(rem_actual_r) == in_out_actual_r.end()); #else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv (void) in_out_actual_r; #endif // ^^^ !_HAS_CXX20 ^^^ } +template +void test_case_remove_copy( + const vector& source, vector& out_expected, vector& out_actual, vector& out_actual_r, const T val) { + auto rem_expected = last_known_good_remove_copy(source.begin(), source.end(), out_expected.begin(), val); + auto rem_actual = remove_copy(source.begin(), source.end(), out_actual.begin(), val); + assert(equal(out_expected.begin(), rem_expected, out_actual.begin(), rem_actual)); + assert(equal(rem_expected, out_expected.end(), rem_actual, out_actual.end())); + +#if _HAS_CXX20 + auto rem_actual_r = ranges::remove_copy(source, out_actual_r.begin(), val); + assert(equal(out_expected.begin(), rem_expected, out_actual_r.begin(), rem_actual_r.out)); + assert(equal(rem_expected, out_expected.end(), rem_actual_r.out, out_actual_r.end())); + assert(rem_actual_r.in == source.end()); +#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv + (void) out_actual_r; +#endif // ^^^ !_HAS_CXX20 ^^^ +} + template void test_remove(mt19937_64& gen) { using TD = conditional_t; binomial_distribution dis(10); vector source; + vector out_expected; + vector out_actual; + vector out_actual_r; vector in_out_expected; vector in_out_actual; vector in_out_actual_r; - for (const auto& v : {&source, &in_out_expected, &in_out_actual, &in_out_actual_r}) { + for (const auto& v : + {&source, &in_out_expected, &in_out_actual, &in_out_actual_r, &out_expected, &out_actual, &out_actual_r}) { v->reserve(dataCount); } test_case_remove(in_out_expected, in_out_actual, in_out_actual_r, static_cast(dis(gen))); + test_case_remove_copy(source, out_expected, out_actual, out_actual_r, static_cast(dis(gen))); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { source.push_back(static_cast(dis(gen))); @@ -756,7 +795,12 @@ void test_remove(mt19937_64& gen) { *v = source; } + for (const auto& v : {&out_expected, &out_actual, &out_actual_r}) { + v->assign(source.size(), T{0}); + } + test_case_remove(in_out_expected, in_out_actual, in_out_actual_r, static_cast(dis(gen))); + test_case_remove_copy(source, out_expected, out_actual, out_actual_r, static_cast(dis(gen))); } } @@ -782,6 +826,28 @@ FwdIt last_known_good_unique(FwdIt first, FwdIt last) { return dest; } +template +FwdItOut last_known_good_unique_copy(FwdItIn first, FwdItIn last, FwdItOut dest) { + if (first == last) { + return dest; + } + + *dest = *first; + ++first; + + while (first != last) { + if (*first != *dest) { + ++dest; + *dest = *first; + } + + ++first; + } + + ++dest; + return dest; +} + template void test_case_unique(vector& in_out_expected, vector& in_out_actual, vector& in_out_actual_r) { auto un_expected = last_known_good_unique(in_out_expected.begin(), in_out_expected.end()); @@ -791,11 +857,30 @@ void test_case_unique(vector& in_out_expected, vector& in_out_actual, vect #if _HAS_CXX20 auto un_actual_r = ranges::unique(in_out_actual_r); assert(equal(in_out_expected.begin(), un_expected, begin(in_out_actual_r), begin(un_actual_r))); + assert(end(un_actual_r) == in_out_actual_r.end()); #else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv (void) in_out_actual_r; #endif // ^^^ !_HAS_CXX20 ^^^ } +template +void test_case_unique_copy( + const vector& source, vector& out_expected, vector& out_actual, vector& out_actual_r) { + auto un_expected = last_known_good_unique_copy(source.begin(), source.end(), out_expected.begin()); + auto un_actual = unique_copy(source.begin(), source.end(), out_actual.begin()); + assert(equal(out_expected.begin(), un_expected, out_actual.begin(), un_actual)); + assert(equal(un_expected, out_expected.end(), un_actual, out_actual.end())); + +#if _HAS_CXX20 + auto un_actual_r = ranges::unique_copy(source, out_actual_r.begin()); + assert(equal(out_expected.begin(), un_expected, out_actual_r.begin(), un_actual_r.out)); + assert(equal(un_expected, out_expected.end(), un_actual_r.out, out_actual_r.end())); + assert(un_actual_r.in == source.end()); +#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv + (void) out_actual_r; +#endif // ^^^ !_HAS_CXX20 ^^^ +} + template void test_unique(mt19937_64& gen) { constexpr int number_of_values = 5; @@ -808,15 +893,20 @@ void test_unique(mt19937_64& gen) { binomial_distribution dis(number_of_values); vector source; + vector out_expected; + vector out_actual; + vector out_actual_r; vector in_out_expected; vector in_out_actual; vector in_out_actual_r; - for (const auto& v : {&source, &in_out_expected, &in_out_actual, &in_out_actual_r}) { + for (const auto& v : + {&source, &in_out_expected, &in_out_actual, &in_out_actual_r, &out_expected, &out_actual, &out_actual_r}) { v->reserve(dataCount); } test_case_unique(in_out_expected, in_out_actual, in_out_actual_r); + test_case_unique_copy(source, out_expected, out_actual, out_actual_r); for (size_t attempts = 0; attempts < dataCount; ++attempts) { if constexpr (is_pointer_v) { source.push_back(ptr_val_array.data() + dis(gen)); @@ -828,7 +918,12 @@ void test_unique(mt19937_64& gen) { *v = source; } + for (const auto& v : {&out_expected, &out_actual, &out_actual_r}) { + v->assign(source.size(), T{0}); + } + test_case_unique(in_out_expected, in_out_actual, in_out_actual_r); + test_case_unique_copy(source, out_expected, out_actual, out_actual_r); } }