diff --git a/benchmarks/src/remove.cpp b/benchmarks/src/remove.cpp index f0d28f6d734..8eae9882f39 100644 --- a/benchmarks/src/remove.cpp +++ b/benchmarks/src/remove.cpp @@ -7,13 +7,14 @@ #include #include "lorem.hpp" +#include "skewed_allocator.hpp" enum class alg_type { std_fn, rng }; template void r(benchmark::State& state) { - const std::vector src(lorem_ipsum.begin(), lorem_ipsum.end()); - std::vector v; + const std::vector> src(lorem_ipsum.begin(), lorem_ipsum.end()); + std::vector> v; v.reserve(lorem_ipsum.size()); for (auto _ : state) { v = src; @@ -26,6 +27,21 @@ void r(benchmark::State& state) { } } +template +void rc(benchmark::State& state) { + std::vector> src(lorem_ipsum.begin(), lorem_ipsum.end()); + std::vector> dst; + dst.resize(src.size()); + for (auto _ : state) { + benchmark::DoNotOptimize(src); + if constexpr (Type == alg_type::std_fn) { + benchmark::DoNotOptimize(std::remove_copy(src.begin(), src.end(), dst.begin(), T{'l'})); + } else { + benchmark::DoNotOptimize(std::ranges::remove_copy(src, dst.begin(), T{'l'})); + } + } +} + BENCHMARK(r); BENCHMARK(r); BENCHMARK(r); @@ -36,4 +52,10 @@ BENCHMARK(r); BENCHMARK(r); BENCHMARK(r); +BENCHMARK(rc); +BENCHMARK(rc); + +BENCHMARK(rc); +BENCHMARK(rc); + BENCHMARK_MAIN(); diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 1b66037563d..1693aa34660 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -47,6 +47,9 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4( __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8( const void* _First, const void* _Last, void* _Dest) noexcept; +void* __stdcall __std_remove_copy_4(const void* _First, const void* _Last, void* _Dest, uint32_t _Val) noexcept; +void* __stdcall __std_remove_copy_8(const void* _First, const void* _Last, void* _Dest, uint64_t _Val) noexcept; + _Min_max_element_t __stdcall __std_minmax_element_1(const void* _First, const void* _Last, bool _Signed) noexcept; _Min_max_element_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept; _Min_max_element_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept; @@ -98,6 +101,26 @@ __declspec(noalias) void _Reverse_copy_vectorized(const void* _First, const void } } +template +_OutTy* _Remove_copy_vectorized( + const _InTy* const _First, const _InTy* const _Last, _OutTy* const _Dest, const _TVal _Val) noexcept { + _STL_INTERNAL_STATIC_ASSERT(sizeof(_InTy) == sizeof(_OutTy)); + + if constexpr (is_pointer_v<_InTy>) { +#ifdef _WIN64 + return reinterpret_cast<_OutTy*>(::__std_remove_copy_8(_First, _Last, _Dest, reinterpret_cast(_Val))); +#else + return reinterpret_cast<_OutTy*>(::__std_remove_copy_4(_First, _Last, _Dest, reinterpret_cast(_Val))); +#endif + } else if constexpr (sizeof(_InTy) == 4) { + return reinterpret_cast<_OutTy*>(::__std_remove_copy_4(_First, _Last, _Dest, static_cast(_Val))); + } else if constexpr (sizeof(_InTy) == 8) { + return reinterpret_cast<_OutTy*>(::__std_remove_copy_8(_First, _Last, _Dest, static_cast(_Val))); + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size + } +} + template pair<_Ty*, _Ty*> _Minmax_element_vectorized(_Ty* const _First, _Ty* const _Last) noexcept { constexpr bool _Signed = is_signed_v<_Ty>; @@ -239,6 +262,16 @@ template constexpr bool _Vector_alg_in_ranges_replace_is_safe = _Vector_alg_in_replace_is_safe<_Iter, _Ty1> // can search and replace && _Vector_alg_in_find_is_safe_elem<_Ty2, _Iter_value_t<_Iter>>; // replacement fits + +// Can we activate the vector algorithms for remove_copy_if? +template +constexpr bool _Vector_alg_in_remove_copy_is_safe = + // Can copy values bitwise + _Sent_copy_cat<_InIt, _Sent, _OutIt>::_Bitcopy_assignable + // The type of the value to remove must be compatible with the type of the elements. + && _Vector_alg_in_find_is_safe_elem<_Ty, _Iter_value_t<_InIt>> + // AVX2 mask compatible size + && sizeof(_Iter_value_t<_InIt>) >= 4; _STD_END #endif // _USE_STD_VECTOR_ALGORITHMS @@ -4564,6 +4597,30 @@ _CONSTEXPR20 _OutIt remove_copy(_InIt _First, _InIt _Last, _OutIt _Dest, const _ auto _UFirst = _STD _Get_unwrapped(_First); const auto _ULast = _STD _Get_unwrapped(_Last); auto _UDest = _STD _Get_unwrapped_unverified(_Dest); + +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Vector_alg_in_remove_copy_is_safe) { + if (!_STD _Is_constant_evaluated()) { + if (!_STD _Could_compare_equal_to_value_type(_Val)) { + _STD _Seek_wrapped(_Dest, _STD _Copy_unchecked(_UFirst, _ULast, _UDest)); + return _Dest; + } + + const auto _Dest_ptr = _STD _To_address(_UDest); + const auto _Result = + _STD _Remove_copy_vectorized(_STD _To_address(_UFirst), _STD _To_address(_ULast), _Dest_ptr, _Val); + if constexpr (is_pointer_v) { + _UDest = _Result; + } else { + _UDest += _Result - _Dest_ptr; + } + + _STD _Seek_wrapped(_Dest, _UDest); + return _Dest; + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + for (; _UFirst != _ULast; ++_UFirst) { if (!(*_UFirst == _Val)) { *_UDest = *_UFirst; @@ -4789,6 +4846,28 @@ namespace ranges { _STL_INTERNAL_STATIC_ASSERT(indirectly_copyable<_It, _Out>); _STL_INTERNAL_STATIC_ASSERT(indirect_binary_predicate, const _Ty*>); +#if _USE_STD_VECTOR_ALGORITHMS + if constexpr (_Vector_alg_in_remove_copy_is_safe<_It, _Se, _Out, _Ty> && is_same_v<_Pj, identity>) { + if (!_STD _Is_constant_evaluated()) { + if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) { + return _RANGES _Copy_unchecked(_First, _Last, _Output); + } + + const auto _Size = _Last - _First; + const auto _First_ptr = _STD to_address(_First); + const auto _Last_ptr = _First_ptr + static_cast(_Size); + const auto _Dest_ptr = _STD to_address(_Output); + const auto _Result = _STD _Remove_copy_vectorized(_First_ptr, _Last_ptr, _Dest_ptr, _Val); + + if constexpr (is_pointer_v<_It> && is_pointer_v<_Out>) { + return {_Last_ptr, _Result}; + } else { + return {_First + _Size, _Output + (_Result - _Dest_ptr)}; + } + } + } +#endif // _USE_STD_VECTOR_ALGORITHMS + for (; _First != _Last; ++_First) { if (_STD invoke(_Proj, *_First) != _Val) { *_Output = *_First; diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index e89a0fba919..fb7122380a4 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -4118,9 +4118,10 @@ __declspec(noalias) void __stdcall __std_replace_8( namespace { template - void* _Remove_fallback(void* const _First, void* const _Last, void* const _Out, const _Ty _Val) noexcept { - _Ty* _Src = reinterpret_cast<_Ty*>(_First); - _Ty* _Dest = reinterpret_cast<_Ty*>(_Out); + void* _Remove_fallback( + const void* const _First, const void* const _Last, void* const _Out, const _Ty _Val) noexcept { + auto _Src = reinterpret_cast(_First); + auto _Dest = reinterpret_cast<_Ty*>(_Out); while (_Src != _Last) { if (*_Src != _Val) { @@ -4192,7 +4193,7 @@ namespace { extern "C" { void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _Val) noexcept { - void* _Out = _First; + void* _Dest = _First; #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) { @@ -4204,19 +4205,19 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V const __m128i _Src = _mm_loadu_si64(_First); const uint32_t _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF; const __m128i _Shuf = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si64(_Out, _Dest); - _Advance_bytes(_Out, _Remove_tables_1_sse._Size[_Bingo]); + const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si64(_Dest, _Vec); + _Advance_bytes(_Dest, _Remove_tables_1_sse._Size[_Bingo]); _Advance_bytes(_First, 8); } while (_First != _Stop); } #endif // !defined(_M_ARM64EC) - return _Remove_fallback(_First, _Last, _Out, _Val); + return _Remove_fallback(_First, _Last, _Dest, _Val); } void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _Val) noexcept { - void* _Out = _First; + void* _Dest = _First; #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) { @@ -4229,19 +4230,19 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _ const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match); const uint32_t _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128())); const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_2_sse._Shuf[_Bingo])); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_2_sse._Size[_Bingo]); + const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Vec); + _Advance_bytes(_Dest, _Remove_tables_2_sse._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } #endif // !defined(_M_ARM64EC) - return _Remove_fallback(_First, _Last, _Out, _Val); + return _Remove_fallback(_First, _Last, _Dest, _Val); } void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _Val) noexcept { - void* _Out = _First; + void* _Dest = _First; #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { @@ -4254,9 +4255,9 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_4_avx._Size[_Bingo]); + const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Vec); + _Advance_bytes(_Dest, _Remove_tables_4_avx._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); @@ -4271,19 +4272,19 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _ const __m128i _Mask = _mm_cmpeq_epi32(_Src, _Match); const uint32_t _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask)); const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_4_sse._Shuf[_Bingo])); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_4_sse._Size[_Bingo]); + const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Vec); + _Advance_bytes(_Dest, _Remove_tables_4_sse._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } #endif // !defined(_M_ARM64EC) - return _Remove_fallback(_First, _Last, _Out, _Val); + return _Remove_fallback(_First, _Last, _Dest, _Val); } void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _Val) noexcept { - void* _Out = _First; + void* _Dest = _First; #ifndef _M_ARM64EC if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { @@ -4296,9 +4297,9 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo])); - const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_8_avx._Size[_Bingo]); + const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Vec); + _Advance_bytes(_Dest, _Remove_tables_8_avx._Size[_Bingo]); _Advance_bytes(_First, 32); } while (_First != _Stop); @@ -4313,15 +4314,67 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _ const __m128i _Mask = _mm_cmpeq_epi64(_Src, _Match); const uint32_t _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask)); const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast(_Remove_tables_8_sse._Shuf[_Bingo])); - const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf); - _mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest); - _Advance_bytes(_Out, _Remove_tables_8_sse._Size[_Bingo]); + const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf); + _mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Vec); + _Advance_bytes(_Dest, _Remove_tables_8_sse._Size[_Bingo]); _Advance_bytes(_First, 16); } while (_First != _Stop); } #endif // !defined(_M_ARM64EC) - return _Remove_fallback(_First, _Last, _Out, _Val); + return _Remove_fallback(_First, _Last, _Dest, _Val); +} + +void* __stdcall __std_remove_copy_4(const void* _First, const void* _Last, void* _Dest, uint32_t _Val) noexcept { +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { + const __m256i _Match = _mm256_set1_epi32(_Val); + + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + do { + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match); + const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask)); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo])); + const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const size_t _Left = _Remove_tables_4_avx._Size[_Bingo]; + _mm256_maskstore_epi32(reinterpret_cast(_Dest), _Avx2_tail_mask_32(_Left >> 2), _Vec); + _Advance_bytes(_Dest, _Left); + _Advance_bytes(_First, 32); + } while (_First != _Stop); + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } +#endif // !defined(_M_ARM64EC) + + return _Remove_fallback(_First, _Last, _Dest, _Val); +} + +void* __stdcall __std_remove_copy_8(const void* _First, const void* _Last, void* _Dest, uint64_t _Val) noexcept { +#ifndef _M_ARM64EC + if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) { + const __m256i _Match = _mm256_set1_epi64x(_Val); + + const void* _Stop = _First; + _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F}); + do { + const __m256i _Src = _mm256_loadu_si256(reinterpret_cast(_First)); + const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match); + const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask)); + const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo])); + const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf); + const size_t _Left = _Remove_tables_8_avx._Size[_Bingo]; + _mm256_maskstore_epi64(reinterpret_cast(_Dest), _Avx2_tail_mask_32(_Left >> 2), _Vec); + _Advance_bytes(_Dest, _Left); + _Advance_bytes(_First, 32); + } while (_First != _Stop); + + _mm256_zeroupper(); // TRANSITION, DevCom-10331414 + } +#endif // !defined(_M_ARM64EC) + + return _Remove_fallback(_First, _Last, _Dest, _Val); } } // extern "C" diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index bba38a1d3d1..bc0c9c4a05b 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -871,6 +871,20 @@ FwdIt last_known_good_remove(FwdIt first, FwdIt last, T val) { return dest; } +template +OutFwdIt last_known_good_remove_copy(InFwdIt first, InFwdIt last, OutFwdIt dest, T val) { + while (first != last) { + if (*first != val) { + *dest = *first; + ++dest; + } + + ++first; + } + + return dest; +} + template void test_case_remove(vector& in_out_expected, vector& in_out_actual, vector& in_out_actual_r, const T val) { auto rem_expected = last_known_good_remove(in_out_expected.begin(), in_out_expected.end(), val); @@ -911,6 +925,49 @@ void test_remove(mt19937_64& gen) { } } +template +void test_case_remove_copy( + const vector& input, vector& out_expected, vector& out_actual, vector& out_actual_r, const T val) { + + auto rem_expected = last_known_good_remove_copy(input.begin(), input.end(), out_expected.begin(), val); + auto rem_actual = remove_copy(input.begin(), input.end(), out_actual.begin(), val); + assert(equal(out_expected.begin(), rem_expected, out_actual.begin(), rem_actual)); + +#if _HAS_CXX20 + auto rem_actual_r = ranges::remove_copy(input, out_actual_r.begin(), val); + assert(equal(out_expected.begin(), rem_expected, begin(out_actual_r), rem_actual_r.out)); +#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv + (void) out_actual_r; +#endif // ^^^ !_HAS_CXX20 ^^^ +} + +template +void test_remove_copy(mt19937_64& gen) { + using TD = conditional_t; + binomial_distribution dis(10); + + vector source; + vector out_expected; + vector out_actual; + vector out_actual_r; + + for (const auto& v : {&source, &out_expected, &out_actual, &out_actual_r}) { + v->reserve(dataCount); + } + + test_case_remove_copy(source, out_expected, out_actual, out_actual_r, static_cast(dis(gen))); + for (size_t attempts = 0; attempts < dataCount; ++attempts) { + source.push_back(static_cast(dis(gen))); + + for (const auto& v : {&out_expected, &out_actual, &out_actual_r}) { + v->clear(); + v->resize(source.size()); + } + + test_case_remove_copy(source, out_expected, out_actual, out_actual_r, static_cast(dis(gen))); + } +} + template void test_swap_ranges(mt19937_64& gen) { const auto fn = [&]() { return static_cast(gen()); }; @@ -1128,6 +1185,11 @@ void test_vector_algorithms(mt19937_64& gen) { test_remove(gen); test_remove(gen); + test_remove_copy(gen); + test_remove_copy(gen); + test_remove_copy(gen); + test_remove_copy(gen); + test_swap_ranges(gen); test_swap_ranges(gen); test_swap_ranges(gen);