Skip to content
26 changes: 24 additions & 2 deletions benchmarks/src/remove.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
#include <vector>

#include "lorem.hpp"
#include "skewed_allocator.hpp"

enum class alg_type { std_fn, rng };

template <alg_type Type, class T>
void r(benchmark::State& state) {
const std::vector<T> src(lorem_ipsum.begin(), lorem_ipsum.end());
std::vector<T> v;
const std::vector<T, not_highly_aligned_allocator<T>> src(lorem_ipsum.begin(), lorem_ipsum.end());
std::vector<T, not_highly_aligned_allocator<T>> v;
v.reserve(lorem_ipsum.size());
for (auto _ : state) {
v = src;
Expand All @@ -26,6 +27,21 @@ void r(benchmark::State& state) {
}
}

// Benchmark std::remove_copy vs std::ranges::remove_copy, copying lorem ipsum
// text (widened to element type T) while dropping every 'l'.
template <alg_type Type, class T>
void rc(benchmark::State& state) {
    using vec_t = std::vector<T, not_highly_aligned_allocator<T>>;
    vec_t input(lorem_ipsum.begin(), lorem_ipsum.end());
    vec_t output(input.size()); // pre-sized once; remove_copy writes at most input.size() elements
    for (auto _ : state) {
        benchmark::DoNotOptimize(input);
        if constexpr (Type == alg_type::rng) {
            benchmark::DoNotOptimize(std::ranges::remove_copy(input, output.begin(), T{'l'}));
        } else {
            benchmark::DoNotOptimize(std::remove_copy(input.begin(), input.end(), output.begin(), T{'l'}));
        }
    }
}

BENCHMARK(r<alg_type::std_fn, std::uint8_t>);
BENCHMARK(r<alg_type::std_fn, std::uint16_t>);
BENCHMARK(r<alg_type::std_fn, std::uint32_t>);
Expand All @@ -36,4 +52,10 @@ BENCHMARK(r<alg_type::rng, std::uint16_t>);
BENCHMARK(r<alg_type::rng, std::uint32_t>);
BENCHMARK(r<alg_type::rng, std::uint64_t>);

BENCHMARK(rc<alg_type::std_fn, std::uint32_t>);
BENCHMARK(rc<alg_type::std_fn, std::uint64_t>);

BENCHMARK(rc<alg_type::rng, std::uint32_t>);
BENCHMARK(rc<alg_type::rng, std::uint64_t>);

BENCHMARK_MAIN();
79 changes: 79 additions & 0 deletions stl/inc/algorithm
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ __declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4(
__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
const void* _First, const void* _Last, void* _Dest) noexcept;

void* __stdcall __std_remove_copy_4(const void* _First, const void* _Last, void* _Dest, uint32_t _Val) noexcept;
void* __stdcall __std_remove_copy_8(const void* _First, const void* _Last, void* _Dest, uint64_t _Val) noexcept;

_Min_max_element_t __stdcall __std_minmax_element_1(const void* _First, const void* _Last, bool _Signed) noexcept;
_Min_max_element_t __stdcall __std_minmax_element_2(const void* _First, const void* _Last, bool _Signed) noexcept;
_Min_max_element_t __stdcall __std_minmax_element_4(const void* _First, const void* _Last, bool _Signed) noexcept;
Expand Down Expand Up @@ -98,6 +101,26 @@ __declspec(noalias) void _Reverse_copy_vectorized(const void* _First, const void
}
}

// Dispatch remove_copy to the separately compiled vector implementation for the element size.
// [_First, _Last) is the source, _Dest the destination start; returns the end of the written output.
// Only 4- and 8-byte elements are supported (see _Vector_alg_in_remove_copy_is_safe).
template <class _InTy, class _OutTy, class _TVal>
_OutTy* _Remove_copy_vectorized(
    const _InTy* const _First, const _InTy* const _Last, _OutTy* const _Dest, const _TVal _Val) noexcept {
    _STL_INTERNAL_STATIC_ASSERT(sizeof(_InTy) == sizeof(_OutTy));

    if constexpr (is_pointer_v<_InTy>) {
        // Pointer elements are compared bitwise; pass the value as an integer of pointer width.
#ifdef _WIN64
        return reinterpret_cast<_OutTy*>(::__std_remove_copy_8(_First, _Last, _Dest, reinterpret_cast<uint64_t>(_Val)));
#else
        return reinterpret_cast<_OutTy*>(::__std_remove_copy_4(_First, _Last, _Dest, reinterpret_cast<uint32_t>(_Val)));
#endif
    } else if constexpr (sizeof(_InTy) == 4) {
        return reinterpret_cast<_OutTy*>(::__std_remove_copy_4(_First, _Last, _Dest, static_cast<uint32_t>(_Val)));
    } else if constexpr (sizeof(_InTy) == 8) {
        return reinterpret_cast<_OutTy*>(::__std_remove_copy_8(_First, _Last, _Dest, static_cast<uint64_t>(_Val)));
    } else {
        _STL_INTERNAL_STATIC_ASSERT(false); // Unexpected size
    }
}

template <class _Ty>
pair<_Ty*, _Ty*> _Minmax_element_vectorized(_Ty* const _First, _Ty* const _Last) noexcept {
constexpr bool _Signed = is_signed_v<_Ty>;
Expand Down Expand Up @@ -239,6 +262,16 @@ template <class _Iter, class _Ty1, class _Ty2>
constexpr bool _Vector_alg_in_ranges_replace_is_safe =
_Vector_alg_in_replace_is_safe<_Iter, _Ty1> // can search and replace
&& _Vector_alg_in_find_is_safe_elem<_Ty2, _Iter_value_t<_Iter>>; // replacement fits

// Can we activate the vector algorithms for remove_copy?
// (Not remove_copy_if: that takes an arbitrary predicate, which this value-based check can't cover.)
template <class _InIt, class _Sent, class _OutIt, class _Ty>
constexpr bool _Vector_alg_in_remove_copy_is_safe =
    // Can copy values bitwise
    _Sent_copy_cat<_InIt, _Sent, _OutIt>::_Bitcopy_assignable
    // The type of the value to remove must be compatible with the type of the elements.
    && _Vector_alg_in_find_is_safe_elem<_Ty, _Iter_value_t<_InIt>>
    // AVX2 mask compatible size
    && sizeof(_Iter_value_t<_InIt>) >= 4;
_STD_END
#endif // _USE_STD_VECTOR_ALGORITHMS

Expand Down Expand Up @@ -4564,6 +4597,30 @@ _CONSTEXPR20 _OutIt remove_copy(_InIt _First, _InIt _Last, _OutIt _Dest, const _
auto _UFirst = _STD _Get_unwrapped(_First);
const auto _ULast = _STD _Get_unwrapped(_Last);
auto _UDest = _STD _Get_unwrapped_unverified(_Dest);

#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (_Vector_alg_in_remove_copy_is_safe<decltype(_UFirst), decltype(_UFirst), decltype(_UDest), _Ty>) {
if (!_STD _Is_constant_evaluated()) {
if (!_STD _Could_compare_equal_to_value_type<decltype(_UFirst)>(_Val)) {
_STD _Seek_wrapped(_Dest, _STD _Copy_unchecked(_UFirst, _ULast, _UDest));
return _Dest;
}

const auto _Dest_ptr = _STD _To_address(_UDest);
const auto _Result =
_STD _Remove_copy_vectorized(_STD _To_address(_UFirst), _STD _To_address(_ULast), _Dest_ptr, _Val);
if constexpr (is_pointer_v<decltype(_UDest)>) {
_UDest = _Result;
} else {
_UDest += _Result - _Dest_ptr;
}

_STD _Seek_wrapped(_Dest, _UDest);
return _Dest;
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS

for (; _UFirst != _ULast; ++_UFirst) {
if (!(*_UFirst == _Val)) {
*_UDest = *_UFirst;
Expand Down Expand Up @@ -4789,6 +4846,28 @@ namespace ranges {
_STL_INTERNAL_STATIC_ASSERT(indirectly_copyable<_It, _Out>);
_STL_INTERNAL_STATIC_ASSERT(indirect_binary_predicate<ranges::equal_to, projected<_It, _Pj>, const _Ty*>);

#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (_Vector_alg_in_remove_copy_is_safe<_It, _Se, _Out, _Ty> && is_same_v<_Pj, identity>) {
if (!_STD _Is_constant_evaluated()) {
if (!_STD _Could_compare_equal_to_value_type<_It>(_Val)) {
return _RANGES _Copy_unchecked(_First, _Last, _Output);
}

const auto _Size = _Last - _First;
const auto _First_ptr = _STD to_address(_First);
const auto _Last_ptr = _First_ptr + static_cast<size_t>(_Size);
const auto _Dest_ptr = _STD to_address(_Output);
const auto _Result = _STD _Remove_copy_vectorized(_First_ptr, _Last_ptr, _Dest_ptr, _Val);

if constexpr (is_pointer_v<_It> && is_pointer_v<_Out>) {
return {_Last_ptr, _Result};
} else {
return {_First + _Size, _Output + (_Result - _Dest_ptr)};
}
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS

for (; _First != _Last; ++_First) {
if (_STD invoke(_Proj, *_First) != _Val) {
*_Output = *_First;
Expand Down
111 changes: 82 additions & 29 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4118,9 +4118,10 @@ __declspec(noalias) void __stdcall __std_replace_8(

namespace {
template <class _Ty>
void* _Remove_fallback(void* const _First, void* const _Last, void* const _Out, const _Ty _Val) noexcept {
_Ty* _Src = reinterpret_cast<_Ty*>(_First);
_Ty* _Dest = reinterpret_cast<_Ty*>(_Out);
void* _Remove_fallback(
const void* const _First, const void* const _Last, void* const _Out, const _Ty _Val) noexcept {
auto _Src = reinterpret_cast<const _Ty*>(_First);
auto _Dest = reinterpret_cast<_Ty*>(_Out);

while (_Src != _Last) {
if (*_Src != _Val) {
Expand Down Expand Up @@ -4192,7 +4193,7 @@ namespace {
extern "C" {

void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _Val) noexcept {
void* _Out = _First;
void* _Dest = _First;

#ifndef _M_ARM64EC
if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 8) {
Expand All @@ -4204,19 +4205,19 @@ void* __stdcall __std_remove_1(void* _First, void* const _Last, const uint8_t _V
const __m128i _Src = _mm_loadu_si64(_First);
const uint32_t _Bingo = _mm_movemask_epi8(_mm_cmpeq_epi8(_Src, _Match)) & 0xFF;
const __m128i _Shuf = _mm_loadu_si64(_Remove_tables_1_sse._Shuf[_Bingo]);
const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si64(_Out, _Dest);
_Advance_bytes(_Out, _Remove_tables_1_sse._Size[_Bingo]);
const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si64(_Dest, _Vec);
_Advance_bytes(_Dest, _Remove_tables_1_sse._Size[_Bingo]);
_Advance_bytes(_First, 8);
} while (_First != _Stop);
}
#endif // !defined(_M_ARM64EC)

return _Remove_fallback(_First, _Last, _Out, _Val);
return _Remove_fallback(_First, _Last, _Dest, _Val);
}

void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _Val) noexcept {
void* _Out = _First;
void* _Dest = _First;

#ifndef _M_ARM64EC
if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_sse42() && _Size_bytes >= 16) {
Expand All @@ -4229,19 +4230,19 @@ void* __stdcall __std_remove_2(void* _First, void* const _Last, const uint16_t _
const __m128i _Mask = _mm_cmpeq_epi16(_Src, _Match);
const uint32_t _Bingo = _mm_movemask_epi8(_mm_packs_epi16(_Mask, _mm_setzero_si128()));
const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Remove_tables_2_sse._Shuf[_Bingo]));
const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest);
_Advance_bytes(_Out, _Remove_tables_2_sse._Size[_Bingo]);
const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Vec);
_Advance_bytes(_Dest, _Remove_tables_2_sse._Size[_Bingo]);
_Advance_bytes(_First, 16);
} while (_First != _Stop);
}
#endif // !defined(_M_ARM64EC)

return _Remove_fallback(_First, _Last, _Out, _Val);
return _Remove_fallback(_First, _Last, _Dest, _Val);
}

void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _Val) noexcept {
void* _Out = _First;
void* _Dest = _First;

#ifndef _M_ARM64EC
if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) {
Expand All @@ -4254,9 +4255,9 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _
const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match);
const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask));
const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo]));
const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest);
_Advance_bytes(_Out, _Remove_tables_4_avx._Size[_Bingo]);
const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Vec);
_Advance_bytes(_Dest, _Remove_tables_4_avx._Size[_Bingo]);
_Advance_bytes(_First, 32);
} while (_First != _Stop);

Expand All @@ -4271,19 +4272,19 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _
const __m128i _Mask = _mm_cmpeq_epi32(_Src, _Match);
const uint32_t _Bingo = _mm_movemask_ps(_mm_castsi128_ps(_Mask));
const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Remove_tables_4_sse._Shuf[_Bingo]));
const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest);
_Advance_bytes(_Out, _Remove_tables_4_sse._Size[_Bingo]);
const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Vec);
_Advance_bytes(_Dest, _Remove_tables_4_sse._Size[_Bingo]);
_Advance_bytes(_First, 16);
} while (_First != _Stop);
}
#endif // !defined(_M_ARM64EC)

return _Remove_fallback(_First, _Last, _Out, _Val);
return _Remove_fallback(_First, _Last, _Dest, _Val);
}

void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _Val) noexcept {
void* _Out = _First;
void* _Dest = _First;

#ifndef _M_ARM64EC
if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) {
Expand All @@ -4296,9 +4297,9 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _
const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match);
const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask));
const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo]));
const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest);
_Advance_bytes(_Out, _Remove_tables_8_avx._Size[_Bingo]);
const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Dest), _Vec);
_Advance_bytes(_Dest, _Remove_tables_8_avx._Size[_Bingo]);
_Advance_bytes(_First, 32);
} while (_First != _Stop);

Expand All @@ -4313,15 +4314,67 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _
const __m128i _Mask = _mm_cmpeq_epi64(_Src, _Match);
const uint32_t _Bingo = _mm_movemask_pd(_mm_castsi128_pd(_Mask));
const __m128i _Shuf = _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Remove_tables_8_sse._Shuf[_Bingo]));
const __m128i _Dest = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Out), _Dest);
_Advance_bytes(_Out, _Remove_tables_8_sse._Size[_Bingo]);
const __m128i _Vec = _mm_shuffle_epi8(_Src, _Shuf);
_mm_storeu_si128(reinterpret_cast<__m128i*>(_Dest), _Vec);
_Advance_bytes(_Dest, _Remove_tables_8_sse._Size[_Bingo]);
_Advance_bytes(_First, 16);
} while (_First != _Stop);
}
#endif // !defined(_M_ARM64EC)

return _Remove_fallback(_First, _Last, _Out, _Val);
return _Remove_fallback(_First, _Last, _Dest, _Val);
}

// Vectorized remove_copy for 4-byte elements: copies [_First, _Last) to _Dest, skipping
// elements equal to _Val; returns one past the last element written.
void* __stdcall __std_remove_copy_4(const void* _First, const void* _Last, void* _Dest, uint32_t _Val) noexcept {
#ifndef _M_ARM64EC
    if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) {
        const __m256i _Match = _mm256_set1_epi32(_Val);

        // Process full 32-byte vectors; the remainder goes to the scalar fallback below.
        const void* _Stop = _First;
        _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F});
        do {
            const __m256i _Src = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
            const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match);
            // _Bingo: one bit per 32-bit lane, set where the lane equals _Val.
            const uint32_t _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask));
            // Table-driven shuffle gathers the kept (non-matching) lanes to the front of the vector.
            const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_4_avx._Shuf[_Bingo]));
            const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf);
            // _Left: number of bytes kept from this vector.
            const size_t _Left = _Remove_tables_4_avx._Size[_Bingo];
            // Masked store: unlike in-place remove, the destination may only have room for the
            // kept elements, so don't write a full 32 bytes. _Left >> 2 is the kept lane count.
            _mm256_maskstore_epi32(reinterpret_cast<int*>(_Dest), _Avx2_tail_mask_32(_Left >> 2), _Vec);
            _Advance_bytes(_Dest, _Left);
            _Advance_bytes(_First, 32);
        } while (_First != _Stop);

        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
    }
#endif // !defined(_M_ARM64EC)

    return _Remove_fallback(_First, _Last, _Dest, _Val);
}

// Vectorized remove_copy for 8-byte elements: copies [_First, _Last) to _Dest, skipping
// elements equal to _Val; returns one past the last element written.
void* __stdcall __std_remove_copy_8(const void* _First, const void* _Last, void* _Dest, uint64_t _Val) noexcept {
#ifndef _M_ARM64EC
    if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes >= 32) {
        const __m256i _Match = _mm256_set1_epi64x(_Val);

        // Process full 32-byte vectors; the remainder goes to the scalar fallback below.
        const void* _Stop = _First;
        _Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F});
        do {
            const __m256i _Src = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
            const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match);
            // _Bingo: one bit per 64-bit lane, set where the lane equals _Val.
            const uint32_t _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask));
            // Table-driven shuffle (in 32-bit lane pairs) gathers the kept lanes to the front.
            const __m256i _Shuf = _mm256_cvtepu8_epi32(_mm_loadu_si64(_Remove_tables_8_avx._Shuf[_Bingo]));
            const __m256i _Vec = _mm256_permutevar8x32_epi32(_Src, _Shuf);
            // _Left: number of bytes kept from this vector (a multiple of 8 here, one per kept element).
            const size_t _Left = _Remove_tables_8_avx._Size[_Bingo];
            // Masked store so we never write past the kept elements. The mask is built in 32-bit
            // lanes (_Left >> 2 of them); as _Left is a multiple of 8, each kept 64-bit element has
            // both halves set, so its sign bit is set as maskstore_epi64 requires.
            _mm256_maskstore_epi64(reinterpret_cast<long long*>(_Dest), _Avx2_tail_mask_32(_Left >> 2), _Vec);
            _Advance_bytes(_Dest, _Left);
            _Advance_bytes(_First, 32);
        } while (_First != _Stop);

        _mm256_zeroupper(); // TRANSITION, DevCom-10331414
    }
#endif // !defined(_M_ARM64EC)

    return _Remove_fallback(_First, _Last, _Dest, _Val);
}

} // extern "C"
Expand Down
Loading