diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index fb20d8d71c3..d38dfc06325 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -4899,215 +4899,499 @@ __declspec(noalias) size_t __stdcall __std_find_last_not_of_trivial_pos_2(const namespace { namespace _Find_seq { #ifdef _M_ARM64EC - using _Find_seq_traits_1 = void; - using _Find_seq_traits_2 = void; - using _Find_seq_traits_4 = void; - using _Find_seq_traits_8 = void; + using _Find_seq_traits_avx_1 = void; + using _Find_seq_traits_avx_2 = void; + using _Find_seq_traits_avx_4 = void; + using _Find_seq_traits_avx_8 = void; + using _Find_seq_traits_sse_4 = void; + using _Find_seq_traits_sse_8 = void; #else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv - struct _Find_seq_traits_1 { - static __m256i _Broadcast_avx(const __m128i _Data) noexcept { - return _mm256_broadcastb_epi8(_Data); + struct _Find_seq_traits_avx { + using _Guard = _Zeroupper_on_exit; + + static constexpr size_t _Vec_size = 32; + + static __m256i _Mask(const size_t _Count_in_bytes) noexcept { + return _Avx2_tail_mask_32(_Count_in_bytes); + } + + static __m256i _Load(const void* const _Src) noexcept { + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_Src)); + } + + static __m256i _Xor(const __m256i _Val1, const __m256i _Val2) noexcept { + return _mm256_xor_si256(_Val1, _Val2); + } + + static bool _TestZ(const __m256i _Val1, const __m256i _Val2) noexcept { + return _mm256_testz_si256(_Val1, _Val2); + } + + static unsigned int _Bsf(const unsigned long _Mask) noexcept { + return _tzcnt_u32(_Mask); + } + + static unsigned int _Bsr(const unsigned long _Mask) noexcept { + return 31 - _lzcnt_u32(_Mask); + } + }; + + struct _Find_seq_traits_avx_1_2 : _Find_seq_traits_avx { + static __m256i _Load_tail( + const void* const _Src, const size_t _Size_bytes, __m256i = _mm256_undefined_si256()) noexcept { + unsigned char _Tmp[32]; + memcpy(_Tmp, _Src, _Size_bytes); + return 
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(_Tmp)); + } + }; + + struct _Find_seq_traits_avx_4_8 : _Find_seq_traits_avx { + static __m256i _Load_tail(const void* const _Src, size_t, const __m256i _Mask) noexcept { + return _mm256_maskload_epi32(reinterpret_cast<const int*>(_Src), _Mask); } - static unsigned long _Cmp_avx(const __m256i _Lhs, const __m256i _Rhs) noexcept { + static __m256i _Load_tail(const void* const _Src, const size_t _Size_bytes) noexcept { + const __m256i _Mask = _Avx2_tail_mask_32(_Size_bytes); + return _mm256_maskload_epi32(reinterpret_cast<const int*>(_Src), _Mask); + } + }; + + struct _Find_seq_traits_avx_1 : _Find_seq_traits_avx_1_2 { + static __m256i _Broadcast(const __m256i _Data) noexcept { + return _mm256_broadcastb_epi8(_mm256_castsi256_si128(_Data)); + } + + static unsigned long _Cmp(const __m256i _Lhs, const __m256i _Rhs) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi8(_Lhs, _Rhs)); } }; - struct _Find_seq_traits_2 { - static __m256i _Broadcast_avx(const __m128i _Data) noexcept { - return _mm256_broadcastw_epi16(_Data); + struct _Find_seq_traits_avx_2 : _Find_seq_traits_avx_1_2 { + static __m256i _Broadcast(const __m256i _Data) noexcept { + return _mm256_broadcastw_epi16(_mm256_castsi256_si128(_Data)); } - static unsigned long _Cmp_avx(const __m256i _Lhs, const __m256i _Rhs) noexcept { + static unsigned long _Cmp(const __m256i _Lhs, const __m256i _Rhs) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi16(_Lhs, _Rhs)) & 0x55555555; } }; - struct _Find_seq_traits_4 { - static __m256i _Broadcast_avx(const __m128i _Data) noexcept { - return _mm256_broadcastd_epi32(_Data); + struct _Find_seq_traits_avx_4 : _Find_seq_traits_avx_4_8 { + static __m256i _Broadcast(const __m256i _Data) noexcept { + return _mm256_broadcastd_epi32(_mm256_castsi256_si128(_Data)); } - static unsigned long _Cmp_avx(const __m256i _Lhs, const __m256i _Rhs) noexcept { + static unsigned long _Cmp(const __m256i _Lhs, const __m256i _Rhs) noexcept { + return 
_mm256_movemask_epi8(_mm256_cmpeq_epi32(_Lhs, _Rhs)) & 0x11111111; } }; - struct _Find_seq_traits_8 { - static __m256i _Broadcast_avx(const __m128i _Data) noexcept { - return _mm256_broadcastq_epi64(_Data); + struct _Find_seq_traits_avx_8 : _Find_seq_traits_avx_4_8 { + static __m256i _Broadcast(const __m256i _Data) noexcept { + return _mm256_broadcastq_epi64(_mm256_castsi256_si128(_Data)); } - static unsigned long _Cmp_avx(const __m256i _Lhs, const __m256i _Rhs) noexcept { + static unsigned long _Cmp(const __m256i _Lhs, const __m256i _Rhs) noexcept { return _mm256_movemask_epi8(_mm256_cmpeq_epi64(_Lhs, _Rhs)) & 0x01010101; } }; - template <class _Ty> - __m256i _Avx2_load_tail(const void* const _Src, const size_t _Size_bytes, const __m256i _Mask) noexcept { - if constexpr (sizeof(_Ty) >= 4) { - return _mm256_maskload_epi32(reinterpret_cast<const int*>(_Src), _Mask); - } else { - unsigned char _Tmp[32]; - memcpy(_Tmp, _Src, _Size_bytes); - return _mm256_loadu_si256(reinterpret_cast<__m256i*>(_Tmp)); + struct _Find_seq_traits_sse_4_8 { + using _Guard = char; + + static constexpr size_t _Vec_size = 16; + + static __m128i _Mask(const size_t _Count_in_bytes) noexcept { + // _Count_in_bytes must be within [0, 16]. 
+ static constexpr unsigned int _Tail_masks[8] = {~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0}; + return _mm_loadu_si128(reinterpret_cast<const __m128i*>( + reinterpret_cast<const unsigned char*>(_Tail_masks) + (16 - _Count_in_bytes))); } - } - template <class _Ty> - __m256i _Avx2_load_tail(const void* const _Src, const size_t _Size_bytes) noexcept { - if constexpr (sizeof(_Ty) >= 4) { - const __m256i _Mask = _Avx2_tail_mask_32(_Size_bytes); - return _mm256_maskload_epi32(reinterpret_cast<const int*>(_Src), _Mask); - } else { - unsigned char _Tmp[32]; + static __m128i _Load(const void* const _Src) noexcept { + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Src)); + } + + static __m128i _Xor(const __m128i _Val1, const __m128i _Val2) noexcept { + return _mm_xor_si128(_Val1, _Val2); + } + + static bool _TestZ(const __m128i _Val1, const __m128i _Val2) noexcept { + return _mm_testz_si128(_Val1, _Val2); + } + + static __m128i _Load_tail( + const void* const _Src, const size_t _Size_bytes, __m128i = _mm_undefined_si128()) noexcept { + unsigned char _Tmp[16]; memcpy(_Tmp, _Src, _Size_bytes); - return _mm256_loadu_si256(reinterpret_cast<__m256i*>(_Tmp)); + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Tmp)); } - } -#endif // ^^^ !defined(_M_ARM64EC) ^^^ - template <class _FindTraits, class _Traits, class _Ty> - const void* __stdcall _Search_impl( - const void* _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - if (_Count2 == 0) { - return _First1; + static unsigned int _Bsf(const unsigned long _Mask) noexcept { + unsigned long _Index; + // CodeQL [SM02313] _Index is always initialized: we checked _Mask != 0 on every call site + _BitScanForward(&_Index, _Mask); + return _Index; } - if (_Count2 == 1) { - return _Finding::_Find_impl<_FindTraits, _Finding::_Predicate::_Equal>( - _First1, _Last1, *static_cast<const _Ty*>(_First2)); + static unsigned int _Bsr(const unsigned long _Mask) noexcept { + unsigned long _Index; + // CodeQL [SM02313] _Index is always initialized: we checked _Mask != 0 on every call site + _BitScanReverse(&_Index, _Mask); + return _Index; } + }; 
const size_t _Size_bytes_1 = _Byte_length(_First1, _Last1); - const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); + struct _Find_seq_traits_sse_4 : _Find_seq_traits_sse_4_8 { + static __m128i _Broadcast(const __m128i _Data) noexcept { + return _mm_shuffle_epi32(_Data, _MM_SHUFFLE(0, 0, 0, 0)); + } - if (_Size_bytes_1 < _Size_bytes_2) { - return _Last1; + static unsigned long _Cmp(const __m128i _Lhs, const __m128i _Rhs) noexcept { + return _mm_movemask_epi8(_mm_cmpeq_epi32(_Lhs, _Rhs)) & 0x1111; } + }; -#ifndef _M_ARM64EC - if (_Use_avx2() && _Size_bytes_1 >= 32) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 + struct _Find_seq_traits_sse_8 : _Find_seq_traits_sse_4_8 { + static __m128i _Broadcast(const __m128i _Data) noexcept { + return _mm_shuffle_epi32(_Data, _MM_SHUFFLE(1, 0, 1, 0)); + } - if (_Size_bytes_2 <= 32) { - const __m256i _Mask2 = _Avx2_tail_mask_32(_Size_bytes_2); - const __m256i _Data2 = _Avx2_load_tail<_Ty>(_First2, _Size_bytes_2, _Mask2); - const __m256i _Start2 = _Traits::_Broadcast_avx(_mm256_castsi256_si128(_Data2)); + static unsigned long _Cmp(const __m128i _Lhs, const __m128i _Rhs) noexcept { + return _mm_movemask_epi8(_mm_cmpeq_epi64(_Lhs, _Rhs)) & 0x0101; + } + }; - const void* _Stop1 = _First1; - _Advance_bytes(_Stop1, _Size_bytes_1 & ~size_t{0x1F}); - do { - const __m256i _Data1 = _mm256_loadu_si256(static_cast(_First1)); - unsigned long _Bingo = _Traits::_Cmp_avx(_Data1, _Start2); - - while (_Bingo != 0) { - const unsigned int _Pos = _tzcnt_u32(_Bingo); - - const void* _Match = _First1; - _Advance_bytes(_Match, _Pos); - - __m256i _Cmp; - if (const size_t _Left_match = _Byte_length(_Match, _Last1); _Left_match >= 32) { - const __m256i _Match_val = _mm256_loadu_si256(reinterpret_cast(_Match)); - _Cmp = _mm256_xor_si256(_Data2, _Match_val); - } else if (_Left_match >= _Size_bytes_2) { - const __m256i _Match_val = _Avx2_load_tail<_Ty>(_Match, _Left_match); - _Cmp = _mm256_xor_si256(_Data2, _Match_val); - } else { - break; - } + 
template + const void* _Search_cmpeq(const void* _First1, const void* const _Last1, const void* const _First2, + const size_t _Size_bytes_2) noexcept { + [[maybe_unused]] typename _Traits::_Guard _Guard; // TRANSITION, DevCom-10331414 + const size_t _Size_bytes_1 = _Byte_length(_First1, _Last1); + constexpr size_t _Vec_size = _Traits::_Vec_size; + constexpr size_t _Vec_mask = _Vec_size - 1; - if (_mm256_testz_si256(_Cmp, _Mask2)) { - return _Match; - } + if (_Size_bytes_2 <= _Vec_size) { + const auto _Mask2 = _Traits::_Mask(_Size_bytes_2); + const auto _Data2 = _Traits::_Load_tail(_First2, _Size_bytes_2, _Mask2); + const auto _Start2 = _Traits::_Broadcast(_Data2); - _Bingo ^= 1 << _Pos; + const void* _Stop1 = _First1; + _Advance_bytes(_Stop1, _Size_bytes_1 & ~_Vec_mask); + do { + const auto _Data1 = _Traits::_Load(_First1); + unsigned long _Bingo = _Traits::_Cmp(_Data1, _Start2); + + while (_Bingo != 0) { + const unsigned int _Pos = _Traits::_Bsf(_Bingo); + + const void* _Match = _First1; + _Advance_bytes(_Match, _Pos); + + decltype(_Traits::_Load(_Match)) _Cmp; + if (const size_t _Left_match = _Byte_length(_Match, _Last1); _Left_match >= _Vec_size) { + const auto _Match_val = _Traits::_Load(_Match); + _Cmp = _Traits::_Xor(_Data2, _Match_val); + } else if (_Left_match >= _Size_bytes_2) { + const auto _Match_val = _Traits::_Load_tail(_Match, _Left_match); + _Cmp = _Traits::_Xor(_Data2, _Match_val); + } else { + break; } - _Advance_bytes(_First1, 32); + if (_Traits::_TestZ(_Cmp, _Mask2)) { + return _Match; + } - } while (_First1 != _Stop1); + _Bingo ^= 1 << _Pos; + } - if (const size_t _Left1 = _Byte_length(_First1, _Last1); _Left1 >= _Size_bytes_2) { - const __m256i _Data1 = _Avx2_load_tail<_Ty>(_First1, _Left1); - unsigned long _Bingo = _Traits::_Cmp_avx(_Data1, _Start2); + _Advance_bytes(_First1, _Vec_size); - while (_Bingo != 0) { - const unsigned int _Pos = _tzcnt_u32(_Bingo); + } while (_First1 != _Stop1); - if (_Pos > _Left1 - _Size_bytes_2) { - break; - } + 
if (const size_t _Left1 = _Byte_length(_First1, _Last1); _Left1 >= _Size_bytes_2) { + const auto _Data1 = _Traits::_Load_tail(_First1, _Left1); + unsigned long _Bingo = _Traits::_Cmp(_Data1, _Start2); + + while (_Bingo != 0) { + const unsigned int _Pos = _Traits::_Bsf(_Bingo); + + if (_Pos > _Left1 - _Size_bytes_2) { + break; + } + + const void* _Match = _First1; + _Advance_bytes(_Match, _Pos); + + const size_t _Left_match = _Byte_length(_Match, _Last1); + const auto _Match_val = _Traits::_Load_tail(_Match, _Left_match); + const auto _Cmp = _Traits::_Xor(_Data2, _Match_val); + + if (_Traits::_TestZ(_Cmp, _Mask2)) { + return _Match; + } + + _Bingo ^= 1 << _Pos; + } + } + + return _Last1; + } else { // _Size_bytes_2 is greater than _Vec_size bytes + const auto _Data2 = _Traits::_Load(_First2); + const auto _Start2 = _Traits::_Broadcast(_Data2); + + const size_t _Max_pos = _Size_bytes_1 - _Size_bytes_2; + + const void* _Stop1 = _First1; + _Advance_bytes(_Stop1, _Max_pos); + + const void* _Tail2 = _First2; + _Advance_bytes(_Tail2, _Vec_size); + + do { + const auto _Data1 = _Traits::_Load(_First1); + unsigned long _Bingo = _Traits::_Cmp(_Data1, _Start2); + + while (_Bingo != 0) { + const unsigned int _Pos = _Traits::_Bsf(_Bingo); + + const void* _Match = _First1; + _Advance_bytes(_Match, _Pos); + + if (_Match > _Stop1) { + break; // Oops, doesn't fit + } - const void* _Match = _First1; - _Advance_bytes(_Match, _Pos); + const auto _Match_val = _Traits::_Load(_Match); + const auto _Cmp = _Traits::_Xor(_Data2, _Match_val); - const size_t _Left_match = _Byte_length(_Match, _Last1); - const __m256i _Match_val = _Avx2_load_tail<_Ty>(_Match, _Left_match); - const __m256i _Cmp = _mm256_xor_si256(_Data2, _Match_val); + if (_Traits::_TestZ(_Cmp, _Cmp)) { + const void* _Tail1 = _Match; + _Advance_bytes(_Tail1, _Vec_size); - if (_mm256_testz_si256(_Cmp, _Mask2)) { + if (memcmp(_Tail1, _Tail2, _Size_bytes_2 - _Vec_size) == 0) { return _Match; } + } + + _Bingo ^= 1 << _Pos; + } + + 
_Advance_bytes(_First1, _Vec_size); + + } while (_First1 <= _Stop1); - _Bingo ^= 1 << _Pos; + return _Last1; + } + } + + template + const void* _Find_end_cmpeq(const void* const _First1, const void* const _Last1, const void* const _First2, + const size_t _Size_bytes_2) noexcept { + [[maybe_unused]] typename _Traits::_Guard _Guard; // TRANSITION, DevCom-10331414 + const size_t _Size_bytes_1 = _Byte_length(_First1, _Last1); + constexpr size_t _Vec_size = _Traits::_Vec_size; + constexpr size_t _Vec_mask = _Vec_size - 1; + + if (_Size_bytes_2 <= _Vec_size) { + const unsigned int _Needle_fit_mask = (1 << (_Vec_size - _Size_bytes_2 + sizeof(_Ty))) - 1; + + const void* _Stop1 = _First1; + _Advance_bytes(_Stop1, _Size_bytes_1 & _Vec_mask); + + const auto _Mask2 = _Traits::_Mask(_Size_bytes_2); + const auto _Data2 = _Traits::_Load_tail(_First2, _Size_bytes_2, _Mask2); + const auto _Start2 = _Traits::_Broadcast(_Data2); + + const void* _Mid1 = _Last1; + _Rewind_bytes(_Mid1, _Vec_size); + +#pragma warning(push) +#pragma warning(disable : 4324) // structure was padded due to alignment specifier + const auto _Check_first = [=, &_Mid1](long _Match) noexcept { + while (_Match != 0) { + const unsigned int _Pos = _Traits::_Bsr(_Match); + + const void* _Tmp1 = _Mid1; + _Advance_bytes(_Tmp1, _Pos); + + const auto _Match_data = _Traits::_Load_tail(_Tmp1, _Byte_length(_Tmp1, _Last1)); + const auto _Cmp_result = _Traits::_Xor(_Data2, _Match_data); + + if (_Traits::_TestZ(_Cmp_result, _Mask2)) { + _Mid1 = _Tmp1; + return true; } + + _Match ^= 1 << _Pos; } - return _Last1; - } else { // _Size_bytes_2 is greater than 32 bytes - const __m256i _Data2 = _mm256_loadu_si256(reinterpret_cast(_First2)); - const __m256i _Start2 = _Traits::_Broadcast_avx(_mm256_castsi256_si128(_Data2)); + return false; + }; - const size_t _Max_pos = _Size_bytes_1 - _Size_bytes_2; + const auto _Check = [=, &_Mid1](long _Match) noexcept { + while (_Match != 0) { + const unsigned int _Pos = _Traits::_Bsr(_Match); - 
const void* _Stop1 = _First1; - _Advance_bytes(_Stop1, _Max_pos); + const void* _Tmp1 = _Mid1; + _Advance_bytes(_Tmp1, _Pos); - const void* _Tail2 = _First2; - _Advance_bytes(_Tail2, 32); + const auto _Match_data = _Traits::_Load(_Tmp1); + const auto _Cmp_result = _Traits::_Xor(_Data2, _Match_data); - do { - const __m256i _Data1 = _mm256_loadu_si256(static_cast(_First1)); - unsigned long _Bingo = _Traits::_Cmp_avx(_Data1, _Start2); + if (_Traits::_TestZ(_Cmp_result, _Mask2)) { + _Mid1 = _Tmp1; + return true; + } - while (_Bingo != 0) { - const unsigned int _Pos = _tzcnt_u32(_Bingo); + _Match ^= 1 << _Pos; + } - const void* _Match = _First1; - _Advance_bytes(_Match, _Pos); + return false; + }; +#pragma warning(pop) - if (_Match > _Stop1) { - break; // Oops, doesn't fit - } + // The very last part, for any match needle should fit, otherwise false match + const auto _Data1_last = _Traits::_Load(_Mid1); + const unsigned long _Match_last_val = _Traits::_Cmp(_Data1_last, _Start2); + if (_Check_first(_Match_last_val & _Needle_fit_mask)) { + return _Mid1; + } + + // The middle part, fit and unfit needle + while (_Mid1 != _Stop1) { + _Rewind_bytes(_Mid1, _Vec_size); + const auto _Data1 = _Traits::_Load(_Mid1); + const unsigned long _Match_val = _Traits::_Cmp(_Data1, _Start2); + if (_Check(_Match_val)) { + return _Mid1; + } + } + + // The first part, fit and unfit needle, mask out already processed positions + if (const size_t _Tail_bytes_1 = _Size_bytes_1 & _Vec_mask; _Tail_bytes_1 != 0) { + _Mid1 = _First1; + const auto _Data1 = _Traits::_Load(_Mid1); + const unsigned long _Match_val = _Traits::_Cmp(_Data1, _Start2); + if (_Match_val != 0 && _Check(_Match_val & ((1 << _Tail_bytes_1) - 1))) { + return _Mid1; + } + } - const __m256i _Match_val = _mm256_loadu_si256(reinterpret_cast(_Match)); - const __m256i _Cmp = _mm256_xor_si256(_Data2, _Match_val); + return _Last1; + } else { // _Size_bytes_2 is greater than _Vec_size bytes + const auto _Data2 = _Traits::_Load(_First2); + 
const auto _Start2 = _Traits::_Broadcast(_Data2); - if (_mm256_testz_si256(_Cmp, _Cmp)) { - const void* _Tail1 = _Match; - _Advance_bytes(_Tail1, 32); + const void* _Tail2 = _First2; + _Advance_bytes(_Tail2, _Vec_size); - if (memcmp(_Tail1, _Tail2, _Size_bytes_2 - 32) == 0) { - return _Match; - } - } + const void* _Mid1 = _Last1; + _Rewind_bytes(_Mid1, _Size_bytes_2); - _Bingo ^= 1 << _Pos; + const size_t _Size_diff_bytes = _Size_bytes_1 - _Size_bytes_2; + const void* _Stop1 = _First1; + _Advance_bytes(_Stop1, _Size_diff_bytes & _Vec_mask); + +#pragma warning(push) +#pragma warning(disable : 4324) // structure was padded due to alignment specifier + const auto _Check = [=, &_Mid1](long _Match) noexcept { + while (_Match != 0) { + const unsigned int _Pos = _Traits::_Bsr(_Match); + + const void* _Tmp1 = _Mid1; + _Advance_bytes(_Tmp1, _Pos); + + const auto _Match_data = _Traits::_Load(_Tmp1); + const auto _Cmp_result = _Traits::_Xor(_Data2, _Match_data); + + if (_Traits::_TestZ(_Cmp_result, _Cmp_result)) { + const void* _Tail1 = _Tmp1; + _Advance_bytes(_Tail1, _Vec_size); + + if (memcmp(_Tail1, _Tail2, _Size_bytes_2 - _Vec_size) == 0) { + _Mid1 = _Tmp1; + return true; + } } - _Advance_bytes(_First1, 32); + _Match ^= 1 << _Pos; + } + + return false; + }; +#pragma warning(pop) + // The very last part, just compare, as true match must start with first symbol + const auto _Data1_last = _Traits::_Load(_Mid1); + const auto _Match_last = _Traits::_Xor(_Data2, _Data1_last); - } while (_First1 <= _Stop1); + if (_Traits::_TestZ(_Match_last, _Match_last)) { + // Matched _Vec_size bytes, check the rest + const void* _Tail1 = _Mid1; + _Advance_bytes(_Tail1, _Vec_size); - return _Last1; + if (memcmp(_Tail1, _Tail2, _Size_bytes_2 - _Vec_size) == 0) { + return _Mid1; + } } + + // The main part, match all characters + while (_Mid1 != _Stop1) { + _Rewind_bytes(_Mid1, _Vec_size); + + const auto _Data1 = _Traits::_Load(_Mid1); + const unsigned long _Match_val = _Traits::_Cmp(_Data1, 
_Start2); + if (_Check(_Match_val)) { + return _Mid1; + } + } + + // The first part, mask out already processed positions + if (const size_t _Tail_bytes_1 = _Size_diff_bytes & _Vec_mask; _Tail_bytes_1 != 0) { + _Mid1 = _First1; + const auto _Data1 = _Traits::_Load(_Mid1); + const unsigned long _Match_val = _Traits::_Cmp(_Data1, _Start2); + if (_Match_val != 0 && _Check(_Match_val & ((1 << _Tail_bytes_1) - 1))) { + return _Mid1; + } + } + + return _Last1; + } + } +#endif // ^^^ !defined(_M_ARM64EC) ^^^ + + template + const void* __stdcall _Search_impl( + const void* _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { + if (_Count2 == 0) { + return _First1; + } + + if (_Count2 == 1) { + return _Finding::_Find_impl<_FindTraits, _Finding::_Predicate::_Equal>( + _First1, _Last1, *static_cast(_First2)); + } + + const size_t _Size_bytes_1 = _Byte_length(_First1, _Last1); + const size_t _Size_bytes_2 = _Count2 * sizeof(_Ty); + + if (_Size_bytes_1 < _Size_bytes_2) { + return _Last1; } - if constexpr (sizeof(_Ty) <= 2) { - if (_Use_sse42() && _Size_bytes_1 >= 16) { +#ifndef _M_ARM64EC + // The AVX2 path for 8-bit elements is not necessarily more efficient than the SSE4.2 cmpestri path + if constexpr (sizeof(_Ty) != 1) { + if (_Use_avx2() && _Size_bytes_1 >= 32) { + return _Search_cmpeq<_Traits_avx, _Ty>(_First1, _Last1, _First2, _Size_bytes_2); + } + } + + if (_Use_sse42() && _Size_bytes_1 >= 16) { + if constexpr (sizeof(_Ty) >= 4) { + return _Search_cmpeq<_Traits_sse, _Ty>(_First1, _Last1, _First2, _Size_bytes_2); + } else { constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ORDERED; constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 
16 : 8; @@ -5240,7 +5524,7 @@ namespace { return _Last1; } - template + template const void* __stdcall _Find_end_impl(const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { if (_Count2 == 0) { @@ -5261,176 +5545,13 @@ namespace { #ifndef _M_ARM64EC if (_Use_avx2() && _Size_bytes_1 >= 32) { - _Zeroupper_on_exit _Guard; // TRANSITION, DevCom-10331414 - - if (_Size_bytes_2 <= 32) { - const unsigned int _Needle_fit_mask = (1 << (32 - _Size_bytes_2 + sizeof(_Ty))) - 1; - - const void* _Stop1 = _First1; - _Advance_bytes(_Stop1, _Size_bytes_1 & 0x1F); - - const __m256i _Mask2 = _Avx2_tail_mask_32(_Size_bytes_2); - const __m256i _Data2 = _Avx2_load_tail<_Ty>(_First2, _Size_bytes_2, _Mask2); - const __m256i _Start2 = _Traits::_Broadcast_avx(_mm256_castsi256_si128(_Data2)); - - const void* _Mid1 = _Last1; - _Rewind_bytes(_Mid1, 32); - -#pragma warning(push) -#pragma warning(disable : 4324) // structure was padded due to alignment specifier - const auto _Check_first = [=, &_Mid1](long _Match) noexcept { - while (_Match != 0) { - const unsigned int _Pos = 31 - _lzcnt_u32(_Match); - - const void* _Tmp1 = _Mid1; - _Advance_bytes(_Tmp1, _Pos); - - const __m256i _Match_data = _Avx2_load_tail<_Ty>(_Tmp1, _Byte_length(_Tmp1, _Last1)); - const __m256i _Cmp_result = _mm256_xor_si256(_Data2, _Match_data); - - if (_mm256_testz_si256(_Cmp_result, _Mask2)) { - _Mid1 = _Tmp1; - return true; - } - - _Match ^= 1 << _Pos; - } - - return false; - }; - - const auto _Check = [=, &_Mid1](long _Match) noexcept { - while (_Match != 0) { - const unsigned int _Pos = 31 - _lzcnt_u32(_Match); - - const void* _Tmp1 = _Mid1; - _Advance_bytes(_Tmp1, _Pos); - - const __m256i _Match_data = _mm256_loadu_si256(reinterpret_cast(_Tmp1)); - const __m256i _Cmp_result = _mm256_xor_si256(_Data2, _Match_data); - - if (_mm256_testz_si256(_Cmp_result, _Mask2)) { - _Mid1 = _Tmp1; - return true; - } - - _Match ^= 1 << _Pos; - } - - return false; - }; -#pragma 
warning(pop) - - // The very last part, for any match needle should fit, otherwise false match - const __m256i _Data1_last = _mm256_loadu_si256(reinterpret_cast(_Mid1)); - const unsigned long _Match_last_val = _Traits::_Cmp_avx(_Data1_last, _Start2); - if (_Check_first(_Match_last_val & _Needle_fit_mask)) { - return _Mid1; - } - - // The middle part, fit and unfit needle - while (_Mid1 != _Stop1) { - _Rewind_bytes(_Mid1, 32); - const __m256i _Data1 = _mm256_loadu_si256(reinterpret_cast(_Mid1)); - const unsigned long _Match_val = _Traits::_Cmp_avx(_Data1, _Start2); - if (_Check(_Match_val)) { - return _Mid1; - } - } - - // The first part, fit and unfit needle, mask out already processed positions - if (const size_t _Tail_bytes_1 = _Size_bytes_1 & 0x1F; _Tail_bytes_1 != 0) { - _Mid1 = _First1; - const __m256i _Data1 = _mm256_loadu_si256(reinterpret_cast(_Mid1)); - const unsigned long _Match_val = _Traits::_Cmp_avx(_Data1, _Start2); - if (_Match_val != 0 && _Check(_Match_val & ((1 << _Tail_bytes_1) - 1))) { - return _Mid1; - } - } - - return _Last1; - } else { // _Size_bytes_2 is greater than 32 bytes - const __m256i _Data2 = _mm256_loadu_si256(reinterpret_cast(_First2)); - const __m256i _Start2 = _Traits::_Broadcast_avx(_mm256_castsi256_si128(_Data2)); - - const void* _Tail2 = _First2; - _Advance_bytes(_Tail2, 32); - - const void* _Mid1 = _Last1; - _Rewind_bytes(_Mid1, _Size_bytes_2); - - const size_t _Size_diff_bytes = _Size_bytes_1 - _Size_bytes_2; - const void* _Stop1 = _First1; - _Advance_bytes(_Stop1, _Size_diff_bytes & 0x1F); - -#pragma warning(push) -#pragma warning(disable : 4324) // structure was padded due to alignment specifier - const auto _Check = [=, &_Mid1](long _Match) noexcept { - while (_Match != 0) { - const unsigned int _Pos = 31 - _lzcnt_u32(_Match); - - const void* _Tmp1 = _Mid1; - _Advance_bytes(_Tmp1, _Pos); - - const __m256i _Match_data = _mm256_loadu_si256(reinterpret_cast(_Tmp1)); - const __m256i _Cmp_result = _mm256_xor_si256(_Data2, 
_Match_data); - - if (_mm256_testz_si256(_Cmp_result, _Cmp_result)) { - const void* _Tail1 = _Tmp1; - _Advance_bytes(_Tail1, 32); - - if (memcmp(_Tail1, _Tail2, _Size_bytes_2 - 32) == 0) { - _Mid1 = _Tmp1; - return true; - } - } - - _Match ^= 1 << _Pos; - } - - return false; - }; -#pragma warning(pop) - // The very last part, just compare, as true match must start with first symbol - const __m256i _Data1_last = _mm256_loadu_si256(reinterpret_cast(_Mid1)); - const __m256i _Match_last = _mm256_xor_si256(_Data2, _Data1_last); - if (_mm256_testz_si256(_Match_last, _Match_last)) { - // Matched 32 bytes, check the rest - const void* _Tail1 = _Mid1; - _Advance_bytes(_Tail1, 32); - - if (memcmp(_Tail1, _Tail2, _Size_bytes_2 - 32) == 0) { - return _Mid1; - } - } - - // The main part, match all characters - while (_Mid1 != _Stop1) { - _Rewind_bytes(_Mid1, 32); - - const __m256i _Data1 = _mm256_loadu_si256(reinterpret_cast(_Mid1)); - const unsigned long _Match_val = _Traits::_Cmp_avx(_Data1, _Start2); - if (_Check(_Match_val)) { - return _Mid1; - } - } - - // The first part, mask out already processed positions - if (const size_t _Tail_bytes_1 = _Size_diff_bytes & 0x1F; _Tail_bytes_1 != 0) { - _Mid1 = _First1; - const __m256i _Data1 = _mm256_loadu_si256(reinterpret_cast(_Mid1)); - const unsigned long _Match_val = _Traits::_Cmp_avx(_Data1, _Start2); - if (_Match_val != 0 && _Check(_Match_val & ((1 << _Tail_bytes_1) - 1))) { - return _Mid1; - } - } - - return _Last1; - } + return _Find_end_cmpeq<_Traits_avx, _Ty>(_First1, _Last1, _First2, _Size_bytes_2); } - if constexpr (sizeof(_Ty) <= 2) { - if (_Use_sse42() && _Size_bytes_1 >= 16) { + if (_Use_sse42() && _Size_bytes_1 >= 16) { + if constexpr (sizeof(_Ty) >= 4) { + return _Find_end_cmpeq<_Traits_sse, _Ty>(_First1, _Last1, _First2, _Size_bytes_2); + } else { constexpr int _Op = (sizeof(_Ty) == 1 ? _SIDD_UBYTE_OPS : _SIDD_UWORD_OPS) | _SIDD_CMP_EQUAL_ORDERED; constexpr int _Part_size_el = sizeof(_Ty) == 1 ? 
16 : 8; @@ -5666,51 +5787,50 @@ extern "C" { const void* __stdcall __std_search_1( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return _Find_seq::_Search_impl<_Finding::_Find_traits_1, _Find_seq::_Find_seq_traits_1, uint8_t>( - _First1, _Last1, _First2, _Count2); + return _Find_seq::_Search_impl<_Finding::_Find_traits_1, void, void, uint8_t>(_First1, _Last1, _First2, _Count2); } const void* __stdcall __std_search_2( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return _Find_seq::_Search_impl<_Finding::_Find_traits_2, _Find_seq::_Find_seq_traits_2, uint16_t>( + return _Find_seq::_Search_impl<_Finding::_Find_traits_2, _Find_seq::_Find_seq_traits_avx_2, void, uint16_t>( _First1, _Last1, _First2, _Count2); } const void* __stdcall __std_search_4( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return _Find_seq::_Search_impl<_Finding::_Find_traits_4, _Find_seq::_Find_seq_traits_4, uint32_t>( - _First1, _Last1, _First2, _Count2); + return _Find_seq::_Search_impl<_Finding::_Find_traits_4, _Find_seq::_Find_seq_traits_avx_4, + _Find_seq::_Find_seq_traits_sse_4, uint32_t>(_First1, _Last1, _First2, _Count2); } const void* __stdcall __std_search_8( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return _Find_seq::_Search_impl<_Finding::_Find_traits_8, _Find_seq::_Find_seq_traits_8, uint64_t>( - _First1, _Last1, _First2, _Count2); + return _Find_seq::_Search_impl<_Finding::_Find_traits_8, _Find_seq::_Find_seq_traits_avx_8, + _Find_seq::_Find_seq_traits_sse_8, uint64_t>(_First1, _Last1, _First2, _Count2); } const void* __stdcall __std_find_end_1( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return 
_Find_seq::_Find_end_impl<_Finding::_Find_traits_1, _Find_seq::_Find_seq_traits_1, uint8_t>( + return _Find_seq::_Find_end_impl<_Finding::_Find_traits_1, _Find_seq::_Find_seq_traits_avx_1, void, uint8_t>( _First1, _Last1, _First2, _Count2); } const void* __stdcall __std_find_end_2( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return _Find_seq::_Find_end_impl<_Finding::_Find_traits_2, _Find_seq::_Find_seq_traits_2, uint16_t>( + return _Find_seq::_Find_end_impl<_Finding::_Find_traits_2, _Find_seq::_Find_seq_traits_avx_2, void, uint16_t>( _First1, _Last1, _First2, _Count2); } const void* __stdcall __std_find_end_4( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return _Find_seq::_Find_end_impl<_Finding::_Find_traits_4, _Find_seq::_Find_seq_traits_4, uint32_t>( - _First1, _Last1, _First2, _Count2); + return _Find_seq::_Find_end_impl<_Finding::_Find_traits_4, _Find_seq::_Find_seq_traits_avx_4, + _Find_seq::_Find_seq_traits_sse_4, uint32_t>(_First1, _Last1, _First2, _Count2); } const void* __stdcall __std_find_end_8( const void* const _First1, const void* const _Last1, const void* const _First2, const size_t _Count2) noexcept { - return _Find_seq::_Find_end_impl<_Finding::_Find_traits_8, _Find_seq::_Find_seq_traits_8, uint64_t>( - _First1, _Last1, _First2, _Count2); + return _Find_seq::_Find_end_impl<_Finding::_Find_traits_8, _Find_seq::_Find_seq_traits_avx_8, + _Find_seq::_Find_seq_traits_sse_8, uint64_t>(_First1, _Last1, _First2, _Count2); } } // extern "C"