diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 9f5ea0f97cf..94bae9604e6 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -17,6 +17,26 @@ _STL_DISABLE_CLANG_WARNINGS #pragma push_macro("new") #undef new +#if _USE_STD_VECTOR_ALGORITHMS +_EXTERN_C +// The "noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms +// won't be stored beyond the lifetime of the function, and that the function will only reference arrays denoted by +// those pointers. The optimizer also assumes in that case that a pointer parameter is not returned to the caller via +// the return value, so functions using "noalias" must usually return void. This attribute is valuable because these +// functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the +// compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to +// unanalyzable routines may modify those arrays. 
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_1(
+    const void* _First, const void* _Last, void* _Dest) noexcept;
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_2(
+    const void* _First, const void* _Last, void* _Dest) noexcept;
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4(
+    const void* _First, const void* _Last, void* _Dest) noexcept;
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
+    const void* _First, const void* _Last, void* _Dest) noexcept;
+_END_EXTERN_C
+#endif // _USE_STD_VECTOR_ALGORITHMS
+
 _STD_BEGIN
 // COMMON SORT PARAMETERS
 _INLINE_VAR constexpr int _ISORT_MAX = 32; // maximum size for insertion sort
@@ -4520,6 +4540,7 @@ namespace ranges {
         constexpr bool _Allow_vectorization =
             conjunction_v<_Is_trivially_swappable<_Elem>, negation<is_volatile<_Elem>>>;
 
+#pragma warning(suppress : 6326) // Potential comparison of a constant with another constant
         if constexpr (_Allow_vectorization && _Nx <= 8 && (_Nx & (_Nx - 1)) == 0) {
             if (!_STD is_constant_evaluated()) {
                 _Elem* const _First_addr = _STD to_address(_First);
@@ -4584,6 +4605,37 @@ _CONSTEXPR20 _OutIt reverse_copy(_BidIt _First, _BidIt _Last, _OutIt _Dest) {
     const auto _UFirst = _Get_unwrapped(_First);
     auto _ULast        = _Get_unwrapped(_Last);
     auto _UDest        = _Get_unwrapped_n(_Dest, _Idl_distance<_BidIt>(_UFirst, _ULast));
+
+#if _HAS_IF_CONSTEXPR && _USE_STD_VECTOR_ALGORITHMS
+    using _Elem     = remove_pointer_t<decltype(_UFirst)>;
+    using _DestElem = remove_pointer_t<decltype(_UDest)>;
+    constexpr bool _Allow_vectorization = conjunction_v<is_same<remove_const_t<_Elem>, remove_const_t<_DestElem>>,
+        is_pointer<decltype(_UFirst)>, is_trivially_copyable<_Elem>, negation<is_volatile<_Elem>>>;
+    constexpr size_t _Nx = sizeof(_Elem);
+
+#pragma warning(suppress : 6326) // Potential comparison of a constant with another constant
+    if constexpr (_Allow_vectorization && _Nx <= 8 && (_Nx & (_Nx - 1)) == 0) {
+#ifdef __cpp_lib_is_constant_evaluated
+        if (!_STD is_constant_evaluated())
+#endif // __cpp_lib_is_constant_evaluated
+        {
+            if constexpr (_Nx == 1) {
+                __std_reverse_copy_trivially_copyable_1(_UFirst, _ULast, _UDest);
+            } else if constexpr (_Nx == 2) {
+                __std_reverse_copy_trivially_copyable_2(_UFirst, _ULast, _UDest);
+            } else if constexpr (_Nx == 4) {
+                __std_reverse_copy_trivially_copyable_4(_UFirst, _ULast, _UDest);
+            } else {
+                __std_reverse_copy_trivially_copyable_8(_UFirst, _ULast, _UDest);
+            }
+
+            _UDest += _ULast - _UFirst;
+            _Seek_wrapped(_Dest, _UDest);
+            return _Dest;
+        }
+    }
+#endif // _HAS_IF_CONSTEXPR && _USE_STD_VECTOR_ALGORITHMS
+
     for (; _UFirst != _ULast; ++_UDest) {
         *_UDest = *--_ULast;
     }
diff --git a/stl/inc/xutility b/stl/inc/xutility
index f80ed07fa86..ff8a7aa2d63 100644
--- a/stl/inc/xutility
+++ b/stl/inc/xutility
@@ -5481,116 +5481,42 @@ _NODISCARD _CONSTEXPR20 bool _Check_match_counts(
 }
 
 // FUNCTION TEMPLATE reverse
-#if _HAS_IF_CONSTEXPR
 template <class _BidIt>
 _CONSTEXPR20 void reverse(const _BidIt _First, const _BidIt _Last) { // reverse elements in [_First, _Last)
     _Adl_verify_range(_First, _Last);
     auto _UFirst = _Get_unwrapped(_First);
     auto _ULast  = _Get_unwrapped(_Last);
-#if _USE_STD_VECTOR_ALGORITHMS
+#if _HAS_IF_CONSTEXPR && _USE_STD_VECTOR_ALGORITHMS
     using _Elem = remove_pointer_t<decltype(_UFirst)>;
     constexpr bool _Allow_vectorization =
         conjunction_v<is_pointer<decltype(_UFirst)>, _Is_trivially_swappable<_Elem>, negation<is_volatile<_Elem>>>;
+    constexpr size_t _Nx = sizeof(_Elem);
 
-    if constexpr (_Allow_vectorization && sizeof(_Elem) == 1) {
-#ifdef __cpp_lib_is_constant_evaluated
-        if (!_STD is_constant_evaluated())
-#endif // __cpp_lib_is_constant_evaluated
-        {
-            __std_reverse_trivially_swappable_1(_UFirst, _ULast);
-            return;
-        }
-    } else if constexpr (_Allow_vectorization && sizeof(_Elem) == 2) {
-#ifdef __cpp_lib_is_constant_evaluated
-        if (!_STD is_constant_evaluated())
-#endif // __cpp_lib_is_constant_evaluated
-        {
-            __std_reverse_trivially_swappable_2(_UFirst, _ULast);
-            return;
-        }
-    } else if constexpr (_Allow_vectorization && sizeof(_Elem) == 4) {
+#pragma warning(suppress : 6326) // Potential comparison of a constant with
another constant
+    if constexpr (_Allow_vectorization && _Nx <= 8 && (_Nx & (_Nx - 1)) == 0) {
 #ifdef __cpp_lib_is_constant_evaluated
         if (!_STD is_constant_evaluated())
 #endif // __cpp_lib_is_constant_evaluated
         {
-            __std_reverse_trivially_swappable_4(_UFirst, _ULast);
-            return;
-        }
-    } else if constexpr (_Allow_vectorization && sizeof(_Elem) == 8) {
-#ifdef __cpp_lib_is_constant_evaluated
-        if (!_STD is_constant_evaluated())
-#endif // __cpp_lib_is_constant_evaluated
-        {
-            __std_reverse_trivially_swappable_8(_UFirst, _ULast);
-            return;
-        }
-    }
+            if constexpr (_Nx == 1) {
+                __std_reverse_trivially_swappable_1(_UFirst, _ULast);
+            } else if constexpr (_Nx == 2) {
+                __std_reverse_trivially_swappable_2(_UFirst, _ULast);
+            } else if constexpr (_Nx == 4) {
+                __std_reverse_trivially_swappable_4(_UFirst, _ULast);
+            } else {
+                __std_reverse_trivially_swappable_8(_UFirst, _ULast);
+            }
+
+            return;
+        }
+    }
-#endif // _USE_STD_VECTOR_ALGORITHMS
+#endif // _HAS_IF_CONSTEXPR && _USE_STD_VECTOR_ALGORITHMS
 
     for (; _UFirst != _ULast && _UFirst != --_ULast; ++_UFirst) {
         _STD iter_swap(_UFirst, _ULast);
     }
 }
-#else // ^^^ _HAS_IF_CONSTEXPR / !_HAS_IF_CONSTEXPR vvv
-template <class _BidIt>
-void _Reverse_unchecked1(_BidIt _First, _BidIt _Last, integral_constant<size_t, 0>) {
-    // reverse elements in [_First, _Last), general bidirectional iterators
-    for (; _First != _Last && _First != --_Last; ++_First) {
-        _STD iter_swap(_First, _Last);
-    }
-}
-
-#if _USE_STD_VECTOR_ALGORITHMS
-template <class _BidIt>
-void _Reverse_unchecked1(const _BidIt _First, const _BidIt _Last, integral_constant<size_t, 1>) {
-    // reverse elements in [_First, _Last), pointers to trivially swappable of size 1
-    __std_reverse_trivially_swappable_1(_First, _Last);
-}
-
-template <class _BidIt>
-void _Reverse_unchecked1(const _BidIt _First, const _BidIt _Last, integral_constant<size_t, 2>) {
-    // reverse elements in [_First, _Last), pointers to trivially swappable of size 2
-    __std_reverse_trivially_swappable_2(_First, _Last);
-}
-
-template <class _BidIt>
-void _Reverse_unchecked1(const _BidIt _First, const _BidIt _Last, integral_constant<size_t, 4>) {
-    // reverse elements in [_First, _Last), pointers to trivially swappable of size 4
-    __std_reverse_trivially_swappable_4(_First, _Last);
-}
-
-template <class _BidIt>
-void _Reverse_unchecked1(const _BidIt _First, const _BidIt _Last, integral_constant<size_t, 8>) {
-    // reverse elements in [_First, _Last), pointers to trivially swappable of size 8
-    __std_reverse_trivially_swappable_8(_First, _Last);
-}
-#endif // _USE_STD_VECTOR_ALGORITHMS
-
-template <class _BidIt>
-void _Reverse_unchecked(const _BidIt _First, const _BidIt _Last) {
-    // reverse elements in [_First, _Last), choose optimization
-#if _USE_STD_VECTOR_ALGORITHMS
-    using _Elem = remove_pointer_t<_BidIt>;
-    constexpr size_t _Opt =
-        is_pointer_v<_BidIt> //
-                && _Is_trivially_swappable_v<_Elem> //
-                && !is_volatile_v<_Elem> //
-                && (sizeof(_Elem) == 1 || sizeof(_Elem) == 2 || sizeof(_Elem) == 4 || sizeof(_Elem) == 8)
-            ? sizeof(_Elem)
-            : 0;
-#else // ^^^ vectorize / no vectorize vvv
-    constexpr size_t _Opt = 0;
-#endif // _USE_STD_VECTOR_ALGORITHMS
-    _Reverse_unchecked1(_First, _Last, integral_constant<size_t, _Opt>{});
-}
-
-template <class _BidIt>
-void reverse(const _BidIt _First, const _BidIt _Last) {
-    // reverse elements in [_First, _Last)
-    _Adl_verify_range(_First, _Last);
-    _Reverse_unchecked(_Get_unwrapped(_First), _Get_unwrapped(_Last));
-}
-#endif // _HAS_IF_CONSTEXPR
 
 #if _HAS_CXX17
 template <class _ExPo, class _BidIt, _Enable_if_execution_policy_t<_ExPo> = 0>
@@ -5694,19 +5620,19 @@ _FwdIt _Rotate_unchecked1(_FwdIt _First, _FwdIt _Mid, _FwdIt _Last, forward_iter
 template <class _BidIt>
 _BidIt _Rotate_unchecked1(_BidIt _First, _BidIt _Mid, _BidIt _Last, bidirectional_iterator_tag) {
     // rotate [_First, _Last) left by distance(_First, _Mid) positions, bidirectional iterators
-    _Reverse_unchecked(_First, _Mid);
-    _Reverse_unchecked(_Mid, _Last);
+    _STD reverse(_First, _Mid);
+    _STD reverse(_Mid, _Last);
     auto _Tmp = _Reverse_until_sentinel_unchecked(_First, _Mid, _Last);
-    _Reverse_unchecked(_Tmp.first, _Tmp.second);
+    _STD reverse(_Tmp.first, _Tmp.second);
     return _Mid != _Tmp.first ?
_Tmp.first : _Tmp.second;
 }
 
 template <class _RanIt>
 _RanIt _Rotate_unchecked1(_RanIt _First, _RanIt _Mid, _RanIt _Last, random_access_iterator_tag) {
     // rotate [_First, _Last) left by distance(_First, _Mid) positions, random-access iterators
-    _Reverse_unchecked(_First, _Mid);
-    _Reverse_unchecked(_Mid, _Last);
-    _Reverse_unchecked(_First, _Last);
+    _STD reverse(_First, _Mid);
+    _STD reverse(_Mid, _Last);
+    _STD reverse(_First, _Last);
     return _First + (_Last - _Mid);
 }
 
diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index 2467b2b0626..4823e4fdd9d 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -24,20 +24,31 @@ static void _Reverse_tail(_BidIt _First, _BidIt _Last) noexcept {
     }
 }
 
-static size_t _Byte_length(void* _First, void* _Last) noexcept {
-    return static_cast<unsigned char*>(_Last) - static_cast<unsigned char*>(_First);
+template <class _BidIt, class _OutIt>
+static void _Reverse_copy_tail(_BidIt _First, _BidIt _Last, _OutIt _Dest) noexcept {
+    while (_First != _Last) {
+        *_Dest++ = *--_Last;
+    }
+}
+
+static size_t _Byte_length(const void* _First, const void* _Last) noexcept {
+    return static_cast<const unsigned char*>(_Last) - static_cast<const unsigned char*>(_First);
 }
 
 static void _Advance_bytes(void*& _Target, ptrdiff_t _Offset) noexcept {
     _Target = static_cast<unsigned char*>(_Target) + _Offset;
 }
 
+static void _Advance_bytes(const void*& _Target, ptrdiff_t _Offset) noexcept {
+    _Target = static_cast<const unsigned char*>(_Target) + _Offset;
+}
+
 extern "C" {
 
 __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
     void* _First1, void* _Last1, void* _First2) noexcept {
     constexpr size_t _Mask_32 = ~((static_cast<size_t>(1) << 5) - 1);
     if (_Byte_length(_First1, _Last1) >= 32 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) {
-        void* _Stop_at = _First1;
+        const void* _Stop_at = _First1;
         _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_32);
         do {
             const __m256i _Left = _mm256_loadu_si256(static_cast<__m256i*>(_First1));
@@ -55,7 +66,7 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
         &&
_bittest(&__isa_enabled, __ISA_AVAILABLE_SSE2)
 #endif // _M_IX86
     ) {
-        void* _Stop_at = _First1;
+        const void* _Stop_at = _First1;
         _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_16);
         do {
             const __m128i _Left = _mm_loadu_si128(static_cast<__m128i*>(_First1));
@@ -70,7 +81,7 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
 #if defined(_M_X64) // NOTE: UNALIGNED MEMORY ACCESSES
     constexpr size_t _Mask_8 = ~((static_cast<size_t>(1) << 3) - 1);
     if (_Byte_length(_First1, _Last1) >= 8) {
-        void* _Stop_at = _First1;
+        const void* _Stop_at = _First1;
         _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_8);
         do {
             const unsigned long long _Left = *static_cast<unsigned long long*>(_First1);
@@ -84,7 +95,7 @@ __declspec(noalias) void __cdecl __std_swap_ranges_trivially_swappable_noalias(
 #elif defined(_M_IX86) // NOTE: UNALIGNED MEMORY ACCESSES
     constexpr size_t _Mask_4 = ~((static_cast<size_t>(1) << 2) - 1);
     if (_Byte_length(_First1, _Last1) >= 4) {
-        void* _Stop_at = _First1;
+        const void* _Stop_at = _First1;
         _Advance_bytes(_Stop_at, _Byte_length(_First1, _Last1) & _Mask_4);
         do {
             const unsigned long _Left = *static_cast<unsigned long*>(_First1);
@@ -120,7 +131,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_1(void* _Firs
         const __m256i _Reverse_char_lanes_avx = _mm256_set_epi8( //
             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        void* _Stop_at = _First;
+        const void* _Stop_at = _First;
         _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 6 << 5);
         do {
             _Advance_bytes(_Last, -32);
@@ -138,7 +149,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_1(void* _Firs
 
     if (_Byte_length(_First, _Last) >= 32 && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE42)) {
         const __m128i _Reverse_char_sse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        void* _Stop_at = _First;
+        const void* _Stop_at = _First;
         _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >>
5 << 4); do { _Advance_bytes(_Last, -16); @@ -160,7 +171,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_2(void* _Firs const __m256i _Reverse_short_lanes_avx = _mm256_set_epi8( // 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, // 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - void* _Stop_at = _First; + const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 6 << 5); do { _Advance_bytes(_Last, -32); @@ -176,7 +187,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_2(void* _Firs if (_Byte_length(_First, _Last) >= 32 && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE42)) { const __m128i _Reverse_short_sse = _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - void* _Stop_at = _First; + const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 4); do { _Advance_bytes(_Last, -16); @@ -195,7 +206,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_2(void* _Firs __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_4(void* _First, void* _Last) noexcept { if (_Byte_length(_First, _Last) >= 64 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) { - void* _Stop_at = _First; + const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 6 << 5); do { _Advance_bytes(_Last, -32); @@ -214,7 +225,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_4(void* _Firs && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE2) #endif // _M_IX86 ) { - void* _Stop_at = _First; + const void* _Stop_at = _First; _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 4); do { _Advance_bytes(_Last, -16); @@ -233,7 +244,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_4(void* _Firs __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_8(void* _First, void* _Last) noexcept { if (_Byte_length(_First, _Last) >= 64 && _bittest(&__isa_enabled, 
__ISA_AVAILABLE_AVX2)) {
-        void* _Stop_at = _First;
+        const void* _Stop_at = _First;
         _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 6 << 5);
         do {
             _Advance_bytes(_Last, -32);
@@ -252,7 +263,7 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_8(void* _Firs
         && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE2)
 #endif // _M_IX86
     ) {
-        void* _Stop_at = _First;
+        const void* _Stop_at = _First;
         _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 4);
         do {
             _Advance_bytes(_Last, -16);
@@ -269,6 +280,143 @@ __declspec(noalias) void __cdecl __std_reverse_trivially_swappable_8(void* _Firs
     _Reverse_tail(static_cast<unsigned long long*>(_First), static_cast<unsigned long long*>(_Last));
 }
 
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_1(
+    const void* _First, const void* _Last, void* _Dest) noexcept {
+    if (_Byte_length(_First, _Last) >= 32 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) {
+        const __m256i _Reverse_char_lanes_avx = _mm256_set_epi8( //
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 5);
+        do {
+            _Advance_bytes(_Last, -32);
+            const __m256i _Block = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<const __m256i*>(_Last)), 78);
+            const __m256i _Block_reversed = _mm256_shuffle_epi8(_Block, _Reverse_char_lanes_avx);
+            _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 32);
+        } while (_Dest != _Stop_at);
+    }
+
+    if (_Byte_length(_First, _Last) >= 16 && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE42)) {
+        const __m128i _Reverse_char_sse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 4 << 4);
+        do {
+            _Advance_bytes(_Last, -16);
+            const __m128i _Block = _mm_loadu_si128(static_cast<const __m128i*>(_Last));
+            const __m128i _Block_reversed = _mm_shuffle_epi8(_Block,
_Reverse_char_sse); // SSSE3
+            _mm_storeu_si128(static_cast<__m128i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 16);
+        } while (_Dest != _Stop_at);
+    }
+
+    _Reverse_copy_tail(static_cast<const unsigned char*>(_First), static_cast<const unsigned char*>(_Last),
+        static_cast<unsigned char*>(_Dest));
+}
+
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_2(
+    const void* _First, const void* _Last, void* _Dest) noexcept {
+    if (_Byte_length(_First, _Last) >= 32 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) {
+        const __m256i _Reverse_short_lanes_avx = _mm256_set_epi8( //
+            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, //
+            1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 5);
+        do {
+            _Advance_bytes(_Last, -32);
+            const __m256i _Block = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<const __m256i*>(_Last)), 78);
+            const __m256i _Block_reversed = _mm256_shuffle_epi8(_Block, _Reverse_short_lanes_avx);
+            _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 32);
+        } while (_Dest != _Stop_at);
+    }
+
+    if (_Byte_length(_First, _Last) >= 16 && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE42)) {
+        const __m128i _Reverse_short_sse = _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 4 << 4);
+        do {
+            _Advance_bytes(_Last, -16);
+            const __m128i _Block = _mm_loadu_si128(static_cast<const __m128i*>(_Last));
+            const __m128i _Block_reversed = _mm_shuffle_epi8(_Block, _Reverse_short_sse); // SSSE3
+            _mm_storeu_si128(static_cast<__m128i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 16);
+        } while (_Dest != _Stop_at);
+    }
+
+    _Reverse_copy_tail(static_cast<const unsigned short*>(_First), static_cast<const unsigned short*>(_Last),
+        static_cast<unsigned short*>(_Dest));
+}
+
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_4(
+    const void* _First, const void* _Last, void* _Dest) noexcept {
+    if (_Byte_length(_First,
_Last) >= 32 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) {
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 5);
+        do {
+            _Advance_bytes(_Last, -32);
+            const __m256i _Block = _mm256_permute4x64_epi64(_mm256_loadu_si256(static_cast<const __m256i*>(_Last)), 78);
+            const __m256i _Block_reversed = _mm256_shuffle_epi32(_Block, 27);
+            _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 32);
+        } while (_Dest != _Stop_at);
+    }
+
+    if (_Byte_length(_First, _Last) >= 16
+#ifdef _M_IX86
+        && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE2)
+#endif // _M_IX86
+    ) {
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 4 << 4);
+        do {
+            _Advance_bytes(_Last, -16);
+            const __m128i _Block = _mm_loadu_si128(static_cast<const __m128i*>(_Last));
+            const __m128i _Block_reversed = _mm_shuffle_epi32(_Block, 27);
+            _mm_storeu_si128(static_cast<__m128i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 16);
+        } while (_Dest != _Stop_at);
+    }
+
+    _Reverse_copy_tail(static_cast<const unsigned long*>(_First), static_cast<const unsigned long*>(_Last),
+        static_cast<unsigned long*>(_Dest));
+}
+
+__declspec(noalias) void __cdecl __std_reverse_copy_trivially_copyable_8(
+    const void* _First, const void* _Last, void* _Dest) noexcept {
+    if (_Byte_length(_First, _Last) >= 32 && _bittest(&__isa_enabled, __ISA_AVAILABLE_AVX2)) {
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 5 << 5);
+        do {
+            _Advance_bytes(_Last, -32);
+            const __m256i _Block = _mm256_loadu_si256(static_cast<const __m256i*>(_Last));
+            const __m256i _Block_reversed = _mm256_permute4x64_epi64(_Block, 27);
+            _mm256_storeu_si256(static_cast<__m256i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 32);
+        } while (_Dest != _Stop_at);
+    }
+
+    if (_Byte_length(_First, _Last) >= 16
+#ifdef _M_IX86
+        && _bittest(&__isa_enabled, __ISA_AVAILABLE_SSE2)
+#endif // _M_IX86
+    ) {
+        const void* _Stop_at = _Dest;
+        _Advance_bytes(_Stop_at, _Byte_length(_First, _Last) >> 4 << 4);
+        do
{
+            _Advance_bytes(_Last, -16);
+            const __m128i _Block = _mm_loadu_si128(static_cast<const __m128i*>(_Last));
+            const __m128i _Block_reversed = _mm_shuffle_epi32(_Block, 78);
+            _mm_storeu_si128(static_cast<__m128i*>(_Dest), _Block_reversed);
+            _Advance_bytes(_Dest, 16);
+        } while (_Dest != _Stop_at);
+    }
+
+    _Reverse_copy_tail(static_cast<const unsigned long long*>(_First), static_cast<const unsigned long long*>(_Last),
+        static_cast<unsigned long long*>(_Dest));
+}
+
+
 } // extern "C"
 
 #endif // (defined(_M_IX86) || defined(_M_X64)) && !defined(_M_CEE_PURE)
diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
index 6483c50a943..b37274ecb19 100644
--- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
+++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
@@ -3,7 +3,9 @@
 #include <algorithm>
 #include <assert.h>
+#include <deque>
 #include <isa_availability.h>
+#include <list>
 #include <random>
 #include <vector>
@@ -47,6 +49,26 @@ void test_reverse(mt19937_64& gen) {
     }
 }
 
+template <class T>
+void test_case_reverse_copy(vector<T>& input) {
+    auto expected = input;
+    last_known_good_reverse(expected.begin(), expected.end());
+    vector<T> output(input.size(), T{});
+    assert(reverse_copy(input.begin(), input.end(), output.begin()) == output.end());
+    assert(expected == output);
+}
+
+template <class T>
+void test_reverse_copy(mt19937_64& gen) {
+    vector<T> input;
+    input.reserve(dataCount);
+    test_case_reverse_copy(input);
+    for (size_t attempts = 0; attempts < dataCount; ++attempts) {
+        input.push_back(static_cast<T>(gen())); // intentionally narrows
+        test_case_reverse_copy(input);
+    }
+}
+
 template <class FwdIt1, class FwdIt2>
 inline FwdIt2 last_known_good_swap_ranges(FwdIt1 first1, const FwdIt1 last1, FwdIt2 dest) {
     for (; first1 != last1; ++first1, ++dest) {
@@ -98,6 +120,19 @@ void test_vector_algorithms() {
     test_reverse<double>(gen);
     test_reverse<long double>(gen);
 
+    test_reverse_copy<char>(gen);
+    test_reverse_copy<signed char>(gen);
+    test_reverse_copy<unsigned char>(gen);
+    test_reverse_copy<short>(gen);
+    test_reverse_copy<unsigned short>(gen);
+    test_reverse_copy<int>(gen);
+    test_reverse_copy<unsigned int>(gen);
+    test_reverse_copy<long long>(gen);
+    test_reverse_copy<unsigned long long>(gen);
+    test_reverse_copy<float>(gen);
+    test_reverse_copy<double>(gen);
+    test_reverse_copy<long double>(gen);
+
     test_swap_ranges<char>(gen);
     test_swap_ranges<short>(gen);
     test_swap_ranges<int>(gen);
@@ -105,8 +140,47 @@ void test_vector_algorithms() {
     test_swap_ranges<unsigned int>(gen);
 }
 
+template <class Container1, class Container2>
+void test_two_containers() {
+    Container1 one = {10, 20, 30, 40, 50};
+    Container2 two = {-1, -1, -1, -1, -1};
+    static constexpr int reversed[] = {50, 40, 30, 20, 10};
+
+    assert(reverse_copy(one.begin(), one.end(), two.begin()) == two.end());
+    assert(equal(two.begin(), two.end(), begin(reversed), end(reversed)));
+
+    static constexpr int squares[] = {1, 4, 9, 16, 25};
+    static constexpr int cubes[] = {1, 8, 27, 64, 125};
+    one.assign(begin(squares), end(squares));
+    two.assign(begin(cubes), end(cubes));
+
+    assert(swap_ranges(one.begin(), one.end(), two.begin()) == two.end());
+    assert(equal(one.begin(), one.end(), begin(cubes), end(cubes)));
+    assert(equal(two.begin(), two.end(), begin(squares), end(squares)));
+}
+
+template <class Container>
+void test_one_container() {
+    Container x = {10, 20, 30, 40, 50};
+    static constexpr int reversed[] = {50, 40, 30, 20, 10};
+
+    reverse(x.begin(), x.end());
+    assert(equal(x.begin(), x.end(), begin(reversed), end(reversed)));
+
+    test_two_containers<Container, vector<int>>();
+    test_two_containers<Container, deque<int>>();
+    test_two_containers<Container, list<int>>();
+}
+
+void test_various_containers() {
+    test_one_container<vector<int>>(); // contiguous, vectorizable
+    test_one_container<deque<int>>(); // random-access, not vectorizable
+    test_one_container<list<int>>(); // bidi, not vectorizable
+}
+
 int main() {
     test_vector_algorithms();
+    test_various_containers();
 #ifndef _M_CEE_PURE
 #if defined(_M_IX86) || defined(_M_X64)
     disable_instructions(__ISA_AVAILABLE_AVX2);