diff --git a/stl/inc/xutility b/stl/inc/xutility
index 06c1dfa8dcc..577161e4571 100644
--- a/stl/inc/xutility
+++ b/stl/inc/xutility
@@ -89,7 +89,7 @@ _STL_DISABLE_CLANG_WARNINGS
 #define _VECTORIZED_FIND_LAST_OF _VECTORIZED_FOR_X64_X86
 #define _VECTORIZED_INCLUDES _VECTORIZED_FOR_X64_X86
 #define _VECTORIZED_IS_SORTED_UNTIL _VECTORIZED_FOR_X64_X86
-#define _VECTORIZED_MINMAX _VECTORIZED_FOR_X64_X86
+#define _VECTORIZED_MINMAX _VECTORIZED_FOR_X64_X86_ARM64
 #define _VECTORIZED_MINMAX_ELEMENT _VECTORIZED_FOR_X64_X86_ARM64
 #define _VECTORIZED_MISMATCH _VECTORIZED_FOR_X64_X86
 #define _VECTORIZED_REMOVE _VECTORIZED_FOR_X64_X86
@@ -7302,7 +7302,7 @@ constexpr bool _Is_min_max_value_optimization_safe = // Activate the vector algo
 #ifndef _M_FP_FAST
     !is_floating_point_v<_Elem> &&
 #endif // ^^^ !defined(_M_FP_FAST) ^^^
-    _Is_min_max_optimization_safe<_Iter, _Pr>;
+    _Is_min_max_iterators_safe<_Iter> && _Is_predicate_less<_Iter, _Pr>;

 template <class _FwdIt, class _Pr>
 constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element
diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index 0f84855383d..0271995eed8 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -934,6 +934,7 @@ namespace {
         static constexpr bool _Vectorized       = false;
         static constexpr size_t _Tail_mask      = 0;
         static constexpr bool _Has_unsigned_cmp = false;
+        using _Vec_t                            = void;
     };

 #ifdef _M_ARM64
@@ -945,22 +946,6 @@ namespace {
         static constexpr size_t _Tail_mask      = 0;
         static constexpr bool _Has_unsigned_cmp = true;

-        static int8x16_t _Zero() noexcept {
-            return vdupq_n_s8(0);
-        }
-
-        static int8x16_t _All_ones() noexcept {
-            return vdupq_n_s8(static_cast<int8_t>(0xFF));
-        }
-
-        static int8x16_t _Blend(const int8x16_t _Px1, const int8x16_t _Px2, const int8x16_t _Msk) noexcept {
-            return vbslq_s8(vreinterpretq_u8_s8(_Msk), _Px2, _Px1);
-        }
-
-        static int8x16_t _Sign_correction(const int8x16_t _Val, bool) noexcept {
-            return _Val;
-        }
-
         static void _Exit_vectorized() noexcept {}
     };
 #elif !defined(_M_ARM64EC)
@@ -971,6 +956,7 @@ namespace {
         static constexpr size_t _Vec_mask       = 0xF;
         static constexpr size_t _Tail_mask      = 0;
         static constexpr bool _Has_unsigned_cmp = false;
+        using _Vec_t                            = __m128i;

         static __m128i _Zero() noexcept {
             return _mm_setzero_si128();
         }
@@ -1043,6 +1029,7 @@ namespace {
     struct _Traits_avx_i_base : _Traits_avx_base {
         static constexpr size_t _Tail_mask = 0x1C;
+        using _Vec_t                       = __m256i;

         static __m256i _Blendval(const __m256i _Px1, const __m256i _Px2, const __m256i _Msk) noexcept {
             return _mm256_blendv_epi8(_Px1, _Px2, _Msk);
         }
@@ -1076,6 +1063,22 @@ namespace {
     struct _Traits_1_neon : _Traits_1_base, _Traits_neon_base {
         using _Vec_t = int8x16_t;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Zero() noexcept {
+            return vdupq_n_s8(0);
+        }
+
+        static _Vec_t _All_ones() noexcept {
+            return vdupq_n_s8(static_cast<int8_t>(0xFF));
+        }
+
+        static _Vec_t _Blend(const _Vec_t _Px1, const _Vec_t _Px2, const _Vec_t _Msk) noexcept {
+            return vbslq_s8(vreinterpretq_u8_s8(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 16 8-bit values into a 64-bit Mask of 16 4-bit values.
         static uint64_t _Mask(const _Vec_t _Val) noexcept {
             const uint8x8_t _Res = vshrn_n_u16(vreinterpretq_u16_s8(_Val), 4);
@@ -1371,6 +1374,22 @@ namespace {
     struct _Traits_2_neon : _Traits_2_base, _Traits_neon_base {
         using _Vec_t = int16x8_t;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Zero() noexcept {
+            return vdupq_n_s16(0);
+        }
+
+        static _Vec_t _All_ones() noexcept {
+            return vreinterpretq_s16_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Vec_t _Blend(const _Vec_t _Px1, const _Vec_t _Px2, const _Vec_t _Msk) noexcept {
+            return vbslq_s16(vreinterpretq_u16_s16(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 8 16-bit values into a 64-bit Mask of 8 8-bit values.
         static uint64_t _Mask(const _Vec_t _Val) noexcept {
             const uint16x4_t _Res = vshrn_n_u32(vreinterpretq_u32_s16(_Val), 8);
@@ -1663,6 +1682,22 @@ namespace {
     struct _Traits_4_neon : _Traits_4_base, _Traits_neon_base {
         using _Vec_t = int32x4_t;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Zero() noexcept {
+            return vdupq_n_s32(0);
+        }
+
+        static _Vec_t _All_ones() noexcept {
+            return vreinterpretq_s32_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Vec_t _Blend(const _Vec_t _Px1, const _Vec_t _Px2, const _Vec_t _Msk) noexcept {
+            return vbslq_s32(vreinterpretq_u32_s32(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 4 32-bit values into a 64-bit Mask of 4 16-bit values.
         static uint64_t _Mask(const int32x4_t _Val) noexcept {
             const uint32x2_t _Res = vshrn_n_u64(vreinterpretq_u64_s32(_Val), 16);
@@ -1940,7 +1975,81 @@ namespace {
 #endif // ^^^ !defined(_M_ARM64EC) ^^^
     };

-#if !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#ifdef _M_ARM64
+    struct _Traits_8_neon : _Traits_8_base, _Traits_neon_base {
+        using _Vec_t = int64x2_t;
+
+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Load(const void* const _Src) noexcept {
+            return vld1q_s64(reinterpret_cast<const int64_t*>(_Src));
+        }
+
+        static _Vec_t _H_min(const _Vec_t _Cur) noexcept {
+            int64x2_t _Swapped  = vextq_s64(_Cur, _Cur, 1);
+            uint64x2_t _Mask_lt = vcltq_s64(_Swapped, _Cur);
+            return vbslq_s64(_Mask_lt, _Swapped, _Cur);
+        }
+
+        static _Vec_t _H_max(const _Vec_t _Cur) noexcept {
+            int64x2_t _Swapped  = vextq_s64(_Cur, _Cur, 1);
+            uint64x2_t _Mask_gt = vcgtq_s64(_Swapped, _Cur);
+            return vbslq_s64(_Mask_gt, _Swapped, _Cur);
+        }
+
+        static _Vec_t _H_min_u(const _Vec_t _Cur) noexcept {
+            const uint64x2_t _Cur_u = vreinterpretq_u64_s64(_Cur);
+            uint64x2_t _Swapped     = vextq_u64(_Cur_u, _Cur_u, 1);
+            uint64x2_t _Mask_lt     = vcltq_u64(_Swapped, _Cur_u);
+            return vreinterpretq_s64_u64(vbslq_u64(_Mask_lt, _Swapped, _Cur_u));
+        }
+
+        static _Vec_t _H_max_u(const _Vec_t _Cur) noexcept {
+            const uint64x2_t _Cur_u = vreinterpretq_u64_s64(_Cur);
+            uint64x2_t _Swapped     = vextq_u64(_Cur_u, _Cur_u, 1);
+            uint64x2_t _Mask_gt     = vcgtq_u64(_Swapped, _Cur_u);
+            return vreinterpretq_s64_u64(vbslq_u64(_Mask_gt, _Swapped, _Cur_u));
+        }
+
+        static _Signed_t _Get_any(const _Vec_t _Cur) noexcept {
+            return static_cast<_Signed_t>(vgetq_lane_s64(_Cur, 0));
+        }
+
+        static _Vec_t _Cmp_gt(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return vreinterpretq_s64_u64(vcgtq_s64(_First, _Second));
+        }
+
+        static _Vec_t _Cmp_gt_u(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return vreinterpretq_s64_u64(vcgtq_u64(vreinterpretq_u64_s64(_First), vreinterpretq_u64_s64(_Second)));
+        }
+
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, const _Vec_t _Mask) noexcept {
+            return vbslq_s64(vreinterpretq_u64_s64(_Mask), _Second, _First);
+        }
+
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Min(_First, _Second, _Cmp_gt(_First, _Second));
+        }
+
+        static _Vec_t _Min_u(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Min(_First, _Second, _Cmp_gt_u(_First, _Second));
+        }
+
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, const _Vec_t _Mask) noexcept {
+            return vbslq_s64(vreinterpretq_u64_s64(_Mask), _Second, _First);
+        }
+
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Max(_First, _Second, _Cmp_gt(_Second, _First));
+        }
+
+        static _Vec_t _Max_u(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Max(_First, _Second, _Cmp_gt_u(_Second, _First));
+        }
+    };
+#elif !defined(_M_ARM64EC)
     struct _Traits_8_sse : _Traits_8_base, _Traits_sse_base {
         static __m128i _Load(const void* const _Src) noexcept {
             return _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Src));
         }
@@ -2125,7 +2234,7 @@ namespace {
             return _Mask;
         }
     };
-#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
+#endif // ^^^ !defined(_M_ARM64EC) ^^^

     struct _Traits_f_base {
         static constexpr bool _Is_floating = true;
@@ -2155,6 +2264,22 @@ namespace {
         using _Idx_t = int32x4_t;
         static constexpr bool _Has_unsigned_cmp = false;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Idx_t _Zero() noexcept {
+            return vdupq_n_s32(0);
+        }
+
+        static _Idx_t _All_ones() noexcept {
+            return vreinterpretq_s32_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Idx_t _Blend(const _Idx_t _Px1, const _Idx_t _Px2, const _Idx_t _Msk) noexcept {
+            return vbslq_s32(vreinterpretq_u32_s32(_Msk), _Px2, _Px1);
+        }
+
         static uint64_t _Mask(const _Idx_t _Val) noexcept {
             return _Traits_4_neon::_Mask(_Val);
         }
@@ -2211,20 +2336,22 @@ namespace {
             return _Traits_4_neon::_Cmp_eq_idx(_First, _Second);
         }

-        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f32(0)) noexcept {
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s32(0)) noexcept {
             return vminq_f32(_First, _Second);
         }

-        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f32(0)) noexcept {
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s32(0)) noexcept {
             return vmaxq_f32(_First, _Second);
         }

-        static _Idx_t _Mask_cast(const _Vec_t _Mask) noexcept {
-            return vreinterpretq_s32_f32(_Mask);
+        static _Idx_t _Mask_cast(const _Idx_t _Mask) noexcept {
+            return _Mask;
         }
     };
 #elif !defined(_M_ARM64EC)
     struct _Traits_f_sse : _Traits_f_base, _Traits_sse_base {
+        using _Vec_t = __m128;
+
         static __m128 _Load(const void* const _Src) noexcept {
             return _mm_loadu_ps(reinterpret_cast<const float*>(_Src));
         }
@@ -2298,6 +2425,7 @@ namespace {
     struct _Traits_f_avx : _Traits_f_base, _Traits_avx_base {
         static constexpr size_t _Tail_mask = 0x1C;
+        using _Vec_t                       = __m256;

         static __m256 _Blendval(const __m256 _Px1, const __m256 _Px2, const __m256i _Msk) noexcept {
             return _mm256_blendv_ps(_Px1, _Px2, _mm256_castsi256_ps(_Msk));
         }
@@ -2403,6 +2531,22 @@ namespace {
         using _Idx_t = int64x2_t;
         static constexpr bool _Has_unsigned_cmp = false;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Idx_t _Zero() noexcept {
+            return vdupq_n_s64(0);
+        }
+
+        static _Idx_t _All_ones() noexcept {
+            return vreinterpretq_s64_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Idx_t _Blend(const _Idx_t _Px1, const _Idx_t _Px2, const _Idx_t _Msk) noexcept {
+            return vbslq_s64(vreinterpretq_u64_s64(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 2 64-bit values into a 64-bit Mask of 2 32-bit values.
         static uint64_t _Mask(const int64x2_t _Val) noexcept {
             const uint32x2_t _Res = vreinterpret_u32_s32(vmovn_s64(_Val));
@@ -2434,17 +2578,11 @@ namespace {
         }

         static _Idx_t _H_min_u(const _Idx_t _Cur) noexcept {
-            const uint64x2_t _Cur_u   = vreinterpretq_u64_s64(_Cur);
-            const uint64x2_t _Swapped = vextq_u64(_Cur_u, _Cur_u, 1);
-            const uint64x2_t _Mask_lt = vcltq_u64(_Swapped, _Cur_u);
-            return vreinterpretq_s64_u64(vbslq_u64(_Mask_lt, _Swapped, _Cur_u));
+            return _Traits_8_neon::_H_min_u(_Cur);
         }

         static _Idx_t _H_max_u(const _Idx_t _Cur) noexcept {
-            const uint64x2_t _Cur_u   = vreinterpretq_u64_s64(_Cur);
-            const uint64x2_t _Swapped = vextq_u64(_Cur_u, _Cur_u, 1);
-            const uint64x2_t _Mask_gt = vcgtq_u64(_Swapped, _Cur_u);
-            return vreinterpretq_s64_u64(vbslq_u64(_Mask_gt, _Swapped, _Cur_u));
+            return _Traits_8_neon::_H_max_u(_Cur);
         }

         static double _Get_any(const _Vec_t _Cur) noexcept {
@@ -2467,20 +2605,22 @@ namespace {
             return vreinterpretq_s64_u64(vceqq_s64(_First, _Second));
         }

-        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f64(0)) noexcept {
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s64(0)) noexcept {
             return vminq_f64(_First, _Second);
         }

-        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f64(0)) noexcept {
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s64(0)) noexcept {
             return vmaxq_f64(_First, _Second);
         }

-        static _Idx_t _Mask_cast(const _Vec_t _Mask) noexcept {
-            return vreinterpretq_s64_f64(_Mask);
+        static _Idx_t _Mask_cast(const _Idx_t _Mask) noexcept {
+            return _Mask;
         }
     };
 #elif !defined(_M_ARM64EC)
     struct _Traits_d_sse : _Traits_d_base, _Traits_sse_base {
+        using _Vec_t = __m128d;
+
         static __m128d _Load(const void* const _Src) noexcept {
             return _mm_loadu_pd(reinterpret_cast<const double*>(_Src));
         }
@@ -2552,6 +2692,7 @@ namespace {
     struct _Traits_d_avx : _Traits_d_base, _Traits_avx_base {
         static constexpr size_t _Tail_mask = 0x18;
+        using _Vec_t                       = __m256d;

         static __m256d _Blendval(const __m256d _Px1, const __m256d _Px2, const __m256i _Msk) noexcept {
             return _mm256_blendv_pd(_Px1, _Px2, _mm256_castsi256_pd(_Msk));
         }
@@ -2663,15 +2804,15 @@ namespace {
 #endif // ^^^ !defined(_M_ARM64EC) ^^^
     };

-#ifndef _M_ARM64
     struct _Traits_8 {
         using _Scalar = _Traits_scalar<_Traits_8_base>;
-#ifndef _M_ARM64EC
+#ifdef _M_ARM64
+        using _Neon = _Traits_8_neon;
+#elif !defined(_M_ARM64EC)
         using _Sse = _Traits_8_sse;
         using _Avx = _Traits_8_avx;
 #endif // ^^^ !defined(_M_ARM64EC) ^^^
     };
-#endif // ^^^ !defined(_M_ARM64) ^^^

     struct _Traits_f {
         using _Scalar = _Traits_scalar<_Traits_f_base>;
@@ -3098,10 +3239,10 @@ namespace {
 #endif // ^^^ !defined(_M_ARM64) ^^^
     }

-#ifndef _M_ARM64
-    template <_Min_max_mode _Mode, class _Traits, bool _Sign>
+    template <_Min_max_mode _Mode, class _Traits, bool _Sign, bool _Unrolled = false>
     auto _Minmax_impl(const void* _First, const void* const _Last) noexcept {
-        using _Ty = std::conditional_t<_Sign, typename _Traits::_Signed_t, typename _Traits::_Unsigned_t>;
+        using _Ty    = std::conditional_t<_Sign, typename _Traits::_Signed_t, typename _Traits::_Unsigned_t>;
+        using _VecTy = _Traits::_Vec_t;

         _Ty _Cur_min_val; // initialized in both of the branches below
         _Ty _Cur_max_val; // initialized in both of the branches below
@@ -3110,56 +3251,85 @@ namespace {
 #ifdef _M_ARM64EC
             static_assert(false, "No vectorization for _M_ARM64EC yet");
 #else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
+            constexpr size_t _Lanes          = _Unrolled ? 2 : 1;
+            constexpr size_t _Bytes_per_iter = _Lanes * _Traits::_Vec_size;
+
             const size_t _Total_size_bytes = _Byte_length(_First, _Last);
-            const size_t _Vec_byte_size    = _Total_size_bytes & ~_Traits::_Vec_mask;
+            const size_t _Vec_byte_size    = _Total_size_bytes & ~(_Bytes_per_iter - 1);

             const void* _Stop_at = _First;
             _Advance_bytes(_Stop_at, _Vec_byte_size);

-            auto _Cur_vals = _Traits::_Load(_First);
-
-            // We don't have unsigned 64-bit stuff, so we'll use sign correction just for that case
-            constexpr bool _Sign_correction = sizeof(_Ty) == 8 && !_Sign;
-
-            if constexpr (_Sign_correction) {
-                _Cur_vals = _Traits::_Sign_correction(_Cur_vals, false);
+            constexpr bool _Sign_correction = sizeof(_Ty) == 8 && !_Sign && !_Traits::_Has_unsigned_cmp;
+
+            _VecTy _Cur_vals[_Lanes];
+            _VecTy _Cur_vals_min[_Lanes]; // vector of vertical minimum values
+            _VecTy _Cur_vals_max[_Lanes]; // vector of vertical maximum values
+            for (size_t _Lane = 0; _Lane < _Lanes; ++_Lane) {
+                _Cur_vals[_Lane] = _Traits::_Load(static_cast<const char*>(_First) + _Lane * _Traits::_Vec_size);
+                if constexpr (_Sign_correction) {
+                    _Cur_vals[_Lane] = _Traits::_Sign_correction(_Cur_vals[_Lane], false);
+                }
+                _Cur_vals_min[_Lane] = _Cur_vals[_Lane];
+                _Cur_vals_max[_Lane] = _Cur_vals[_Lane];
             }

-            auto _Cur_vals_min = _Cur_vals; // vector of vertical minimum values
-            auto _Cur_vals_max = _Cur_vals; // vector of vertical maximum values
-
-            const auto _Update_min_max = [&](const auto _Cur_vals) noexcept {
+            const auto _Update_min_max = [&](const auto _Cur_vals, size_t _Lane = 0) noexcept {
                 if constexpr ((_Mode & _Mode_min) != 0) {
                     if constexpr (_Sign || _Sign_correction) {
-                        _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals); // Update the current minimum
+                        _Cur_vals_min[_Lane] =
+                            _Traits::_Min(_Cur_vals_min[_Lane], _Cur_vals); // Update the current minimum
                     } else {
-                        _Cur_vals_min = _Traits::_Min_u(_Cur_vals_min, _Cur_vals); // Update the current minimum
+                        _Cur_vals_min[_Lane] =
+                            _Traits::_Min_u(_Cur_vals_min[_Lane], _Cur_vals); // Update the current minimum
                     }
                 }

                 if constexpr ((_Mode & _Mode_max) != 0) {
                     if constexpr (_Sign || _Sign_correction) {
-                        _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals); // Update the current maximum
+                        _Cur_vals_max[_Lane] =
+                            _Traits::_Max(_Cur_vals_max[_Lane], _Cur_vals); // Update the current maximum
                     } else {
-                        _Cur_vals_max = _Traits::_Max_u(_Cur_vals_max, _Cur_vals); // Update the current maximum
+                        _Cur_vals_max[_Lane] =
+                            _Traits::_Max_u(_Cur_vals_max[_Lane], _Cur_vals); // Update the current maximum
                     }
                 }
             };

             for (;;) {
-                _Advance_bytes(_First, _Traits::_Vec_size);
+                _Advance_bytes(_First, _Bytes_per_iter);

                 if (_First != _Stop_at) {
                     // This is the main part, finding vertical minimum/maximum
-                    _Cur_vals = _Traits::_Load(_First);
+                    for (size_t _Lane = 0; _Lane < _Lanes; ++_Lane) {
+                        _Cur_vals[_Lane] =
+                            _Traits::_Load(static_cast<const char*>(_First) + _Lane * _Traits::_Vec_size);

-                    if constexpr (_Sign_correction) {
-                        _Cur_vals = _Traits::_Sign_correction(_Cur_vals, false);
-                    }
+                        if constexpr (_Sign_correction) {
+                            _Cur_vals[_Lane] = _Traits::_Sign_correction(_Cur_vals[_Lane], false);
+                        }

-                    _Update_min_max(_Cur_vals);
+                        _Update_min_max(_Cur_vals[_Lane], _Lane);
+                    }
                 } else {
+                    if constexpr (_Unrolled) {
+                        const bool _Has_vec_tail = (_Byte_length(_First, _Last) & ~_Traits::_Vec_mask) != 0;
+
+                        if (_Has_vec_tail) {
+                            _Cur_vals[0] = _Traits::_Load(_First);
+
+                            if constexpr (_Sign_correction) {
+                                _Cur_vals[0] = _Traits::_Sign_correction(_Cur_vals[0], false);
+                            }
+
+                            _Update_min_max(_Cur_vals[0], 0);
+
+                            _Advance_bytes(_First, _Traits::_Vec_size);
+                        }
+                    }
+
                     if constexpr (_Traits::_Tail_mask != 0) {
                         const size_t _Tail_byte_size = _Total_size_bytes & _Traits::_Tail_mask;
                         if (_Tail_byte_size != 0) {
@@ -3170,7 +3340,7 @@ namespace {
                                 _Tail_vals = _Traits::_Sign_correction(_Tail_vals, false);
                             }

-                            _Tail_vals = _Traits::_Blendval(_Cur_vals, _Tail_vals, _Tail_mask);
+                            _Tail_vals = _Traits::_Blendval(_Cur_vals[0], _Tail_vals, _Tail_mask);

                             _Update_min_max(_Tail_vals);

@@ -3182,24 +3352,48 @@ namespace {

             if constexpr ((_Mode & _Mode_min) != 0) {
                 if constexpr (_Sign || _Sign_correction) {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_min[0] = _Traits::_Min(_Cur_vals_min[0], _Cur_vals_min[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the smallest element
-                    const auto _H_min = _Traits::_H_min(_Cur_vals_min);
+                    const auto _H_min = _Traits::_H_min(_Cur_vals_min[0]);
                     _Cur_min_val = _Traits::_Get_any(_H_min); // Get any element of it
                 } else {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_min[0] = _Traits::_Min_u(_Cur_vals_min[0], _Cur_vals_min[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the smallest element
-                    const auto _H_min = _Traits::_H_min_u(_Cur_vals_min);
+                    const auto _H_min = _Traits::_H_min_u(_Cur_vals_min[0]);
                     _Cur_min_val = _Traits::_Get_any(_H_min); // Get any element of it
                 }
             }

             if constexpr ((_Mode & _Mode_max) != 0) {
                 if constexpr (_Sign || _Sign_correction) {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_max[0] = _Traits::_Max(_Cur_vals_max[0], _Cur_vals_max[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the largest element
-                    const auto _H_max = _Traits::_H_max(_Cur_vals_max);
+                    const auto _H_max = _Traits::_H_max(_Cur_vals_max[0]);
                     _Cur_max_val = _Traits::_Get_any(_H_max); // Get any element of it
                 } else {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_max[0] = _Traits::_Max_u(_Cur_vals_max[0], _Cur_vals_max[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the largest element
-                    const auto _H_max = _Traits::_H_max_u(_Cur_vals_max);
+                    const auto _H_max = _Traits::_H_max_u(_Cur_vals_max[0]);
                     _Cur_max_val = _Traits::_Get_any(_H_max); // Get any element of it
                 }
             }
@@ -3229,6 +3423,8 @@ namespace {
             _Advance_bytes(_First, sizeof(_Ty));
         }

+// Avoid auto-vectorization of the scalar tail, as this is not beneficial for performance.
+#pragma loop(no_vector)
         for (auto _Ptr = static_cast<const _Ty*>(_First); _Ptr != _Last; ++_Ptr) {
             if constexpr ((_Mode & _Mode_min) != 0) {
                 if (*_Ptr < _Cur_min_val) {
@@ -3258,7 +3454,7 @@ namespace {
             }
         }

-#ifndef _M_ARM64EC
+#if !defined(_M_ARM64) && !defined(_M_ARM64EC) // TRANSITION, DevCom-10767462
     template <_Min_max_mode _Mode, class _Traits, bool _Sign>
     auto _Minmax_impl_wrap(const void* const _First, const void* const _Last) noexcept {
@@ -3266,11 +3462,19 @@ namespace {
         _mm256_zeroupper();
         return _Rx;
     }
-#endif // ^^^ !defined(_M_ARM64EC) ^^^
+#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^

     template <_Min_max_mode _Mode, class _Traits, bool _Sign>
     auto __stdcall _Minmax_disp(const void* const _First, const void* const _Last) noexcept {
-#ifndef _M_ARM64EC
+#ifdef _M_ARM64
+        if (_Byte_length(_First, _Last) >= 32) {
+            return _Minmax_impl<_Mode, typename _Traits::_Neon, _Sign, true>(_First, _Last);
+        }
+
+        if (_Byte_length(_First, _Last) >= 16) {
+            return _Minmax_impl<_Mode, typename _Traits::_Neon, _Sign, false>(_First, _Last);
+        }
+#elif !defined(_M_ARM64EC)
         if (_Byte_length(_First, _Last) >= 32 && _Use_avx2()) {
             if constexpr (_Traits::_Avx::_Is_floating) {
                 return _Minmax_impl_wrap<_Mode, typename _Traits::_Avx, _Sign>(_First, _Last);
@@ -3286,6 +3490,7 @@ namespace {
         return _Minmax_impl<_Mode, typename _Traits::_Scalar, _Sign>(_First, _Last);
     }

+#ifndef _M_ARM64
     template <class _Traits, class _Ty>
     const void* _Is_sorted_until_impl(const void* _First, const void* const _Last, const bool _Greater) noexcept {
         const ptrdiff_t _Left_off = 0 - static_cast<ptrdiff_t>(_Greater);
@@ -3487,7 +3692,6 @@ _Min_max_element_t __stdcall __std_minmax_element_d(const void* const _First, co
     return _Sorting::_Minmax_element_disp<_Sorting::_Mode_both, _Sorting::_Traits_d>(_First, _Last, false);
 }

-#ifndef _M_ARM64
 __declspec(noalias) int8_t __stdcall __std_min_1i(const void* const _First, const void* const _Last) noexcept {
     return _Sorting::_Minmax_disp<_Sorting::_Mode_min, _Sorting::_Traits_1, true>(_First, _Last);
 }
@@ -3608,6 +3812,7 @@ __declspec(noalias) _Min_max_d __stdcall __std_minmax_d(const void* const _First
     return _Sorting::_Minmax_disp<_Sorting::_Mode_both, _Sorting::_Traits_d, true>(_First, _Last);
 }

+#ifndef _M_ARM64
 const void* __stdcall __std_is_sorted_until_1i(
     const void* const _First, const void* const _Last, const bool _Greater) noexcept {
     return _Sorting::_Is_sorted_until_disp<_Sorting::_Traits_1, int8_t>(_First, _Last, _Greater);
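
Note on the new _Traits_8_neon horizontal reductions: AArch64 NEON has no across-vector min/max instruction for 64-bit lanes, so the patch builds one from a lane swap (vextq), a lane-wise compare (vcltq/vcgtq), and a bitwise select (vbslq). The following is a minimal standalone sketch of that idiom, not part of the patch and not the library's code; the function and variable names (horizontal_min_s64, swapped, mask_lt) are illustrative, and it assumes an ARM64 compiler with arm_neon.h.

// Standalone sketch of the vext + compare + bsl horizontal-min idiom for int64x2_t.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Returns a vector whose lanes both hold the smaller of the two input lanes.
int64x2_t horizontal_min_s64(const int64x2_t cur) {
    const int64x2_t swapped  = vextq_s64(cur, cur, 1); // swap the two 64-bit lanes
    const uint64x2_t mask_lt = vcltq_s64(swapped, cur); // lane-wise "swapped < cur"
    return vbslq_s64(mask_lt, swapped, cur);            // pick the smaller value per lane
}

int main() {
    const int64_t data[2] = {42, -7};
    const int64x2_t v     = vld1q_s64(data);
    const int64x2_t m     = horizontal_min_s64(v);
    std::printf("min = %lld\n", static_cast<long long>(vgetq_lane_s64(m, 0))); // prints -7
    return 0;
}

The unsigned variants (_H_min_u/_H_max_u) follow the same shape after reinterpreting the register as uint64x2_t and using the unsigned compares.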
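Note on the _Unrolled path of _Minmax_impl: when the ARM64 dispatcher sees at least 32 bytes, it instantiates the implementation with two lanes per iteration, keeping an independent running minimum and maximum per lane and folding the lanes together only once at the end. The sketch below models that two-accumulator strategy with plain int64_t values instead of NEON registers; it is a hedged illustration under stated assumptions (at least two elements, names such as minmax_unrolled and lanes are invented for the example), not the library's implementation.

// Scalar model of the two-accumulator (unrolled) min/max reduction.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> minmax_unrolled(const int64_t* first, const int64_t* last) {
    constexpr size_t lanes = 2; // two independent accumulators per iteration
    const size_t count     = static_cast<size_t>(last - first);
    assert(count >= lanes); // mirrors the >= 32 byte threshold in the dispatcher

    // Seed both accumulators from the first "vector" worth of data.
    int64_t cur_min[lanes] = {first[0], first[1]};
    int64_t cur_max[lanes] = {first[0], first[1]};

    // Main unrolled loop: each accumulator tracks its own running min/max,
    // so the two updates per iteration do not depend on each other.
    size_t idx = lanes;
    for (; idx + lanes <= count; idx += lanes) {
        for (size_t lane = 0; lane < lanes; ++lane) {
            const int64_t v = first[idx + lane];
            if (v < cur_min[lane]) {
                cur_min[lane] = v;
            }
            if (v > cur_max[lane]) {
                cur_max[lane] = v;
            }
        }
    }

    // Leftover element, handled scalarly here (the real code uses a one-vector
    // tail plus a masked or scalar remainder instead).
    for (; idx < count; ++idx) {
        const int64_t v = first[idx];
        if (v < cur_min[0]) {
            cur_min[0] = v;
        }
        if (v > cur_max[0]) {
            cur_max[0] = v;
        }
    }

    // Merge the per-lane accumulators, like the final _Min/_Max folds before _H_min/_H_max.
    const int64_t result_min = cur_min[0] < cur_min[1] ? cur_min[0] : cur_min[1];
    const int64_t result_max = cur_max[0] > cur_max[1] ? cur_max[0] : cur_max[1];
    return {result_min, result_max};
}

Keeping two accumulators widens the dependency chain of the reduction, which is the usual reason to unroll: the minimum/maximum updates of consecutive vectors no longer serialize on a single register.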