diff --git a/stl/inc/xutility b/stl/inc/xutility
index 06c1dfa8dcc..577161e4571 100644
--- a/stl/inc/xutility
+++ b/stl/inc/xutility
@@ -89,7 +89,7 @@ _STL_DISABLE_CLANG_WARNINGS
 #define _VECTORIZED_FIND_LAST_OF _VECTORIZED_FOR_X64_X86
 #define _VECTORIZED_INCLUDES _VECTORIZED_FOR_X64_X86
 #define _VECTORIZED_IS_SORTED_UNTIL _VECTORIZED_FOR_X64_X86
-#define _VECTORIZED_MINMAX _VECTORIZED_FOR_X64_X86
+#define _VECTORIZED_MINMAX _VECTORIZED_FOR_X64_X86_ARM64
 #define _VECTORIZED_MINMAX_ELEMENT _VECTORIZED_FOR_X64_X86_ARM64
 #define _VECTORIZED_MISMATCH _VECTORIZED_FOR_X64_X86
 #define _VECTORIZED_REMOVE _VECTORIZED_FOR_X64_X86
@@ -7302,7 +7302,7 @@ constexpr bool _Is_min_max_value_optimization_safe = // Activate the vector algo
 #ifndef _M_FP_FAST
     !is_floating_point_v<_Elem> &&
 #endif // ^^^ !defined(_M_FP_FAST) ^^^
-    _Is_min_max_optimization_safe<_Iter, _Pr>;
+    _Is_min_max_iterators_safe<_Iter> && _Is_predicate_less<_Iter, _Pr>;

 template <class _FwdIt, class _Pr>
 constexpr _FwdIt _Max_element_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // find largest element
diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp
index 0f84855383d..0271995eed8 100644
--- a/stl/src/vector_algorithms.cpp
+++ b/stl/src/vector_algorithms.cpp
@@ -934,6 +934,7 @@ namespace {
         static constexpr bool _Vectorized       = false;
         static constexpr size_t _Tail_mask      = 0;
         static constexpr bool _Has_unsigned_cmp = false;
+        using _Vec_t                            = void;
     };

 #ifdef _M_ARM64
@@ -945,22 +946,6 @@ namespace {
         static constexpr size_t _Tail_mask      = 0;
         static constexpr bool _Has_unsigned_cmp = true;

-        static int8x16_t _Zero() noexcept {
-            return vdupq_n_s8(0);
-        }
-
-        static int8x16_t _All_ones() noexcept {
-            return vdupq_n_s8(static_cast<int8_t>(0xFF));
-        }
-
-        static int8x16_t _Blend(const int8x16_t _Px1, const int8x16_t _Px2, const int8x16_t _Msk) noexcept {
-            return vbslq_s8(vreinterpretq_u8_s8(_Msk), _Px2, _Px1);
-        }
-
-        static int8x16_t _Sign_correction(const int8x16_t _Val, bool) noexcept {
-            return _Val;
-        }
-
         static void _Exit_vectorized() noexcept {}
     };
 #elif !defined(_M_ARM64EC)
@@ -971,6 +956,7 @@ namespace {
         static constexpr size_t _Vec_mask       = 0xF;
         static constexpr size_t _Tail_mask      = 0;
         static constexpr bool _Has_unsigned_cmp = false;
+        using _Vec_t                            = __m128i;

         static __m128i _Zero() noexcept {
             return _mm_setzero_si128();
         }
@@ -1043,6 +1029,7 @@ namespace {
     struct _Traits_avx_i_base : _Traits_avx_base {
         static constexpr size_t _Tail_mask = 0x1C;
+        using _Vec_t                       = __m256i;

         static __m256i _Blendval(const __m256i _Px1, const __m256i _Px2, const __m256i _Msk) noexcept {
             return _mm256_blendv_epi8(_Px1, _Px2, _Msk);
         }
@@ -1076,6 +1063,22 @@ namespace {
     struct _Traits_1_neon : _Traits_1_base, _Traits_neon_base {
         using _Vec_t = int8x16_t;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Zero() noexcept {
+            return vdupq_n_s8(0);
+        }
+
+        static _Vec_t _All_ones() noexcept {
+            return vdupq_n_s8(static_cast<int8_t>(0xFF));
+        }
+
+        static _Vec_t _Blend(const _Vec_t _Px1, const _Vec_t _Px2, const _Vec_t _Msk) noexcept {
+            return vbslq_s8(vreinterpretq_u8_s8(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 16 8-bit values into a 64-bit Mask of 16 4-bit values.
         static uint64_t _Mask(const _Vec_t _Val) noexcept {
             const uint8x8_t _Res = vshrn_n_u16(vreinterpretq_u16_s8(_Val), 4);
@@ -1371,6 +1374,22 @@ namespace {
     struct _Traits_2_neon : _Traits_2_base, _Traits_neon_base {
         using _Vec_t = int16x8_t;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Zero() noexcept {
+            return vdupq_n_s16(0);
+        }
+
+        static _Vec_t _All_ones() noexcept {
+            return vreinterpretq_s16_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Vec_t _Blend(const _Vec_t _Px1, const _Vec_t _Px2, const _Vec_t _Msk) noexcept {
+            return vbslq_s16(vreinterpretq_u16_s16(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 8 16-bit values into a 64-bit Mask of 8 8-bit values.
         static uint64_t _Mask(const _Vec_t _Val) noexcept {
             const uint16x4_t _Res = vshrn_n_u32(vreinterpretq_u32_s16(_Val), 8);
@@ -1663,6 +1682,22 @@ namespace {
     struct _Traits_4_neon : _Traits_4_base, _Traits_neon_base {
         using _Vec_t = int32x4_t;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Zero() noexcept {
+            return vdupq_n_s32(0);
+        }
+
+        static _Vec_t _All_ones() noexcept {
+            return vreinterpretq_s32_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Vec_t _Blend(const _Vec_t _Px1, const _Vec_t _Px2, const _Vec_t _Msk) noexcept {
+            return vbslq_s32(vreinterpretq_u32_s32(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 4 32-bit values into a 64-bit Mask of 4 16-bit values.
         static uint64_t _Mask(const int32x4_t _Val) noexcept {
             const uint32x2_t _Res = vshrn_n_u64(vreinterpretq_u64_s32(_Val), 16);
@@ -1940,7 +1975,81 @@ namespace {
 #endif // ^^^ !defined(_M_ARM64EC) ^^^
     };

-#if !defined(_M_ARM64) && !defined(_M_ARM64EC)
+#ifdef _M_ARM64
+    struct _Traits_8_neon : _Traits_8_base, _Traits_neon_base {
+        using _Vec_t = int64x2_t;
+
+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Vec_t _Load(const void* const _Src) noexcept {
+            return vld1q_s64(reinterpret_cast<const int64_t*>(_Src));
+        }
+
+        static _Vec_t _H_min(const _Vec_t _Cur) noexcept {
+            int64x2_t _Swapped  = vextq_s64(_Cur, _Cur, 1);
+            uint64x2_t _Mask_lt = vcltq_s64(_Swapped, _Cur);
+            return vbslq_s64(_Mask_lt, _Swapped, _Cur);
+        }
+
+        static _Vec_t _H_max(const _Vec_t _Cur) noexcept {
+            int64x2_t _Swapped  = vextq_s64(_Cur, _Cur, 1);
+            uint64x2_t _Mask_gt = vcgtq_s64(_Swapped, _Cur);
+            return vbslq_s64(_Mask_gt, _Swapped, _Cur);
+        }
+
+        static _Vec_t _H_min_u(const _Vec_t _Cur) noexcept {
+            const uint64x2_t _Cur_u = vreinterpretq_u64_s64(_Cur);
+            uint64x2_t _Swapped     = vextq_u64(_Cur_u, _Cur_u, 1);
+            uint64x2_t _Mask_lt     = vcltq_u64(_Swapped, _Cur_u);
+            return vreinterpretq_s64_u64(vbslq_u64(_Mask_lt, _Swapped, _Cur_u));
+        }
+
+        static _Vec_t _H_max_u(const _Vec_t _Cur) noexcept {
+            const uint64x2_t _Cur_u = vreinterpretq_u64_s64(_Cur);
+            uint64x2_t _Swapped     = vextq_u64(_Cur_u, _Cur_u, 1);
+            uint64x2_t _Mask_gt     = vcgtq_u64(_Swapped, _Cur_u);
+            return vreinterpretq_s64_u64(vbslq_u64(_Mask_gt, _Swapped, _Cur_u));
+        }
+
+        static _Signed_t _Get_any(const _Vec_t _Cur) noexcept {
+            return static_cast<_Signed_t>(vgetq_lane_s64(_Cur, 0));
+        }
+
+        static _Vec_t _Cmp_gt(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return vreinterpretq_s64_u64(vcgtq_s64(_First, _Second));
+        }
+
+        static _Vec_t _Cmp_gt_u(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return vreinterpretq_s64_u64(vcgtq_u64(vreinterpretq_u64_s64(_First), vreinterpretq_u64_s64(_Second)));
+        }
+
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, const _Vec_t _Mask) noexcept {
+            return vbslq_s64(vreinterpretq_u64_s64(_Mask), _Second, _First);
+        }
+
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Min(_First, _Second, _Cmp_gt(_First, _Second));
+        }
+
+        static _Vec_t _Min_u(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Min(_First, _Second, _Cmp_gt_u(_First, _Second));
+        }
+
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, const _Vec_t _Mask) noexcept {
+            return vbslq_s64(vreinterpretq_u64_s64(_Mask), _Second, _First);
+        }
+
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Max(_First, _Second, _Cmp_gt(_Second, _First));
+        }
+
+        static _Vec_t _Max_u(const _Vec_t _First, const _Vec_t _Second) noexcept {
+            return _Max(_First, _Second, _Cmp_gt_u(_Second, _First));
+        }
+    };
+#elif !defined(_M_ARM64EC)
     struct _Traits_8_sse : _Traits_8_base, _Traits_sse_base {
         static __m128i _Load(const void* const _Src) noexcept {
             return _mm_loadu_si128(reinterpret_cast<const __m128i*>(_Src));
         }
@@ -2125,7 +2234,7 @@ namespace {
             return _Mask;
         }
     };
-#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^
+#endif // ^^^ !defined(_M_ARM64EC) ^^^

     struct _Traits_f_base {
         static constexpr bool _Is_floating = true;
@@ -2155,6 +2264,22 @@ namespace {
         using _Idx_t = int32x4_t;
         static constexpr bool _Has_unsigned_cmp = false;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Idx_t _Zero() noexcept {
+            return vdupq_n_s32(0);
+        }
+
+        static _Idx_t _All_ones() noexcept {
+            return vreinterpretq_s32_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Idx_t _Blend(const _Idx_t _Px1, const _Idx_t _Px2, const _Idx_t _Msk) noexcept {
+            return vbslq_s32(vreinterpretq_u32_s32(_Msk), _Px2, _Px1);
+        }
+
         static uint64_t _Mask(const _Idx_t _Val) noexcept {
             return _Traits_4_neon::_Mask(_Val);
         }
@@ -2211,20 +2336,22 @@ namespace {
             return _Traits_4_neon::_Cmp_eq_idx(_First, _Second);
         }

-        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f32(0)) noexcept {
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s32(0)) noexcept {
             return vminq_f32(_First, _Second);
         }

-        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f32(0)) noexcept {
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s32(0)) noexcept {
             return vmaxq_f32(_First, _Second);
         }

-        static _Idx_t _Mask_cast(const _Vec_t _Mask) noexcept {
-            return vreinterpretq_s32_f32(_Mask);
+        static _Idx_t _Mask_cast(const _Idx_t _Mask) noexcept {
+            return _Mask;
         }
     };
 #elif !defined(_M_ARM64EC)
     struct _Traits_f_sse : _Traits_f_base, _Traits_sse_base {
+        using _Vec_t = __m128;
+
         static __m128 _Load(const void* const _Src) noexcept {
             return _mm_loadu_ps(reinterpret_cast<const float*>(_Src));
         }
@@ -2298,6 +2425,7 @@ namespace {
     struct _Traits_f_avx : _Traits_f_base, _Traits_avx_base {
         static constexpr size_t _Tail_mask = 0x1C;
+        using _Vec_t                       = __m256;

         static __m256 _Blendval(const __m256 _Px1, const __m256 _Px2, const __m256i _Msk) noexcept {
             return _mm256_blendv_ps(_Px1, _Px2, _mm256_castsi256_ps(_Msk));
         }
@@ -2403,6 +2531,22 @@ namespace {
         using _Idx_t = int64x2_t;
         static constexpr bool _Has_unsigned_cmp = false;

+        static _Vec_t _Sign_correction(const _Vec_t _Val, bool) noexcept {
+            return _Val;
+        }
+
+        static _Idx_t _Zero() noexcept {
+            return vdupq_n_s64(0);
+        }
+
+        static _Idx_t _All_ones() noexcept {
+            return vreinterpretq_s64_s8(vdupq_n_s8(static_cast<int8_t>(0xFF)));
+        }
+
+        static _Idx_t _Blend(const _Idx_t _Px1, const _Idx_t _Px2, const _Idx_t _Msk) noexcept {
+            return vbslq_s64(vreinterpretq_u64_s64(_Msk), _Px2, _Px1);
+        }
+
         // Compresses a 128-bit Mask of 2 64-bit values into a 64-bit Mask of 2 32-bit values.
         static uint64_t _Mask(const int64x2_t _Val) noexcept {
             const uint32x2_t _Res = vreinterpret_u32_s32(vmovn_s64(_Val));
@@ -2434,17 +2578,11 @@ namespace {
         }

         static _Idx_t _H_min_u(const _Idx_t _Cur) noexcept {
-            const uint64x2_t _Cur_u   = vreinterpretq_u64_s64(_Cur);
-            const uint64x2_t _Swapped = vextq_u64(_Cur_u, _Cur_u, 1);
-            const uint64x2_t _Mask_lt = vcltq_u64(_Swapped, _Cur_u);
-            return vreinterpretq_s64_u64(vbslq_u64(_Mask_lt, _Swapped, _Cur_u));
+            return _Traits_8_neon::_H_min_u(_Cur);
         }

         static _Idx_t _H_max_u(const _Idx_t _Cur) noexcept {
-            const uint64x2_t _Cur_u   = vreinterpretq_u64_s64(_Cur);
-            const uint64x2_t _Swapped = vextq_u64(_Cur_u, _Cur_u, 1);
-            const uint64x2_t _Mask_gt = vcgtq_u64(_Swapped, _Cur_u);
-            return vreinterpretq_s64_u64(vbslq_u64(_Mask_gt, _Swapped, _Cur_u));
+            return _Traits_8_neon::_H_max_u(_Cur);
         }

         static double _Get_any(const _Vec_t _Cur) noexcept {
@@ -2467,20 +2605,22 @@ namespace {
             return vreinterpretq_s64_u64(vceqq_s64(_First, _Second));
         }

-        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f64(0)) noexcept {
+        static _Vec_t _Min(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s64(0)) noexcept {
             return vminq_f64(_First, _Second);
         }

-        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Vec_t = vdupq_n_f64(0)) noexcept {
+        static _Vec_t _Max(const _Vec_t _First, const _Vec_t _Second, _Idx_t = vdupq_n_s64(0)) noexcept {
             return vmaxq_f64(_First, _Second);
         }

-        static _Idx_t _Mask_cast(const _Vec_t _Mask) noexcept {
-            return vreinterpretq_s64_f64(_Mask);
+        static _Idx_t _Mask_cast(const _Idx_t _Mask) noexcept {
+            return _Mask;
         }
     };
 #elif !defined(_M_ARM64EC)
     struct _Traits_d_sse : _Traits_d_base, _Traits_sse_base {
+        using _Vec_t = __m128d;
+
         static __m128d _Load(const void* const _Src) noexcept {
             return _mm_loadu_pd(reinterpret_cast<const double*>(_Src));
         }
@@ -2552,6 +2692,7 @@ namespace {
     struct _Traits_d_avx : _Traits_d_base, _Traits_avx_base {
         static constexpr size_t _Tail_mask = 0x18;
+        using _Vec_t                       = __m256d;

         static __m256d _Blendval(const __m256d _Px1, const __m256d _Px2, const __m256i _Msk) noexcept {
             return _mm256_blendv_pd(_Px1, _Px2, _mm256_castsi256_pd(_Msk));
         }
@@ -2663,15 +2804,15 @@ namespace {
 #endif // ^^^ !defined(_M_ARM64EC) ^^^
     };

-#ifndef _M_ARM64
     struct _Traits_8 {
         using _Scalar = _Traits_scalar<_Traits_8_base>;
-#ifndef _M_ARM64EC
+#ifdef _M_ARM64
+        using _Neon = _Traits_8_neon;
+#elif !defined(_M_ARM64EC)
         using _Sse = _Traits_8_sse;
         using _Avx = _Traits_8_avx;
 #endif // ^^^ !defined(_M_ARM64EC) ^^^
     };
-#endif // ^^^ !defined(_M_ARM64) ^^^

     struct _Traits_f {
         using _Scalar = _Traits_scalar<_Traits_f_base>;
@@ -3098,10 +3239,10 @@ namespace {
 #endif // ^^^ !defined(_M_ARM64) ^^^
     }

-#ifndef _M_ARM64
-    template <_Min_max_mode _Mode, class _Traits, bool _Sign>
+    template <_Min_max_mode _Mode, class _Traits, bool _Sign, bool _Unrolled = false>
     auto _Minmax_impl(const void* _First, const void* const _Last) noexcept {
-        using _Ty = std::conditional_t<_Sign, typename _Traits::_Signed_t, typename _Traits::_Unsigned_t>;
+        using _Ty    = std::conditional_t<_Sign, typename _Traits::_Signed_t, typename _Traits::_Unsigned_t>;
+        using _VecTy = _Traits::_Vec_t;

         _Ty _Cur_min_val; // initialized in both of the branches below
         _Ty _Cur_max_val; // initialized in both of the branches below
@@ -3110,56 +3251,85 @@ namespace {
 #ifdef _M_ARM64EC
             static_assert(false, "No vectorization for _M_ARM64EC yet");
 #else // ^^^ defined(_M_ARM64EC) / !defined(_M_ARM64EC) vvv
+            constexpr size_t _Lanes          = _Unrolled ? 2 : 1;
+            constexpr size_t _Bytes_per_iter = _Lanes * _Traits::_Vec_size;
+
             const size_t _Total_size_bytes = _Byte_length(_First, _Last);
-            const size_t _Vec_byte_size    = _Total_size_bytes & ~_Traits::_Vec_mask;
+            const size_t _Vec_byte_size    = _Total_size_bytes & ~(_Bytes_per_iter - 1);

             const void* _Stop_at = _First;
             _Advance_bytes(_Stop_at, _Vec_byte_size);

-            auto _Cur_vals = _Traits::_Load(_First);
-
-            // We don't have unsigned 64-bit stuff, so we'll use sign correction just for that case
-            constexpr bool _Sign_correction = sizeof(_Ty) == 8 && !_Sign;
-
-            if constexpr (_Sign_correction) {
-                _Cur_vals = _Traits::_Sign_correction(_Cur_vals, false);
+            constexpr bool _Sign_correction = sizeof(_Ty) == 8 && !_Sign && !_Traits::_Has_unsigned_cmp;
+
+            _VecTy _Cur_vals[_Lanes];
+            _VecTy _Cur_vals_min[_Lanes]; // vector of vertical minimum values
+            _VecTy _Cur_vals_max[_Lanes]; // vector of vertical maximum values
+            for (size_t _Lane = 0; _Lane < _Lanes; ++_Lane) {
+                _Cur_vals[_Lane] = _Traits::_Load(static_cast<const char*>(_First) + _Lane * _Traits::_Vec_size);
+                if constexpr (_Sign_correction) {
+                    _Cur_vals[_Lane] = _Traits::_Sign_correction(_Cur_vals[_Lane], false);
+                }
+                _Cur_vals_min[_Lane] = _Cur_vals[_Lane];
+                _Cur_vals_max[_Lane] = _Cur_vals[_Lane];
             }

-            auto _Cur_vals_min = _Cur_vals; // vector of vertical minimum values
-            auto _Cur_vals_max = _Cur_vals; // vector of vertical maximum values
-
-            const auto _Update_min_max = [&](const auto _Cur_vals) noexcept {
+            const auto _Update_min_max = [&](const auto _Cur_vals, size_t _Lane = 0) noexcept {
                 if constexpr ((_Mode & _Mode_min) != 0) {
                     if constexpr (_Sign || _Sign_correction) {
-                        _Cur_vals_min = _Traits::_Min(_Cur_vals_min, _Cur_vals); // Update the current minimum
+                        _Cur_vals_min[_Lane] =
+                            _Traits::_Min(_Cur_vals_min[_Lane], _Cur_vals); // Update the current minimum
                     } else {
-                        _Cur_vals_min = _Traits::_Min_u(_Cur_vals_min, _Cur_vals); // Update the current minimum
+                        _Cur_vals_min[_Lane] =
+                            _Traits::_Min_u(_Cur_vals_min[_Lane], _Cur_vals); // Update the current minimum
                     }
                 }

                 if constexpr ((_Mode & _Mode_max) != 0) {
                     if constexpr (_Sign || _Sign_correction) {
-                        _Cur_vals_max = _Traits::_Max(_Cur_vals_max, _Cur_vals); // Update the current maximum
+                        _Cur_vals_max[_Lane] =
+                            _Traits::_Max(_Cur_vals_max[_Lane], _Cur_vals); // Update the current maximum
                     } else {
-                        _Cur_vals_max = _Traits::_Max_u(_Cur_vals_max, _Cur_vals); // Update the current maximum
+                        _Cur_vals_max[_Lane] =
+                            _Traits::_Max_u(_Cur_vals_max[_Lane], _Cur_vals); // Update the current maximum
                     }
                 }
             };

             for (;;) {
-                _Advance_bytes(_First, _Traits::_Vec_size);
+                _Advance_bytes(_First, _Bytes_per_iter);

                 if (_First != _Stop_at) {
                     // This is the main part, finding vertical minimum/maximum
-                    _Cur_vals = _Traits::_Load(_First);
+                    for (size_t _Lane = 0; _Lane < _Lanes; ++_Lane) {
+                        _Cur_vals[_Lane] =
+                            _Traits::_Load(static_cast<const char*>(_First) + _Lane * _Traits::_Vec_size);

-                    if constexpr (_Sign_correction) {
-                        _Cur_vals = _Traits::_Sign_correction(_Cur_vals, false);
-                    }
+                        if constexpr (_Sign_correction) {
+                            _Cur_vals[_Lane] = _Traits::_Sign_correction(_Cur_vals[_Lane], false);
+                        }

-                    _Update_min_max(_Cur_vals);
+                        _Update_min_max(_Cur_vals[_Lane], _Lane);
+                    }
                 } else {
+                    if constexpr (_Unrolled) {
+                        const bool _Has_vec_tail = (_Byte_length(_First, _Last) & ~_Traits::_Vec_mask) != 0;
+
+                        if (_Has_vec_tail) {
+                            _Cur_vals[0] = _Traits::_Load(_First);
+
+                            if constexpr (_Sign_correction) {
+                                _Cur_vals[0] = _Traits::_Sign_correction(_Cur_vals[0], false);
+                            }
+
+                            _Update_min_max(_Cur_vals[0], 0);
+
+                            _Advance_bytes(_First, _Traits::_Vec_size);
+                        }
+                    }
+
                     if constexpr (_Traits::_Tail_mask != 0) {
                         const size_t _Tail_byte_size = _Total_size_bytes & _Traits::_Tail_mask;
                         if (_Tail_byte_size != 0) {
@@ -3170,7 +3340,7 @@ namespace {
                                 _Tail_vals = _Traits::_Sign_correction(_Tail_vals, false);
                             }

-                            _Tail_vals = _Traits::_Blendval(_Cur_vals, _Tail_vals, _Tail_mask);
+                            _Tail_vals = _Traits::_Blendval(_Cur_vals[0], _Tail_vals, _Tail_mask);

                             _Update_min_max(_Tail_vals);

@@ -3182,24 +3352,48 @@ namespace {

             if constexpr ((_Mode & _Mode_min) != 0) {
                 if constexpr (_Sign || _Sign_correction) {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_min[0] = _Traits::_Min(_Cur_vals_min[0], _Cur_vals_min[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the smallest element
-                    const auto _H_min = _Traits::_H_min(_Cur_vals_min);
+                    const auto _H_min = _Traits::_H_min(_Cur_vals_min[0]);
                     _Cur_min_val = _Traits::_Get_any(_H_min); // Get any element of it
                 } else {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_min[0] = _Traits::_Min_u(_Cur_vals_min[0], _Cur_vals_min[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the smallest element
-                    const auto _H_min = _Traits::_H_min_u(_Cur_vals_min);
+                    const auto _H_min = _Traits::_H_min_u(_Cur_vals_min[0]);
                     _Cur_min_val = _Traits::_Get_any(_H_min); // Get any element of it
                 }
             }

             if constexpr ((_Mode & _Mode_max) != 0) {
                 if constexpr (_Sign || _Sign_correction) {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_max[0] = _Traits::_Max(_Cur_vals_max[0], _Cur_vals_max[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the largest element
-                    const auto _H_max = _Traits::_H_max(_Cur_vals_max);
+                    const auto _H_max = _Traits::_H_max(_Cur_vals_max[0]);
                     _Cur_max_val = _Traits::_Get_any(_H_max); // Get any element of it
                 } else {
+                    if constexpr (_Unrolled) {
+                        for (size_t _Lane = 1; _Lane < _Lanes; ++_Lane) {
+                            _Cur_vals_max[0] = _Traits::_Max_u(_Cur_vals_max[0], _Cur_vals_max[_Lane]);
+                        }
+                    }
+
                     // Vector populated by the largest element
-                    const auto _H_max = _Traits::_H_max_u(_Cur_vals_max);
+                    const auto _H_max = _Traits::_H_max_u(_Cur_vals_max[0]);
                     _Cur_max_val = _Traits::_Get_any(_H_max); // Get any element of it
                 }
             }
@@ -3229,6 +3423,8 @@ namespace {
             _Advance_bytes(_First, sizeof(_Ty));
         }

+// Avoid auto-vectorization of the scalar tail, as this is not beneficial for performance.
+#pragma loop(no_vector)
         for (auto _Ptr = static_cast<const _Ty*>(_First); _Ptr != _Last; ++_Ptr) {
             if constexpr ((_Mode & _Mode_min) != 0) {
                 if (*_Ptr < _Cur_min_val) {
@@ -3258,7 +3454,7 @@ namespace {
             }
         }

-#ifndef _M_ARM64EC
+#if !defined(_M_ARM64) && !defined(_M_ARM64EC) // TRANSITION, DevCom-10767462
     template <_Min_max_mode _Mode, class _Traits, bool _Sign>
     auto _Minmax_impl_wrap(const void* const _First, const void* const _Last) noexcept {
@@ -3266,11 +3462,19 @@ namespace {
         _mm256_zeroupper();
         return _Rx;
     }
-#endif // ^^^ !defined(_M_ARM64EC) ^^^
+#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^

     template <_Min_max_mode _Mode, class _Traits, bool _Sign>
     auto __stdcall _Minmax_disp(const void* const _First, const void* const _Last) noexcept {
-#ifndef _M_ARM64EC
+#ifdef _M_ARM64
+        if (_Byte_length(_First, _Last) >= 32) {
+            return _Minmax_impl<_Mode, typename _Traits::_Neon, _Sign, true>(_First, _Last);
+        }
+
+        if (_Byte_length(_First, _Last) >= 16) {
+            return _Minmax_impl<_Mode, typename _Traits::_Neon, _Sign, false>(_First, _Last);
+        }
+#elif !defined(_M_ARM64EC)
         if (_Byte_length(_First, _Last) >= 32 && _Use_avx2()) {
             if constexpr (_Traits::_Avx::_Is_floating) {
                 return _Minmax_impl_wrap<_Mode, typename _Traits::_Avx, _Sign>(_First, _Last);
@@ -3286,6 +3490,7 @@ namespace {
         return _Minmax_impl<_Mode, typename _Traits::_Scalar, _Sign>(_First, _Last);
     }

+#ifndef _M_ARM64
     template <class _Traits, class _Ty>
     const void* _Is_sorted_until_impl(const void* _First, const void* const _Last, const bool _Greater) noexcept {
         const ptrdiff_t _Left_off = 0 - static_cast<ptrdiff_t>(_Greater);
@@ -3487,7 +3692,6 @@ _Min_max_element_t __stdcall __std_minmax_element_d(const void* const _First, co
     return _Sorting::_Minmax_element_disp<_Sorting::_Mode_both, _Sorting::_Traits_d>(_First, _Last, false);
 }

-#ifndef _M_ARM64
 __declspec(noalias) int8_t __stdcall __std_min_1i(const void* const _First, const void* const _Last) noexcept {
     return _Sorting::_Minmax_disp<_Sorting::_Mode_min, _Sorting::_Traits_1, true>(_First, _Last);
 }
@@ -3608,6 +3812,7 @@ __declspec(noalias) _Min_max_d __stdcall __std_minmax_d(const void* const _First
     return _Sorting::_Minmax_disp<_Sorting::_Mode_both, _Sorting::_Traits_d, true>(_First, _Last);
 }

+#ifndef _M_ARM64
 const void* __stdcall __std_is_sorted_until_1i(
     const void* const _First, const void* const _Last, const bool _Greater) noexcept {
     return _Sorting::_Is_sorted_until_disp<_Sorting::_Traits_1, int8_t>(_First, _Last, _Greater);
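
Note on the new _Traits_8_neon horizontal reductions: AArch64 NEON has no across-vector min/max instruction for 64-bit lanes, so the patch builds one from a lane swap (vextq), a lane-wise compare (vcltq/vcgtq), and a bitwise select (vbslq). The following is a minimal standalone sketch of that idiom, not part of the patch and not the library's code; the function and variable names (horizontal_min_s64, swapped, mask_lt) are illustrative, and it assumes an ARM64 compiler with arm_neon.h.

// Standalone sketch of the vext + compare + bsl horizontal-min idiom for int64x2_t.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Returns a vector whose lanes both hold the smaller of the two input lanes.
int64x2_t horizontal_min_s64(const int64x2_t cur) {
    const int64x2_t swapped  = vextq_s64(cur, cur, 1); // swap the two 64-bit lanes
    const uint64x2_t mask_lt = vcltq_s64(swapped, cur); // lane-wise "swapped < cur"
    return vbslq_s64(mask_lt, swapped, cur);            // pick the smaller value per lane
}

int main() {
    const int64_t data[2] = {42, -7};
    const int64x2_t v     = vld1q_s64(data);
    const int64x2_t m     = horizontal_min_s64(v);
    std::printf("min = %lld\n", static_cast<long long>(vgetq_lane_s64(m, 0))); // prints -7
    return 0;
}

The unsigned variants (_H_min_u/_H_max_u) follow the same shape after reinterpreting the register as uint64x2_t and using the unsigned compares.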
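Note on the _Unrolled path of _Minmax_impl: when the ARM64 dispatcher sees at least 32 bytes, it instantiates the implementation with two lanes per iteration, keeping an independent running minimum and maximum per lane and folding the lanes together only once at the end. The sketch below models that two-accumulator strategy with plain int64_t values instead of NEON registers; it is a hedged illustration under stated assumptions (at least two elements, names such as minmax_unrolled and lanes are invented for the example), not the library's implementation.

// Scalar model of the two-accumulator (unrolled) min/max reduction.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <utility>

std::pair<int64_t, int64_t> minmax_unrolled(const int64_t* first, const int64_t* last) {
    constexpr size_t lanes = 2; // two independent accumulators per iteration
    const size_t count     = static_cast<size_t>(last - first);
    assert(count >= lanes); // mirrors the >= 32 byte threshold in the dispatcher

    // Seed both accumulators from the first "vector" worth of data.
    int64_t cur_min[lanes] = {first[0], first[1]};
    int64_t cur_max[lanes] = {first[0], first[1]};

    // Main unrolled loop: each accumulator tracks its own running min/max,
    // so the two updates per iteration do not depend on each other.
    size_t idx = lanes;
    for (; idx + lanes <= count; idx += lanes) {
        for (size_t lane = 0; lane < lanes; ++lane) {
            const int64_t v = first[idx + lane];
            if (v < cur_min[lane]) {
                cur_min[lane] = v;
            }
            if (v > cur_max[lane]) {
                cur_max[lane] = v;
            }
        }
    }

    // Leftover element, handled scalarly here (the real code uses a one-vector
    // tail plus a masked or scalar remainder instead).
    for (; idx < count; ++idx) {
        const int64_t v = first[idx];
        if (v < cur_min[0]) {
            cur_min[0] = v;
        }
        if (v > cur_max[0]) {
            cur_max[0] = v;
        }
    }

    // Merge the per-lane accumulators, like the final _Min/_Max folds before _H_min/_H_max.
    const int64_t result_min = cur_min[0] < cur_min[1] ? cur_min[0] : cur_min[1];
    const int64_t result_max = cur_max[0] > cur_max[1] ? cur_max[0] : cur_max[1];
    return {result_min, result_max};
}

Keeping two accumulators widens the dependency chain of the reduction, which is the usual reason to unroll: the minimum/maximum updates of consecutive vectors no longer serialize on a single register.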