From 8a8dda817ae84d7ca24f1211a24118aa4de5ad40 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej"
Date: Fri, 1 Dec 2023 13:32:51 -0800
Subject: [PATCH] ``: Fix ARM64EC and CHPE codegen (#4222)

---
 stl/inc/__msvc_bit_utils.hpp | 24 +++++------
 stl/inc/__msvc_int128.hpp    |  6 +--
 stl/inc/atomic               | 77 ++++++++++++------------------
 stl/inc/bit                  |  6 +--
 stl/inc/xatomic.h            |  9 +++--
 stl/inc/xcharconv_ryu.h      | 44 +++++++++++++--------
 6 files changed, 75 insertions(+), 91 deletions(-)

diff --git a/stl/inc/__msvc_bit_utils.hpp b/stl/inc/__msvc_bit_utils.hpp
index 4d29b905e1..febf026e59 100644
--- a/stl/inc/__msvc_bit_utils.hpp
+++ b/stl/inc/__msvc_bit_utils.hpp
@@ -14,8 +14,8 @@
 
 #include _STL_INTRIN_HEADER // TRANSITION, GH-2129, move down to _Arm64_popcount
 
-#if (defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
-    && !defined(__INTEL_COMPILER) && !defined(__clang__) // TRANSITION, LLVM-51488
+#if (defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)) && !defined(_M_CEE_PURE) \
+    && !defined(__CUDACC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) // TRANSITION, LLVM-51488
 #define _HAS_NEON_INTRINSICS 1
 #else // ^^^ intrinsics available / intrinsics unavailable vvv
 #define _HAS_NEON_INTRINSICS 0
@@ -70,7 +70,7 @@ _NODISCARD constexpr int _Countl_zero_fallback(_Ty _Val) noexcept {
 #endif // ^^^ intrinsics unavailable ^^^
 
 #if _HAS_COUNTL_ZERO_INTRINSICS
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
 template <class _Ty>
 _NODISCARD int _Countl_zero_lzcnt(const _Ty _Val) noexcept {
     constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
@@ -136,9 +136,9 @@ _NODISCARD int _Checked_x86_x64_countl_zero(const _Ty _Val) noexcept {
     }
 #endif // ^^^ !defined(__AVX2__) ^^^
 }
-#endif // defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#endif // (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
 
-#if defined(_M_ARM) || defined(_M_ARM64)
+#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #ifdef __clang__ // TRANSITION, GH-1586
 _NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned short _Val) {
     return __builtin_clzs(_Val);
@@ -179,7 +179,7 @@ _NODISCARD int _Checked_arm_arm64_countl_zero(const _Ty _Val) noexcept {
     }
 #endif // TRANSITION, GH-1586
 }
-#endif // defined(_M_ARM) || defined(_M_ARM64)
+#endif // defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #endif // _HAS_COUNTL_ZERO_INTRINSICS
 
 // Implementation of countr_zero without using specialized CPU instructions.
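The hunks above only re-route which helper the preprocessor selects on ARM64EC and CHPE (_M_HYBRID_X86_ARM64); the portable path, _Countl_zero_fallback, is untouched and its body is not shown in this diff. As a rough, standalone sketch of what such a fallback computes (the name and loop structure below are ours, not the STL's):

    #include <cstdint>
    #include <limits>

    // Simplified stand-in for a leading-zero-count fallback: probe bits from the most
    // significant end until the first set bit is found.
    template <class UInt>
    constexpr int countl_zero_fallback_sketch(const UInt val) noexcept {
        constexpr int digits = std::numeric_limits<UInt>::digits;
        for (int bit = digits - 1; bit >= 0; --bit) {
            if ((val >> bit) & UInt{1}) {
                return digits - 1 - bit; // zeros above the highest set bit
            }
        }
        return digits; // val == 0: every bit is zero
    }

    static_assert(countl_zero_fallback_sketch<std::uint8_t>(0x10) == 3);
    static_assert(countl_zero_fallback_sketch<std::uint32_t>(0) == 32);

The header's real dispatch is the _Checked_x86_x64_countl_zero / _Checked_arm_arm64_countl_zero pair gated above; a fallback like this only runs when neither intrinsic path is enabled.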
@@ -196,14 +196,14 @@ _NODISCARD constexpr int _Countr_zero_fallback(const _Ty _Val) noexcept {
 template <class _Ty>
 _NODISCARD constexpr int _Popcount_fallback(_Ty _Val) noexcept {
     constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
-#if defined(_M_IX86) || defined(_M_ARM)
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || defined(_M_ARM)
     if constexpr (_Digits == 64) {
         // 64-bit bit operations on architectures without 64-bit registers are less efficient,
         // hence we split the value so that it fits in 32-bit registers
         return _Popcount_fallback(static_cast<unsigned int>(_Val))
             + _Popcount_fallback(static_cast<unsigned int>(_Val >> 32));
    }
-#endif // defined(_M_IX86) || defined(_M_ARM)
+#endif // (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || defined(_M_ARM)
     // we static_cast these bit patterns in order to truncate them to the correct size
     _Val = static_cast<_Ty>(_Val - ((_Val >> 1) & static_cast<_Ty>(0x5555'5555'5555'5555ull)));
     _Val = static_cast<_Ty>((_Val & static_cast<_Ty>(0x3333'3333'3333'3333ull))
@@ -215,8 +215,8 @@ _NODISCARD constexpr int _Popcount_fallback(_Ty _Val) noexcept {
     return static_cast<int>(_Val >> (_Digits - 8));
 }
 
-#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
-    && !defined(__INTEL_COMPILER)
+#if ((defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))) \
+    && !defined(_M_CEE_PURE) && !defined(__CUDACC__) && !defined(__INTEL_COMPILER)
 #define _HAS_TZCNT_BSF_INTRINSICS 1
 #else // ^^^ intrinsics available / intrinsics unavailable vvv
 #define _HAS_TZCNT_BSF_INTRINSICS 0
@@ -310,8 +310,8 @@ _NODISCARD int _Checked_x86_x64_countr_zero(const _Ty _Val) noexcept {
 
 #endif // _HAS_TZCNT_BSF_INTRINSICS
 
-#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
-    && !defined(__INTEL_COMPILER)
+#if ((defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))) \
+    && !defined(_M_CEE_PURE) && !defined(__CUDACC__) && !defined(__INTEL_COMPILER)
 #define _HAS_POPCNT_INTRINSICS 1
 #else // ^^^ intrinsics available / intrinsics unavailable vvv
 #define _HAS_POPCNT_INTRINSICS 0
diff --git a/stl/inc/__msvc_int128.hpp b/stl/inc/__msvc_int128.hpp
index 7c7443c407..146f2f85cb 100644
--- a/stl/inc/__msvc_int128.hpp
+++ b/stl/inc/__msvc_int128.hpp
@@ -56,15 +56,15 @@ template <class _Ty>
 _NODISCARD constexpr int _Countl_zero_internal(const _Ty _Val) noexcept {
     _STL_INTERNAL_STATIC_ASSERT(_Is_standard_unsigned_integer<_Ty>);
 #if _HAS_COUNTL_ZERO_INTRINSICS
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
     if (!_Is_constant_evaluated()) {
         return _Checked_x86_x64_countl_zero(_Val);
     }
-#elif defined(_M_ARM) || defined(_M_ARM64)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
     if (!_Is_constant_evaluated()) {
         return _Checked_arm_arm64_countl_zero(_Val);
     }
-#endif // defined(_M_ARM) || defined(_M_ARM64)
+#endif // defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #endif // _HAS_COUNTL_ZERO_INTRINSICS
 
     return _Countl_zero_fallback(_Val);
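The _Popcount_fallback hunk above carries the classic SWAR bit count: 2-bit partial sums, then 4-bit sums, then all bytes gathered into the top byte. A self-contained sketch of the same technique, specialized to uint64_t (the function name is ours; the masks are the ones visible in the hunk, and the final shift matches the `_Val >> (_Digits - 8)` context line):

    #include <cstdint>

    constexpr int popcount_sketch(std::uint64_t val) noexcept {
        val = val - ((val >> 1) & 0x5555'5555'5555'5555ull);                                  // 2-bit sums
        val = (val & 0x3333'3333'3333'3333ull) + ((val >> 2) & 0x3333'3333'3333'3333ull);     // 4-bit sums
        val = (val + (val >> 4)) & 0x0F0F'0F0F'0F0F'0F0Full;                                  // 8-bit sums
        return static_cast<int>((val * 0x0101'0101'0101'0101ull) >> 56); // add all bytes into the top byte
    }

    static_assert(popcount_sketch(0x0000'0000'0000'0000ull) == 0);
    static_assert(popcount_sketch(0x8000'0000'0000'0001ull) == 2);
    static_assert(popcount_sketch(0xFFFF'FFFF'FFFF'FFFFull) == 64);

On 32-bit x86 and ARM32 the fallback first splits a 64-bit value into two 32-bit halves, as the comment in the hunk explains; the new `&& !defined(_M_HYBRID_X86_ARM64)` test keeps CHPE off that 32-bit path.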
diff --git a/stl/inc/atomic b/stl/inc/atomic
index f1f55f1838..a984e04021 100644
--- a/stl/inc/atomic
+++ b/stl/inc/atomic
@@ -54,13 +54,13 @@ extern "C" _NODISCARD char __stdcall __std_atomic_has_cmpxchg16b() noexcept;
 
 // Controls whether ARM64 ldar/ldapr/stlr should be used
 #ifndef _STD_ATOMIC_USE_ARM64_LDAR_STLR
-#if defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #ifdef __clang__ // TRANSITION, LLVM-62103
 #define _STD_ATOMIC_USE_ARM64_LDAR_STLR 0
 #else // ^^^ Clang doesn't support new intrinsics / __load_acquire/__stlr intrinsics are available vvv
 #define _STD_ATOMIC_USE_ARM64_LDAR_STLR 1
 #endif // ^^^ __load_acquire/__stlr intrinsics are available ^^^
-#else // ^^^ ARM64/ARM64EC / Other architectures vvv
+#else // ^^^ ARM64/ARM64EC/HYBRID_X86_ARM64 / Other architectures vvv
 #define _STD_ATOMIC_USE_ARM64_LDAR_STLR 0
 #endif // ^^^ Other architectures ^^^
 #endif // _STD_ATOMIC_USE_ARM64_LDAR_STLR
@@ -112,11 +112,11 @@ extern "C" inline void _Check_memory_order(const unsigned int _Order) noexcept {
 // we avoid wrapping them in do {} while (0) because MSVC generates code for such loops
 // in debug mode.
 
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
 #define _ATOMIC_CHOOSE_INTRINSIC(_Order, _Result, _Intrinsic, ...) \
     _Check_memory_order(_Order);                                   \
     _Result = _Intrinsic(__VA_ARGS__)
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #define _ATOMIC_CHOOSE_INTRINSIC(_Order, _Result, _Intrinsic, ...) \
     switch (_Order) {                                              \
     case _Atomic_memory_order_relaxed:                             \
@@ -238,11 +238,12 @@ extern "C" inline void _Check_memory_order(const unsigned int _Order) noexcept {
 #define _ATOMIC_STORE_SEQ_CST(_Width, _Ptr, _Desired) _ATOMIC_STORE_SEQ_CST_ARM(_Width, (_Ptr), (_Desired))
 #define _ATOMIC_STORE_32_SEQ_CST(_Ptr, _Desired)      _ATOMIC_STORE_SEQ_CST_ARM(32, (_Ptr), (_Desired))
 #define _ATOMIC_STORE_64_SEQ_CST(_Ptr, _Desired)      _ATOMIC_STORE_SEQ_CST_ARM(64, (_Ptr), (_Desired))
-#elif defined(_M_ARM64) || defined(_M_ARM64EC) // ^^^ ARM32 / ARM64/ARM64EC vvv
+#elif defined(_M_ARM64) || defined(_M_ARM64EC) \
+    || defined(_M_HYBRID_X86_ARM64) // ^^^ ARM32 / ARM64/ARM64EC/HYBRID_X86_ARM64 vvv
 #define _ATOMIC_STORE_SEQ_CST(_Width, _Ptr, _Desired) _ATOMIC_STORE_SEQ_CST_ARM64(_Width, (_Ptr), (_Desired))
 #define _ATOMIC_STORE_32_SEQ_CST(_Ptr, _Desired)      _ATOMIC_STORE_SEQ_CST_ARM64(32, (_Ptr), (_Desired))
 #define _ATOMIC_STORE_64_SEQ_CST(_Ptr, _Desired)      _ATOMIC_STORE_SEQ_CST_ARM64(64, (_Ptr), (_Desired))
-#elif defined(_M_IX86) || defined(_M_X64) // ^^^ ARM64/ARM64EC / x86/x64 vvv
+#elif defined(_M_IX86) || defined(_M_X64) // ^^^ ARM64/ARM64EC/HYBRID_X86_ARM64 / x86/x64 vvv
 #define _ATOMIC_STORE_SEQ_CST(_Width, _Ptr, _Desired) _ATOMIC_STORE_SEQ_CST_X86_X64(_Width, (_Ptr), (_Desired))
 #define _ATOMIC_STORE_32_SEQ_CST(_Ptr, _Desired)      _ATOMIC_STORE_32_SEQ_CST_X86_X64((_Ptr), (_Desired))
 #ifdef _M_IX86
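The _ATOMIC_CHOOSE_INTRINSIC hunk above selects, on ARM-family targets, among the relaxed/acquire/release/full-barrier flavors of one interlocked intrinsic by switching on the runtime order. A compiler-neutral sketch of that dispatch shape, with std::atomic standing in for the `_nf`/`_acq`/`_rel` intrinsic family (the mapping comments reflect the suffixes the macro pastes together; the function below is ours, not the STL's):

    #include <atomic>

    // One operation, four strengths, chosen by a runtime memory_order,
    // mirroring the switch inside _ATOMIC_CHOOSE_INTRINSIC.
    inline long exchange_with_order(std::atomic<long>& target, const long value, const std::memory_order order) {
        switch (order) {
        case std::memory_order_relaxed:
            return target.exchange(value, std::memory_order_relaxed); // the _nf ("no fence") flavor
        case std::memory_order_consume:
        case std::memory_order_acquire:
            return target.exchange(value, std::memory_order_acquire); // the _acq flavor
        case std::memory_order_release:
            return target.exchange(value, std::memory_order_release); // the _rel flavor
        default:
            return target.exchange(value, std::memory_order_seq_cst); // the plain, full-barrier intrinsic
        }
    }

On x86/x64 the macro collapses to `_Check_memory_order` plus the plain intrinsic, because every interlocked operation there already carries a full barrier.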
@@ -263,7 +264,7 @@ extern "C" inline void _Atomic_thread_fence(const unsigned int _Order) noexcept
         return;
     }
 
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
     _Compiler_barrier();
     if (_Order == _Atomic_memory_order_seq_cst) {
         volatile long _Guard; // Not initialized to avoid an unnecessary operation; the value does not matter
@@ -274,9 +275,9 @@ extern "C" inline void _Atomic_thread_fence(const unsigned int _Order) noexcept
         (void) _InterlockedIncrement(&_Guard);
         _Compiler_barrier();
     }
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
     _Memory_barrier();
-#else // ^^^ ARM32/ARM64/ARM64EC / unsupported hardware vvv
+#else // ^^^ ARM32/ARM64/ARM64EC/HYBRID_X86_ARM64 / unsupported hardware vvv
 #error Unsupported hardware
 #endif // ^^^ unsupported hardware ^^^
 }
@@ -523,7 +524,7 @@ void _Atomic_wait_direct(
 
 #if 1 // TRANSITION, ABI, GH-1151
 inline void _Atomic_lock_acquire(long& _Spinlock) noexcept {
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
     // Algorithm from Intel(R) 64 and IA-32 Architectures Optimization Reference Manual, May 2020
     // Example 2-4. Contended Locks with Increasing Back-off Example - Improved Version, page 2-22
     // The code in mentioned manual is covered by the 0BSD license.
@@ -537,25 +538,25 @@ inline void _Atomic_lock_acquire(long& _Spinlock) noexcept {
             _Current_backoff = _Current_backoff < _Max_backoff ? _Current_backoff << 1 : _Max_backoff;
         }
     }
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
     while (_InterlockedExchange(&_Spinlock, 1) != 0) { // TRANSITION, GH-1133: _InterlockedExchange_acq
         while (__iso_volatile_load32(&reinterpret_cast<int&>(_Spinlock)) != 0) {
             __yield();
         }
     }
-#else // ^^^ defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) ^^^
+#else // ^^^ defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64) ^^^
 #error Unsupported hardware
 #endif
 }
 
 inline void _Atomic_lock_release(long& _Spinlock) noexcept {
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
     _InterlockedExchange(&_Spinlock, 0); // TRANSITION, GH-1133: same as ARM
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
     _Memory_barrier();
     __iso_volatile_store32(reinterpret_cast<int*>(&_Spinlock), 0);
     _Memory_barrier(); // TRANSITION, GH-1133: remove
-#else // ^^^ defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) ^^^
+#else // ^^^ defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64) ^^^
 #error Unsupported hardware
 #endif
 }
@@ -777,14 +778,8 @@ struct _Atomic_storage<_Ty, 1> { // lock-free using 1-byte intrinsics
         }
     }
 
-    _NODISCARD _TVal load() const noexcept { // load with sequential consistency
-        const auto _Mem = _Atomic_address_as<char>(_Storage);
-        char _As_bytes  = __iso_volatile_load8(_Mem);
-        _Compiler_or_memory_barrier();
-        return reinterpret_cast<_TVal&>(_As_bytes);
-    }
-
-    _NODISCARD _TVal load(const memory_order _Order) const noexcept { // load with given memory order
+    _NODISCARD _TVal load(
+        const memory_order _Order = memory_order_seq_cst) const noexcept { // load with given memory order
         const auto _Mem = _Atomic_address_as<char>(_Storage);
         char _As_bytes;
 #if _STD_ATOMIC_USE_ARM64_LDAR_STLR == 1
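The _Atomic_storage hunk above, and the 2-, 4-, and 8-byte ones that follow, all make the same change: the separate sequentially consistent `load()` overload disappears, and the remaining `load` gains a defaulted `memory_order_seq_cst` argument so both call forms funnel into one body whose barriers are chosen per architecture. A minimal sketch of that shape of refactoring (names hypothetical, std::atomic used in place of the raw `__iso_volatile_load*` / `__load_acquire` machinery):

    #include <atomic>

    template <class T>
    struct atomic_storage_sketch {
        // One member instead of load() + load(memory_order): the default argument preserves
        // the old zero-argument, sequentially consistent call form.
        T load(const std::memory_order order = std::memory_order_seq_cst) const noexcept {
            return value_.load(order);
        }

        std::atomic<T> value_{};
    };

    int main() {
        atomic_storage_sketch<int> storage;
        (void) storage.load();                          // previously the load() overload
        (void) storage.load(std::memory_order_acquire); // previously load(memory_order)
    }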
@@ -885,14 +880,8 @@ struct _Atomic_storage<_Ty, 2> { // lock-free using 2-byte intrinsics
         }
     }
 
-    _NODISCARD _TVal load() const noexcept { // load with sequential consistency
-        const auto _Mem = _Atomic_address_as<short>(_Storage);
-        short _As_bytes = __iso_volatile_load16(_Mem);
-        _Compiler_or_memory_barrier();
-        return reinterpret_cast<_TVal&>(_As_bytes);
-    }
-
-    _NODISCARD _TVal load(const memory_order _Order) const noexcept { // load with given memory order
+    _NODISCARD _TVal load(
+        const memory_order _Order = memory_order_seq_cst) const noexcept { // load with given memory order
         const auto _Mem = _Atomic_address_as<short>(_Storage);
         short _As_bytes;
 #if _STD_ATOMIC_USE_ARM64_LDAR_STLR == 1
@@ -992,14 +981,8 @@ struct _Atomic_storage<_Ty, 4> { // lock-free using 4-byte intrinsics
         }
     }
 
-    _NODISCARD _TVal load() const noexcept { // load with sequential consistency
-        const auto _Mem = _Atomic_address_as<int>(_Storage);
-        int _As_bytes   = __iso_volatile_load32(_Mem);
-        _Compiler_or_memory_barrier();
-        return reinterpret_cast<_TVal&>(_As_bytes);
-    }
-
-    _NODISCARD _TVal load(const memory_order _Order) const noexcept { // load with given memory order
+    _NODISCARD _TVal load(
+        const memory_order _Order = memory_order_seq_cst) const noexcept { // load with given memory order
         const auto _Mem = _STD _Atomic_address_as<int>(_Storage);
         int _As_bytes;
 #if _STD_ATOMIC_USE_ARM64_LDAR_STLR == 1
@@ -1100,18 +1083,8 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
         }
     }
 
-    _NODISCARD _TVal load() const noexcept { // load with sequential consistency
-        const auto _Mem = _Atomic_address_as<long long>(_Storage);
-#ifdef _M_ARM
-        long long _As_bytes = __ldrexd(_Mem);
-#else
-        long long _As_bytes = __iso_volatile_load64(_Mem);
-#endif
-        _Compiler_or_memory_barrier();
-        return reinterpret_cast<_TVal&>(_As_bytes);
-    }
-
-    _NODISCARD _TVal load(const memory_order _Order) const noexcept { // load with given memory order
+    _NODISCARD _TVal load(
+        const memory_order _Order = memory_order_seq_cst) const noexcept { // load with given memory order
         const auto _Mem = _STD _Atomic_address_as<long long>(_Storage);
         long long _As_bytes;
 #if _STD_ATOMIC_USE_ARM64_LDAR_STLR == 1
@@ -1119,7 +1092,7 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
 #else // ^^^ _STD_ATOMIC_USE_ARM64_LDAR_STLR == 1 / _STD_ATOMIC_USE_ARM64_LDAR_STLR != 1 vvv
 
 #ifdef _M_ARM
-        _As_bytes = __ldrexd(_Mem);
+        _As_bytes = __ldrexd(_Mem);
 #else
         _As_bytes = __iso_volatile_load64(_Mem);
 #endif
diff --git a/stl/inc/bit b/stl/inc/bit
index e07ebe79c4..ca62fadab6 100644
--- a/stl/inc/bit
+++ b/stl/inc/bit
@@ -191,15 +191,15 @@ _NODISCARD constexpr _Ty rotr(const _Ty _Val, const int _Rotation) noexcept {
 _EXPORT_STD template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> /* = 0 */>
 _NODISCARD constexpr int countl_zero(const _Ty _Val) noexcept {
 #if _HAS_COUNTL_ZERO_INTRINSICS
-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) || (defined(_M_X64) && !defined(_M_ARM64EC))
     if (!_STD is_constant_evaluated()) {
         return _Checked_x86_x64_countl_zero(_Val);
     }
-#elif defined(_M_ARM) || defined(_M_ARM64)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
     if (!_STD is_constant_evaluated()) {
         return _Checked_arm_arm64_countl_zero(_Val);
     }
-#endif // defined(_M_ARM) || defined(_M_ARM64)
+#endif // defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #endif // _HAS_COUNTL_ZERO_INTRINSICS
 
     return _Countl_zero_fallback(_Val);
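For the `<bit>` hunk above, the observable behavior of countl_zero is identical on every branch; only the instruction sequence differs. A small usage example in plain C++20, nothing MSVC-specific (the helper name is ours):

    #include <bit>
    #include <cstdint>

    static_assert(std::countl_zero(std::uint32_t{0}) == 32); // all bits clear
    static_assert(std::countl_zero(std::uint32_t{1}) == 31);
    static_assert(std::countl_zero(std::uint16_t{0x00FF}) == 8);

    // A typical derived quantity: bits needed to represent x (compare std::bit_width).
    constexpr int bit_width_sketch(const std::uint32_t x) noexcept {
        return 32 - std::countl_zero(x);
    }
    static_assert(bit_width_sketch(0b1000) == 4);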
diff --git a/stl/inc/xatomic.h b/stl/inc/xatomic.h
index 8e9598e28e..daf370b761 100644
--- a/stl/inc/xatomic.h
+++ b/stl/inc/xatomic.h
@@ -23,7 +23,8 @@ _STL_DISABLE_CLANG_WARNINGS
 #define _CONCAT(x, y) _CONCATX(x, y)
 
 // Interlocked intrinsic mapping for _nf/_acq/_rel
-#if defined(_M_CEE_PURE) || defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+#if defined(_M_CEE_PURE) || (defined(_M_IX86) && !defined(_M_HYBRID_X86_ARM64)) \
+    || (defined(_M_X64) && !defined(_M_ARM64EC))
 #define _INTRIN_RELAXED(x) x
 #define _INTRIN_ACQUIRE(x) x
 #define _INTRIN_RELEASE(x) x
@@ -34,7 +35,7 @@ _STL_DISABLE_CLANG_WARNINGS
 #define _YIELD_PROCESSOR() _mm_pause()
 #endif // ^^^ !defined(_M_CEE_PURE) ^^^
 
-#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #define _INTRIN_RELAXED(x) _CONCAT(x, _nf)
 #define _INTRIN_ACQUIRE(x) _CONCAT(x, _acq)
 #define _INTRIN_RELEASE(x) _CONCAT(x, _rel)
@@ -43,7 +44,7 @@ _STL_DISABLE_CLANG_WARNINGS
 #define _INTRIN_ACQ_REL(x) x
 #define _YIELD_PROCESSOR() __yield()
 
-#else // ^^^ ARM32/ARM64 / unsupported hardware vvv
+#else // ^^^ ARM32/ARM64/ARM64EC/HYBRID_X86_ARM64 / unsupported hardware vvv
 #error Unsupported hardware
 #endif // hardware
 
@@ -54,7 +55,7 @@ _STL_DISABLE_CLANG_WARNINGS
 // Also: if any macros are added they should be #undefed in vcruntime as well.
 #define _Compiler_barrier() _STL_DISABLE_DEPRECATED_WARNING _ReadWriteBarrier() _STL_RESTORE_DEPRECATED_WARNING
 
-#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #define _Memory_barrier()             __dmb(0xB) // inner shared data memory barrier
 #define _Compiler_or_memory_barrier() _Memory_barrier()
 #elif defined(_M_IX86) || defined(_M_X64)
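The `_INTRIN_ACQUIRE`/`_CONCAT` machinery above builds names such as `_InterlockedExchange_acq` by token pasting. A self-contained sketch of the mechanism, with a hypothetical stand-in function instead of a real intrinsic:

    #include <cstdio>

    #define SKETCH_CONCATX(x, y) x##y
    #define SKETCH_CONCAT(x, y)  SKETCH_CONCATX(x, y)
    #define SKETCH_INTRIN_ACQUIRE(x) SKETCH_CONCAT(x, _acq)

    // Hypothetical stand-in for an _acq-suffixed interlocked intrinsic.
    long do_exchange_acq(long* const target, const long value) {
        const long old = *target;
        *target        = value;
        return old;
    }

    int main() {
        long storage = 1;
        // Expands to do_exchange_acq(&storage, 2), just as _INTRIN_ACQUIRE(_InterlockedExchange)(...)
        // expands to _InterlockedExchange_acq(...) on ARM-family targets.
        const long previous = SKETCH_INTRIN_ACQUIRE(do_exchange)(&storage, 2);
        std::printf("%ld %ld\n", previous, storage); // prints "1 2"
    }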
diff --git a/stl/inc/xcharconv_ryu.h b/stl/inc/xcharconv_ryu.h
index 343a09226b..0e97cb1231 100644
--- a/stl/inc/xcharconv_ryu.h
+++ b/stl/inc/xcharconv_ryu.h
@@ -48,14 +48,18 @@
 #include
 #include
 
-#if defined(_M_X64) && !defined(_M_ARM64EC)
+#if defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 #define _HAS_CHARCONV_INTRINSICS 1
 #else // ^^^ intrinsics available / intrinsics unavailable vvv
 #define _HAS_CHARCONV_INTRINSICS 0
 #endif // ^^^ intrinsics unavailable ^^^
 
 #if _HAS_CHARCONV_INTRINSICS
-#include _STL_INTRIN_HEADER // for _umul128() and __shiftright128()
+#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
+#include // TRANSITION, VSO-1918426
+#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64) / defined(_M_X64) vvv
+#include _STL_INTRIN_HEADER // for _umul128(), __umulh(), and __shiftright128()
+#endif // ^^^ defined(_M_X64) ^^^
 #endif // ^^^ intrinsics available ^^^
 
 #pragma pack(push, _CRT_PACKING)
@@ -145,19 +149,12 @@ inline constexpr int __DOUBLE_POW5_BITCOUNT = 121;
 
 #if _HAS_CHARCONV_INTRINSICS
 _NODISCARD inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
+#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
+    *__productHi = __umulh(__a, __b);
+    return __a * __b;
+#else // ^^^ not native X64 / native X64 vvv
     return _umul128(__a, __b, __productHi);
-}
-
-_NODISCARD inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
-    // For the __shiftright128 intrinsic, the shift value is always
-    // modulo 64.
-    // In the current implementation of the double-precision version
-    // of Ryu, the shift value is always < 64.
-    // (The shift value is in the range [49, 58].)
-    // Check this here in case a future change requires larger shift
-    // values. In this case this function needs to be adjusted.
-    _STL_INTERNAL_CHECK(__dist < 64);
-    return __shiftright128(__lo, __hi, static_cast<unsigned char>(__dist));
+#endif // defined(_M_ARM64) || defined(_M_ARM64EC) || defined(_M_HYBRID_X86_ARM64)
 }
 
 #else // ^^^ intrinsics available / intrinsics unavailable vvv
@@ -193,20 +190,33 @@ _NODISCARD __forceinline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
     return __pLo;
 }
 
+#endif // ^^^ intrinsics unavailable ^^^
+
 _NODISCARD inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
+#if defined(_M_X64) && !defined(_M_ARM64EC)
+    // For the __shiftright128 intrinsic, the shift value is always
+    // modulo 64.
+    // In the current implementation of the double-precision version
+    // of Ryu, the shift value is always < 64.
+    // (The shift value is in the range [49, 58].)
+    // Check this here in case a future change requires larger shift
+    // values. In this case this function needs to be adjusted.
+    _STL_INTERNAL_CHECK(__dist < 64);
+    return __shiftright128(__lo, __hi, static_cast<unsigned char>(__dist));
+#else // ^^^ defined(_M_X64) && !defined(_M_ARM64EC) / !defined(_M_X64) || defined(_M_ARM64EC) vvv
     // We don't need to handle the case __dist >= 64 here (see above).
     _STL_INTERNAL_CHECK(__dist < 64);
-#ifdef _WIN64
+#if defined(_WIN64) || defined(_M_HYBRID_X86_ARM64)
     _STL_INTERNAL_CHECK(__dist > 0);
     return (__hi << (64 - __dist)) | (__lo >> __dist);
-#else // ^^^ 64-bit / 32-bit vvv
+#else // ^^^ 64-bit or _M_HYBRID_X86_ARM64 / 32-bit vvv
     // Avoid a 64-bit shift by taking advantage of the range of shift values.
     _STL_INTERNAL_CHECK(__dist >= 32);
     return (__hi << (64 - __dist)) | (static_cast<uint32_t>(__lo >> 32) >> (__dist - 32));
 #endif // ^^^ 32-bit ^^^
+#endif // defined(_M_X64) && !defined(_M_ARM64EC)
 }
-#endif // ^^^ intrinsics unavailable ^^^
 
 #ifndef _WIN64
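The last two hunks teach `__ryu_umul128` to pair `__umulh` (high 64 bits) with a plain multiply on ARM64-family targets, and split `__ryu_shiftright128` into an `__shiftright128` branch for native x64 and a shift/or fallback elsewhere. A portable sketch of both helpers, with the high half built from 32-bit pieces so it runs on any compiler (the Ryu fallback `__ryu_umul128` shown above does essentially the same; the names below are ours):

    #include <cstdint>

    // 64 x 64 -> 128 multiply: return the low 64 bits, store the high 64 bits through 'hi'.
    constexpr std::uint64_t umul128_sketch(const std::uint64_t a, const std::uint64_t b, std::uint64_t* const hi) {
        const std::uint64_t a_lo = a & 0xFFFF'FFFFull;
        const std::uint64_t a_hi = a >> 32;
        const std::uint64_t b_lo = b & 0xFFFF'FFFFull;
        const std::uint64_t b_hi = b >> 32;

        const std::uint64_t lo_lo = a_lo * b_lo;
        const std::uint64_t hi_lo = a_hi * b_lo;
        const std::uint64_t lo_hi = a_lo * b_hi;
        const std::uint64_t hi_hi = a_hi * b_hi;

        const std::uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFF'FFFFull) + lo_hi; // cannot overflow
        *hi = (hi_lo >> 32) + (cross >> 32) + hi_hi;
        return (cross << 32) | (lo_lo & 0xFFFF'FFFFull);
    }

    // Shift the 128-bit value hi:lo right by 0 < dist < 64 and keep the low 64 bits,
    // matching the contract asserted by _STL_INTERNAL_CHECK in the hunk above.
    constexpr std::uint64_t shiftright128_sketch(const std::uint64_t lo, const std::uint64_t hi, const std::uint32_t dist) {
        return (hi << (64 - dist)) | (lo >> dist);
    }

    static_assert([] {
        std::uint64_t hi        = 0;
        const std::uint64_t lo  = umul128_sketch(0xFFFF'FFFF'FFFF'FFFFull, 0xFFFF'FFFF'FFFF'FFFFull, &hi);
        return hi == 0xFFFF'FFFF'FFFF'FFFEull && lo == 1; // (2^64 - 1)^2
    }());
    static_assert(shiftright128_sketch(0, 1, 1) == 0x8000'0000'0000'0000ull);

On ARM64, ARM64EC, and CHPE the patch instead gets the high half from the single `__umulh` intrinsic, which is the point of enabling `_HAS_CHARCONV_INTRINSICS` on those targets.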