diff --git a/stl/inc/atomic b/stl/inc/atomic
index 8226695a92b..96e2c387467 100644
--- a/stl/inc/atomic
+++ b/stl/inc/atomic
@@ -30,25 +30,9 @@ _STL_DISABLE_CLANG_WARNINGS
 #if defined(_M_ARM) || defined(_M_ARM64)
 #define _Memory_barrier() __dmb(0xB) // inner shared data memory barrier
 #define _Compiler_or_memory_barrier() _Memory_barrier()
-
-#define _ISO_VOLATILE_STORE8(_Storage, _Value) __iso_volatile_store8(_Atomic_address_as<char>(_Storage), _Value)
-#define _ISO_VOLATILE_STORE16(_Storage, _Value) __iso_volatile_store16(_Atomic_address_as<short>(_Storage), _Value)
-#define _ISO_VOLATILE_STORE32(_Storage, _Value) __iso_volatile_store32(_Atomic_address_as<int>(_Storage), _Value)
-#define _ISO_VOLATILE_STORE64(_Storage, _Value) __iso_volatile_store64(_Atomic_address_as<long long>(_Storage), _Value)
-#define _ISO_VOLATILE_LOAD8(_Storage) __iso_volatile_load8(_Atomic_address_as<const char>(_Storage))
-#define _ISO_VOLATILE_LOAD16(_Storage) __iso_volatile_load16(_Atomic_address_as<const short>(_Storage))
-
 #elif defined(_M_IX86) || defined(_M_X64)
 // x86/x64 hardware only emits memory barriers inside _Interlocked intrinsics
 #define _Compiler_or_memory_barrier() _Compiler_barrier()
-
-#define _ISO_VOLATILE_STORE8(_Storage, _Value) (*_Atomic_address_as<char>(_Storage) = _Value)
-#define _ISO_VOLATILE_STORE16(_Storage, _Value) (*_Atomic_address_as<short>(_Storage) = _Value)
-#define _ISO_VOLATILE_STORE32(_Storage, _Value) (*_Atomic_address_as<int>(_Storage) = _Value)
-#define _ISO_VOLATILE_STORE64(_Storage, _Value) (*_Atomic_address_as<long long>(_Storage) = _Value)
-#define _ISO_VOLATILE_LOAD8(_Storage) (*_Atomic_address_as<const char>(_Storage))
-#define _ISO_VOLATILE_LOAD16(_Storage) (*_Atomic_address_as<const short>(_Storage))
-
 #else // ^^^ x86/x64 / unsupported hardware vvv
 #error Unsupported hardware
 #endif // hardware
@@ -133,6 +117,39 @@ _NODISCARD extern "C" bool __cdecl __std_atomic_has_cmpxchg16b() noexcept;
 
 _STD_BEGIN
+// FENCES
+extern "C" inline void atomic_thread_fence(const memory_order _Order) noexcept {
+    if (_Order == memory_order_relaxed) {
+        return;
+    }
+
+#if defined(_M_IX86) || defined(_M_X64)
+    _Compiler_barrier();
+    if (_Order == memory_order_seq_cst) {
+        volatile long _Guard; // Not initialized to avoid an unnecessary operation; the value does not matter
+
+        // _mm_mfence could have been used, but it is not supported on older x86 CPUs and is slower on some recent CPUs.
+        // The memory fence provided by interlocked operations has some exceptions, but this is fine:
+        // std::atomic_thread_fence works with respect to other atomics only; it may not be a full fence for all ops.
+#pragma warning(suppress : 6001) // "Using uninitialized memory '_Guard'"
+#pragma warning(suppress : 28113) // "Accessing a local variable _Guard via an Interlocked function: This is an unusual
+                                  // usage which could be reconsidered."
+        (void) _InterlockedIncrement(&_Guard);
+        _Compiler_barrier();
+    }
+#elif defined(_M_ARM) || defined(_M_ARM64)
+    _Memory_barrier();
+#else // ^^^ ARM32/ARM64 / unsupported hardware vvv
+#error Unsupported hardware
+#endif // unsupported hardware
+}
+
+extern "C" inline void atomic_signal_fence(const memory_order _Order) noexcept {
+    if (_Order != memory_order_relaxed) {
+        _Compiler_barrier();
+    }
+}
+
 // FUNCTION TEMPLATE kill_dependency
 template <class _Ty>
 _Ty kill_dependency(_Ty _Arg) noexcept { // "magic" template that kills dependency ordering when called
@@ -417,14 +434,15 @@ struct _Atomic_storage<_Ty, 1> { // lock-free using 1-byte intrinsics
     }
 
     void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
+        const auto _Mem = _Atomic_address_as<char>(_Storage);
         const char _As_bytes = _Atomic_reinterpret_as<char>(_Value);
         switch (_Order) {
         case memory_order_relaxed:
-            _ISO_VOLATILE_STORE8(_Storage, _As_bytes);
+            __iso_volatile_store8(_Mem, _As_bytes);
             return;
         case memory_order_release:
             _Compiler_or_memory_barrier();
-            _ISO_VOLATILE_STORE8(_Storage, _As_bytes);
+            __iso_volatile_store8(_Mem, _As_bytes);
             return;
         default:
         case memory_order_consume:
@@ -439,13 +457,15 @@ struct _Atomic_storage<_Ty, 1> { // lock-free using 1-byte intrinsics
     }
 
     _NODISCARD _Ty load() const noexcept { // load with sequential consistency
-        char _As_bytes = _ISO_VOLATILE_LOAD8(_Storage);
+        const auto _Mem = _Atomic_address_as<char>(_Storage);
+        char _As_bytes = __iso_volatile_load8(_Mem);
         _Compiler_or_memory_barrier();
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
 
     _NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
-        char _As_bytes = _ISO_VOLATILE_LOAD8(_Storage);
+        const auto _Mem = _Atomic_address_as<char>(_Storage);
+        char _As_bytes = __iso_volatile_load8(_Mem);
         _Load_barrier(_Order);
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
@@ -496,14 +516,15 @@ struct _Atomic_storage<_Ty, 2> { // lock-free using 2-byte intrinsics
     }
 
    void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
+        const auto _Mem = _Atomic_address_as<short>(_Storage);
         const short _As_bytes = _Atomic_reinterpret_as<short>(_Value);
         switch (_Order) {
         case memory_order_relaxed:
-            _ISO_VOLATILE_STORE16(_Storage, _As_bytes);
+            __iso_volatile_store16(_Mem, _As_bytes);
             return;
         case memory_order_release:
             _Compiler_or_memory_barrier();
-            _ISO_VOLATILE_STORE16(_Storage, _As_bytes);
+            __iso_volatile_store16(_Mem, _As_bytes);
             return;
         default:
         case memory_order_consume:
@@ -518,13 +539,15 @@ struct _Atomic_storage<_Ty, 2> { // lock-free using 2-byte intrinsics
     }
 
     _NODISCARD _Ty load() const noexcept { // load with sequential consistency
-        short _As_bytes = _ISO_VOLATILE_LOAD16(_Storage);
+        const auto _Mem = _Atomic_address_as<short>(_Storage);
+        short _As_bytes = __iso_volatile_load16(_Mem);
         _Compiler_or_memory_barrier();
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
 
     _NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
-        short _As_bytes = _ISO_VOLATILE_LOAD16(_Storage);
+        const auto _Mem = _Atomic_address_as<short>(_Storage);
+        short _As_bytes = __iso_volatile_load16(_Mem);
         _Load_barrier(_Order);
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
@@ -565,7 +588,7 @@ struct _Atomic_storage<_Ty, 4> { // lock-free using 4-byte intrinsics
     void store(const _Ty _Value) noexcept { // store with sequential consistency
 #if defined(_M_ARM) || defined(_M_ARM64)
         _Memory_barrier();
-        _ISO_VOLATILE_STORE32(_Storage, _Atomic_reinterpret_as<int>(_Value));
+        __iso_volatile_store32(_Atomic_address_as<int>(_Storage), _Atomic_reinterpret_as<int>(_Value));
         _Memory_barrier();
 #else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv
         (void) _InterlockedExchange(_Atomic_address_as<long>(_Storage), _Atomic_reinterpret_as<long>(_Value));
@@ -573,14 +596,15 @@ struct _Atomic_storage<_Ty, 4> { // lock-free using 4-byte intrinsics
     }
 
     void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
+        const auto _Mem = _Atomic_address_as<int>(_Storage);
         const int _As_bytes = _Atomic_reinterpret_as<int>(_Value);
         switch (_Order) {
         case memory_order_relaxed:
-            _ISO_VOLATILE_STORE32(_Storage, _As_bytes);
+            __iso_volatile_store32(_Mem, _As_bytes);
             return;
         case memory_order_release:
             _Compiler_or_memory_barrier();
-            _ISO_VOLATILE_STORE32(_Storage, _As_bytes);
+            __iso_volatile_store32(_Mem, _As_bytes);
             return;
         default:
         case memory_order_consume:
@@ -595,13 +619,15 @@ struct _Atomic_storage<_Ty, 4> { // lock-free using 4-byte intrinsics
     }
 
     _NODISCARD _Ty load() const noexcept { // load with sequential consistency
-        auto _As_bytes = _ISO_VOLATILE_LOAD32(_Storage);
+        const auto _Mem = _Atomic_address_as<int>(_Storage);
+        auto _As_bytes = __iso_volatile_load32(_Mem);
         _Compiler_or_memory_barrier();
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
 
     _NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
-        auto _As_bytes = _ISO_VOLATILE_LOAD32(_Storage);
+        const auto _Mem = _Atomic_address_as<int>(_Storage);
+        auto _As_bytes = __iso_volatile_load32(_Mem);
         _Load_barrier(_Order);
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
@@ -639,18 +665,14 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
         // non-atomically initialize this atomic
     }
 
-#ifdef _M_IX86
-    void store(const _Ty _Value, const memory_order _Order = memory_order_seq_cst) noexcept {
-        // store with (effectively) sequential consistency
-        _Check_store_memory_order(_Order);
-        (void) exchange(_Value, _Order);
-    }
-#else // ^^^ _M_IX86 / !_M_IX86 vvv
-
     void store(const _Ty _Value) noexcept { // store with sequential consistency
         const auto _Mem = _Atomic_address_as<long long>(_Storage);
         const long long _As_bytes = _Atomic_reinterpret_as<long long>(_Value);
-#ifdef _M_ARM64
+#if defined(_M_IX86)
+        _Compiler_barrier();
+        __iso_volatile_store64(_Mem, _As_bytes);
+        _STD atomic_thread_fence(memory_order_seq_cst);
+#elif defined(_M_ARM64)
         _Memory_barrier();
         __iso_volatile_store64(_Mem, _As_bytes);
         _Memory_barrier();
@@ -660,14 +682,15 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
     }
 
     void store(const _Ty _Value, const memory_order _Order) noexcept { // store with given memory order
+        const auto _Mem = _Atomic_address_as<long long>(_Storage);
         const long long _As_bytes = _Atomic_reinterpret_as<long long>(_Value);
         switch (_Order) {
         case memory_order_relaxed:
-            _ISO_VOLATILE_STORE64(_Storage, _As_bytes);
+            __iso_volatile_store64(_Mem, _As_bytes);
             return;
         case memory_order_release:
             _Compiler_or_memory_barrier();
-            _ISO_VOLATILE_STORE64(_Storage, _As_bytes);
+            __iso_volatile_store64(_Mem, _As_bytes);
             return;
         default:
         case memory_order_consume:
@@ -680,33 +703,27 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics
             return;
         }
     }
-#endif // _M_IX86
 
     _NODISCARD _Ty load() const noexcept { // load with sequential consistency
-        const auto _Mem = _Atomic_address_as<const long long>(_Storage);
+        const auto _Mem = _Atomic_address_as<long long>(_Storage);
         long long _As_bytes;
-#if defined(_M_ARM)
+#ifdef _M_ARM
         _As_bytes = __ldrexd(_Mem);
         _Memory_barrier();
-#elif defined(_M_IX86) || defined(_M_ARM64)
+#else
         _As_bytes = __iso_volatile_load64(_Mem);
         _Compiler_or_memory_barrier();
-#else // _M_X64
-        _As_bytes = *_Mem;
-        _Compiler_barrier();
-#endif // hardware
+#endif
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
 
     _NODISCARD _Ty load(const memory_order _Order) const noexcept { // load with given memory order
-        const auto _Mem = _Atomic_address_as<const long long>(_Storage);
-#if defined(_M_ARM)
+        const auto _Mem = _Atomic_address_as<long long>(_Storage);
+#ifdef _M_ARM
         long long _As_bytes = __ldrexd(_Mem);
-#elif defined(_M_IX86) || defined(_M_ARM64)
+#else
         long long _As_bytes = __iso_volatile_load64(_Mem);
-#else // _M_X64
-        long long _As_bytes = *_Mem;
-#endif // hardware
+#endif
         _Load_barrier(_Order);
         return reinterpret_cast<_Ty&>(_As_bytes);
     }
@@ -1929,37 +1946,6 @@ _Ty atomic_fetch_xor_explicit(
     return _Mem->fetch_xor(_Value, _Order);
 }
 
-// FENCES
-extern "C" inline void atomic_thread_fence(const memory_order _Order) noexcept {
-    if (_Order == memory_order_relaxed) {
-        return;
-    }
-
-#if defined(_M_ARM) || defined(_M_ARM64)
-    _Memory_barrier();
-#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv
-    _Compiler_barrier();
-    if (_Order == memory_order_seq_cst) {
-        volatile long _Guard; // Not initialized to avoid an unnecessary operation; the value does not matter
-
-        // _mm_mfence could have been used, but it is not supported on older x86 CPUs and is slower on some recent CPUs.
-        // The memory fence provided by interlocked operations has some exceptions, but this is fine:
-        // std::atomic_thread_fence works with respect to other atomics only; it may not be a full fence for all ops.
-#pragma warning(suppress : 6001) // "Using uninitialized memory '_Guard'"
-#pragma warning(suppress : 28113) // "Accessing a local variable _Guard via an Interlocked function: This is an unusual
-                                  // usage which could be reconsidered."
-        (void) _InterlockedIncrement(&_Guard);
-        _Compiler_barrier();
-    }
-#endif // hardware
-}
-
-extern "C" inline void atomic_signal_fence(const memory_order _Order) noexcept {
-    if (_Order != memory_order_relaxed) {
-        _Compiler_barrier();
-    }
-}
-
 // ATOMIC TYPEDEFS
 using atomic_bool = atomic<bool>;
@@ -2119,13 +2105,7 @@ _STD_END
 
 #undef _ATOMIC_CHOOSE_INTRINSIC
 #undef _ATOMIC_HAS_DCAS
-#undef _ISO_VOLATILE_LOAD8
-#undef _ISO_VOLATILE_LOAD16
-// #undef _ISO_VOLATILE_LOAD32 // Used in <memory>
-#undef _ISO_VOLATILE_STORE8
-#undef _ISO_VOLATILE_STORE16
-#undef _ISO_VOLATILE_STORE32
-#undef _ISO_VOLATILE_STORE64
+
 #undef _STD_COMPARE_EXCHANGE_128
 #undef _INVALID_MEMORY_ORDER
 #undef _Compiler_or_memory_barrier
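Aside (not part of the patch): the x86/x64 branch of the relocated atomic_thread_fence above turns a compiler barrier into a full fence with a lock-prefixed interlocked operation on a dummy local instead of _mm_mfence. A minimal standalone sketch of that technique follows; it substitutes std::atomic_signal_fence for the header-internal _Compiler_barrier(), and the function name full_fence_sketch is illustrative only.

#include <atomic>
#include <intrin.h>

inline void full_fence_sketch() noexcept {
    std::atomic_signal_fence(std::memory_order_seq_cst); // compiler-only barrier; emits no instruction
    volatile long guard = 0; // the header leaves its _Guard uninitialized; initialized here only for clarity
    (void) _InterlockedIncrement(&guard); // lock-prefixed RMW acts as a full fence, avoiding mfence per the comment above
    std::atomic_signal_fence(std::memory_order_seq_cst);
}
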
diff --git a/stl/inc/chrono b/stl/inc/chrono
index 120f989cca2..f5d0b5354ae 100644
--- a/stl/inc/chrono
+++ b/stl/inc/chrono
@@ -604,6 +604,9 @@ namespace chrono {
         using time_point = chrono::time_point<steady_clock>;
         static constexpr bool is_steady = true;
 
+#pragma warning(push)
+#pragma warning(disable : 28112) // A variable which is accessed via an Interlocked function must
+                                 // always be accessed via an Interlocked function.
         _NODISCARD static time_point now() noexcept { // get current time
 #if (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) && !defined(_M_CEE_PURE)
             // Implement atomics avoiding <atomic> header dependency
@@ -623,10 +626,10 @@ namespace chrono {
             const long long _Freq = _Query_perf_frequency();
             const long long _Ctr = _Query_perf_counter();
             const long long _Result = _Scale_large_counter(_Ctr, _Freq);
-            if (_Atomic_compare_exchange_strong_ll_seq_cst(&_Cached_freq, _Freq, LLONG_MAX)) {
+            if (_InterlockedCompareExchange64(&_Cached_freq, _Freq, LLONG_MAX) == LLONG_MAX) {
                 // This is the first result, save current result as base for fast path
-                _Atomic_compare_exchange_strong_ll_seq_cst(&_Cached_ctr_base, _Ctr, LLONG_MAX);
-                _Atomic_compare_exchange_strong_ll_seq_cst(&_Cached_result_base, _Result, LLONG_MAX);
+                _InterlockedCompareExchange64(&_Cached_ctr_base, _Ctr, LLONG_MAX);
+                _InterlockedCompareExchange64(&_Cached_result_base, _Result, LLONG_MAX);
             }
             // if _Result is not saved as first, it is still compatible with fast result
             return time_point(duration(_Result));
@@ -636,6 +639,7 @@ namespace chrono {
             return time_point(duration(_Scale_large_counter(_Ctr, _Freq)));
 #endif // (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) && !defined(_M_CEE_PURE)
         }
+#pragma warning(pop)
 
     private:
         _NODISCARD static long long _Scale_large_counter(const long long _Ctr, const long long _Freq) noexcept {
diff --git a/stl/inc/memory b/stl/inc/memory
index 0b8d97156db..3c713204eec 100644
--- a/stl/inc/memory
+++ b/stl/inc/memory
@@ -514,8 +514,11 @@ public:
     bool _Incref_nz() noexcept { // increment use count if not zero, return true if successful
         auto& _Volatile_uses = reinterpret_cast<volatile long&>(_Uses);
-        long _Count = _ISO_VOLATILE_LOAD32(_Volatile_uses);
-
+#ifdef _M_CEE_PURE
+        long _Count = *_Atomic_address_as<long>(&_Volatile_uses);
+#else
+        long _Count = __iso_volatile_load32(reinterpret_cast<volatile int*>(&_Volatile_uses));
+#endif
         while (_Count != 0) {
             const long _Old_value = _INTRIN_RELAXED(_InterlockedCompareExchange)(&_Volatile_uses, _Count + 1, _Count);
             if (_Old_value == _Count) {
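Aside (not part of the patch): _Incref_nz above is the "increment only if currently nonzero" loop — a plain initial load followed by a compare-exchange that retries with the refreshed value. A standalone sketch of the same shape against std::atomic; the names try_increment_nonzero and counter are illustrative, not from the STL sources.

#include <atomic>

bool try_increment_nonzero(std::atomic<long>& counter) noexcept {
    long count = counter.load(std::memory_order_relaxed); // plain initial read, like the __iso_volatile_load32 above
    while (count != 0) {
        // On failure, compare_exchange_weak reloads `count`, so the loop re-checks the zero case.
        if (counter.compare_exchange_weak(count, count + 1, std::memory_order_relaxed)) {
            return true; // observed nonzero and bumped atomically
        }
    }
    return false; // already zero; the caller must not revive the object
}
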
diff --git a/stl/inc/xatomic.h b/stl/inc/xatomic.h
index 970d2fd4340..648ff2b5594 100644
--- a/stl/inc/xatomic.h
+++ b/stl/inc/xatomic.h
@@ -46,16 +46,6 @@ _STL_DISABLE_CLANG_WARNINGS
 #define _MT_INCR(x) _INTRIN_RELAXED(_InterlockedIncrement)(reinterpret_cast<volatile long*>(&x))
 #define _MT_DECR(x) _INTRIN_ACQ_REL(_InterlockedDecrement)(reinterpret_cast<volatile long*>(&x))
 
-#if defined(_M_CEE_PURE) || defined(_M_IX86) || defined(_M_X64)
-#define _ISO_VOLATILE_LOAD32(_Storage) (*_Atomic_address_as<const long>(_Storage))
-
-#elif defined(_M_ARM) || defined(_M_ARM64)
-#define _ISO_VOLATILE_LOAD32(_Storage) __iso_volatile_load32(_Atomic_address_as<const int>(_Storage))
-
-#else // ^^^ ARM32/ARM64 / unsupported hardware vvv
-#error Unsupported hardware
-#endif // hardware
-
 _STD_BEGIN
 
 #if _HAS_CXX20
@@ -104,24 +94,22 @@ _NODISCARD volatile _Integral* _Atomic_address_as(_Ty& _Source) noexcept {
     return &reinterpret_cast<volatile _Integral&>(_Source);
 }
 
+template <class _Integral, class _Ty>
+_NODISCARD const volatile _Integral* _Atomic_address_as(const _Ty& _Source) noexcept {
+    // gets a pointer to the argument as an integral type (to pass to intrinsics)
+    static_assert(is_integral_v<_Integral>, "Tried to reinterpret memory as non-integral");
+    return &reinterpret_cast<const volatile _Integral&>(_Source);
+}
+
 // FUNCTION TEMPLATE _Atomic_load_ll_relaxed
 #if (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) && !defined(_M_CEE_PURE)
 _NODISCARD inline long long _Atomic_load_ll_relaxed(volatile long long* _Mem) noexcept {
     // Copy from _Atomic_storage<_Ty, 8>::load
-#if defined(_M_IX86) || defined(_M_ARM64)
-    return __iso_volatile_load64(_Mem);
-#elif defined(_M_X64)
-    return *_Mem;
-#else // _M_ARM
+#ifdef _M_ARM
     return __ldrexd(_Mem);
-#endif // hardware
-}
-
-// FUNCTION TEMPLATE _Atomic_compare_exchange_strong_ll_seq_cst
-inline bool _Atomic_compare_exchange_strong_ll_seq_cst(
-    volatile long long* _Mem, long long _Value, long long _Comparand) noexcept {
-    // Copy from _Atomic_storage<_Ty, 8>::store
-    return _InterlockedCompareExchange64(_Mem, _Value, _Comparand) == _Comparand;
+#else
+    return __iso_volatile_load64(_Mem);
+#endif
 }
 #endif // (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)) && !defined(_M_CEE_PURE)
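Aside (not part of the patch): a usage-level sketch of the guarantee the seq_cst thread fence defined in this patch provides — the store-buffering pattern with relaxed operations. With both fences in place, at least one of the two loads must observe 1; this is exactly the property the lock-prefixed increment (x86/x64) and __dmb (ARM) paths exist to supply.

#include <atomic>
#include <cassert>
#include <thread>

std::atomic<int> x{0}, y{0};
int r1 = 0, r2 = 0;

void write_x_read_y() {
    x.store(1, std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_seq_cst); // the fence implemented above
    r1 = y.load(std::memory_order_relaxed);
}

void write_y_read_x() {
    y.store(1, std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_seq_cst);
    r2 = x.load(std::memory_order_relaxed);
}

int main() {
    std::thread a(write_x_read_y), b(write_y_read_x);
    a.join();
    b.join();
    assert(r1 == 1 || r2 == 1); // both loads reading 0 would contradict the fences' total order
}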