From e4b75dcd447ec7fdd640f8b28eb73ffe096c3832 Mon Sep 17 00:00:00 2001
From: Alex Guteniev
Date: Sun, 2 Aug 2020 03:54:07 +0300
Subject: [PATCH] Implement std::atomic::wait (#593)

Co-authored-by: Curtis J Bezault
Co-authored-by: Stephan T. Lavavej
Co-authored-by: Billy Robert O'Neal III
Co-authored-by: Casey Carter
---
 CMakeLists.txt                                  |   8 +-
 stl/CMakeLists.txt                              |  33 +-
 stl/inc/atomic                                  | 359 ++++++++++++++++--
 stl/inc/execution                               |  74 ++--
 stl/inc/memory                                  |  47 ++-
 stl/inc/xatomic_wait.h                          |  72 ++++
 stl/inc/yvals.h                                 |  16 +
 stl/inc/yvals_core.h                            |  16 +
 stl/msbuild/stl_atomic_wait/dirs.proj           |  15 +
 stl/msbuild/stl_atomic_wait/md/dirs.proj        |  17 +
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../msvcp_atomic_wait.settings.targets          |  90 +++++
 .../stl_atomic_wait/msvcprt_atomic_wait.rc      |  23 ++
 .../stl_atomic_wait.files.settings.targets      |  15 +
 stl/msbuild/stl_atomic_wait/xmd/dirs.proj       |  24 ++
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../msvcp_atomic_wait.nativeproj                |  15 +
 .../stl_base/stl.files.settings.targets         |   5 +-
 .../stl_post/msvcp_post.settings.targets        |   3 +
 stl/src/atomic_wait.cpp                         | 334 ++++++++++++++++
 stl/src/msvcp_atomic_wait.src                   |  25 ++
 stl/src/parallel_algorithms.cpp                 | 257 +------------
 tests/libcxx/expected_results.txt               |   1 -
 tests/libcxx/skipped_tests.txt                  |   1 -
 tests/std/include/test_atomic_wait.hpp          | 203 ++++++++++
 tests/std/test.lst                              |   2 +
 tests/std/tests/P1135R6_atomic_wait/env.lst     |   4 +
 tests/std/tests/P1135R6_atomic_wait/test.cpp    |  10 +
 .../tests/P1135R6_atomic_wait_vista/env.lst     |   4 +
 .../tests/P1135R6_atomic_wait_vista/test.cpp    |   9 +
 .../VSO_0157762_feature_test_macros/test.cpp    |  14 +
 36 files changed, 1467 insertions(+), 334 deletions(-)
 create mode 100644 stl/inc/xatomic_wait.h
 create mode 100644 stl/msbuild/stl_atomic_wait/dirs.proj
 create mode 100644 stl/msbuild/stl_atomic_wait/md/dirs.proj
 create mode 100644 stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/msbuild/stl_atomic_wait/msvcp_atomic_wait.settings.targets
 create mode 100644 stl/msbuild/stl_atomic_wait/msvcprt_atomic_wait.rc
 create mode 100644 stl/msbuild/stl_atomic_wait/stl_atomic_wait.files.settings.targets
 create mode 100644 stl/msbuild/stl_atomic_wait/xmd/dirs.proj
 create mode 100644 stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj
 create mode 100644 stl/src/atomic_wait.cpp
 create mode 100644 stl/src/msvcp_atomic_wait.src
 create mode 100644 tests/std/include/test_atomic_wait.hpp
 create mode 100644 tests/std/tests/P1135R6_atomic_wait/env.lst
 create mode 100644 tests/std/tests/P1135R6_atomic_wait/test.cpp
 create mode 100644 tests/std/tests/P1135R6_atomic_wait_vista/env.lst
 create mode 100644
tests/std/tests/P1135R6_atomic_wait_vista/test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ed6054821..bf5cf6d9dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,27 +32,27 @@ if("${VCLIBS_TARGET_ARCHITECTURE}" MATCHES "^[xX]86$") set(VCLIBS_X86_OR_X64 "x86") # Note that we set _WIN32_WINNT to a high level to make declarations available, but still engage downlevel # runtime dynamic linking by setting our own _STL_WIN32_WINNT back to Windows XP. - add_compile_definitions(_X86_ _VCRT_WIN32_WINNT=_WIN32_WINNT_WINXP _STL_WIN32_WINNT=_WIN32_WINNT_WINXP + add_compile_definitions(_X86_ _VCRT_WIN32_WINNT=0x0501 _STL_WIN32_WINNT=0x0501 _WIN32_WINNT=0x0602 NTDDI_VERSION=NTDDI_WIN8) add_compile_options(/arch:IA32) elseif(VCLIBS_TARGET_ARCHITECTURE MATCHES "^[xX]64$") set(VCLIBS_TARGET_ARCHITECTURE "x64") set(VCLIBS_I386_OR_AMD64 "amd64") set(VCLIBS_X86_OR_X64 "x64") - add_compile_definitions(_AMD64_ _VCRT_WIN32_WINNT=_WIN32_WINNT_WINXP _STL_WIN32_WINNT=_WIN32_WINNT_WINXP + add_compile_definitions(_AMD64_ _VCRT_WIN32_WINNT=0x0501 _STL_WIN32_WINNT=0x0501 _WIN32_WINNT=0x0602 NTDDI_VERSION=NTDDI_WIN8) elseif(VCLIBS_TARGET_ARCHITECTURE MATCHES "^[aA][rR][mM][vV]7$") set(VCLIBS_TARGET_ARCHITECTURE "arm") set(VCLIBS_I386_OR_AMD64 "arm") set(VCLIBS_X86_OR_X64 "arm") - add_compile_definitions(_ARM_ _VCRT_WIN32_WINNT=_WIN32_WINNT_WIN8 _STL_WIN32_WINNT=_WIN32_WINNT_WIN8 + add_compile_definitions(_ARM_ _VCRT_WIN32_WINNT=0x0602 _STL_WIN32_WINNT=0x0602 _WIN32_WINNT=0x0602 NTDDI_VERSION=NTDDI_WIN8) string(APPEND CMAKE_CXX_STANDARD_LIBRARIES " Synchronization.lib") elseif(VCLIBS_TARGET_ARCHITECTURE MATCHES "^[aA][rR][mM]64$") set(VCLIBS_TARGET_ARCHITECTURE "arm64") set(VCLIBS_I386_OR_AMD64 "arm64") set(VCLIBS_X86_OR_X64 "arm64") - add_compile_definitions(_ARM64_ _VCRT_WIN32_WINNT=_WIN32_WINNT_WIN10 _STL_WIN32_WINNT=_WIN32_WINNT_WIN10 + add_compile_definitions(_ARM64_ _VCRT_WIN32_WINNT=0x0A00 _STL_WIN32_WINNT=0x0A00 _WIN32_WINNT=0x0A00 NTDDI_VERSION=NTDDI_WIN10) string(APPEND CMAKE_CXX_STANDARD_LIBRARIES " Synchronization.lib") else() diff --git a/stl/CMakeLists.txt b/stl/CMakeLists.txt index 4d03fa0e18..23dc688099 100644 --- a/stl/CMakeLists.txt +++ b/stl/CMakeLists.txt @@ -196,6 +196,7 @@ set(HEADERS ${CMAKE_CURRENT_LIST_DIR}/inc/vector ${CMAKE_CURRENT_LIST_DIR}/inc/version ${CMAKE_CURRENT_LIST_DIR}/inc/xatomic.h + ${CMAKE_CURRENT_LIST_DIR}/inc/xatomic_wait.h ${CMAKE_CURRENT_LIST_DIR}/inc/xbit_ops.h ${CMAKE_CURRENT_LIST_DIR}/inc/xcall_once.h ${CMAKE_CURRENT_LIST_DIR}/inc/xcharconv.h @@ -241,7 +242,6 @@ set(IMPLIB_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/filesystem.cpp ${CMAKE_CURRENT_LIST_DIR}/src/locale0_implib.cpp ${CMAKE_CURRENT_LIST_DIR}/src/nothrow.cpp - ${CMAKE_CURRENT_LIST_DIR}/src/parallel_algorithms.cpp ${CMAKE_CURRENT_LIST_DIR}/src/sharedmutex.cpp ${CMAKE_CURRENT_LIST_DIR}/src/syserror_import_lib.cpp ${CMAKE_CURRENT_LIST_DIR}/src/vector_algorithms.cpp @@ -388,6 +388,11 @@ set(SOURCES_SATELLITE_2 ${CMAKE_CURRENT_LIST_DIR}/src/special_math.cpp ) +set(SOURCES_SATELLITE_ATOMIC_WAIT + ${CMAKE_CURRENT_LIST_DIR}/src/atomic_wait.cpp + ${CMAKE_CURRENT_LIST_DIR}/src/parallel_algorithms.cpp +) + set(SOURCES_SATELLITE_CODECVT_IDS ${CMAKE_CURRENT_LIST_DIR}/src/ulocale.cpp ) @@ -396,6 +401,7 @@ set(SOURCES_SATELLITE_CODECVT_IDS set(STATIC_SOURCES ${SOURCES_SATELLITE_1} ${SOURCES_SATELLITE_2} + ${SOURCES_SATELLITE_ATOMIC_WAIT} ${SOURCES_SATELLITE_CODECVT_IDS} ) @@ -459,6 +465,27 @@ function(add_stl_dlls D_SUFFIX THIS_CONFIG_DEFINITIONS THIS_CONFIG_COMPILE_OPTIO set_target_properties(msvcp_2${D_SUFFIX} 
PROPERTIES OUTPUT_NAME "msvcp140_2${D_SUFFIX}${VCLIBS_SUFFIX}") target_link_options(msvcp_2${D_SUFFIX} PRIVATE "${THIS_CONFIG_LINK_OPTIONS}") + # msvcp140_atomic_wait.dll (the atomic wait satellite) + add_library(msvcp${D_SUFFIX}_atomic_wait_objects OBJECT ${SOURCES_SATELLITE_ATOMIC_WAIT}) + target_compile_definitions(msvcp${D_SUFFIX}_atomic_wait_objects PRIVATE "_BUILDING_SATELLITE_ATOMIC_WAIT;_DLL;${THIS_CONFIG_DEFINITIONS}") + target_compile_options(msvcp${D_SUFFIX}_atomic_wait_objects PRIVATE "${THIS_CONFIG_COMPILE_OPTIONS};${GL_FLAG};/EHsc") + + # generate the .def for msvcp140_atomic_wait.dll + set(_ATOMIC_WAIT_OUTPUT_NAME "msvcp140${D_SUFFIX}_atomic_wait${VCLIBS_SUFFIX}") + string(TOUPPER "${_ATOMIC_WAIT_OUTPUT_NAME}" _ATOMIC_WAIT_OUTPUT_NAME_UPPER) + set(_ATOMIC_WAIT_DEF_NAME "${CMAKE_BINARY_DIR}/msvcp_atomic_wait${D_SUFFIX}.def") + set(_ATOMIC_WAIT_DEF_FILE_SRC "${CMAKE_CURRENT_LIST_DIR}/src/msvcp_atomic_wait.src") + set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${_ATOMIC_WAIT_DEF_FILE_SRC}") + file(READ "${_ATOMIC_WAIT_DEF_FILE_SRC}" _ATOMIC_WAIT_SRC_CONTENTS) + string(REPLACE "LIBRARYNAME" "${_ATOMIC_WAIT_OUTPUT_NAME_UPPER}" _ATOMIC_WAIT_DEF_CONTENTS "${_ATOMIC_WAIT_SRC_CONTENTS}") + file(WRITE "${_ATOMIC_WAIT_DEF_NAME}" "${_ATOMIC_WAIT_DEF_CONTENTS}") + + add_library(msvcp${D_SUFFIX}_atomic_wait SHARED "${_ATOMIC_WAIT_DEF_NAME}") + target_link_libraries(msvcp${D_SUFFIX}_atomic_wait PRIVATE msvcp${D_SUFFIX}_atomic_wait_objects "msvcp${D_SUFFIX}" "${TOOLSET_LIB}/vcruntime${D_SUFFIX}.lib" "${TOOLSET_LIB}/msvcrt${D_SUFFIX}.lib" "ucrt${D_SUFFIX}.lib") + set_target_properties(msvcp${D_SUFFIX}_atomic_wait PROPERTIES ARCHIVE_OUTPUT_NAME "msvcp140_atomic_wait${D_SUFFIX}${VCLIBS_SUFFIX}") + set_target_properties(msvcp${D_SUFFIX}_atomic_wait PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") + set_target_properties(msvcp${D_SUFFIX}_atomic_wait PROPERTIES OUTPUT_NAME "${_ATOMIC_WAIT_OUTPUT_NAME}") + # msvcp140_codecvt_ids.dll add_library(msvcp${D_SUFFIX}_codecvt_ids_objects OBJECT ${SOURCES_SATELLITE_CODECVT_IDS}) target_compile_definitions(msvcp${D_SUFFIX}_codecvt_ids_objects PRIVATE "_BUILDING_SATELLITE_CODECVT_IDS;_DLL;${THIS_CONFIG_DEFINITIONS}") @@ -474,8 +501,8 @@ function(add_stl_dlls D_SUFFIX THIS_CONFIG_DEFINITIONS THIS_CONFIG_COMPILE_OPTIO # import library add_library(msvcp${D_SUFFIX}_implib STATIC ${HEADERS}) target_link_libraries(msvcp${D_SUFFIX}_implib msvcp${D_SUFFIX}_implib_objects std_init_once_begin_initialize std_init_once_complete) - add_dependencies(msvcp${D_SUFFIX}_implib msvcp${D_SUFFIX} msvcp_1${D_SUFFIX} msvcp_2${D_SUFFIX} msvcp${D_SUFFIX}_codecvt_ids) - set_target_properties(msvcp${D_SUFFIX}_implib PROPERTIES STATIC_LIBRARY_OPTIONS "/NOLOGO;/NODEFAULTLIB;/IGNORE:4006;$;$;$;$") + add_dependencies(msvcp${D_SUFFIX}_implib msvcp${D_SUFFIX} msvcp_1${D_SUFFIX} msvcp_2${D_SUFFIX} msvcp${D_SUFFIX}_atomic_wait msvcp${D_SUFFIX}_codecvt_ids) + set_target_properties(msvcp${D_SUFFIX}_implib PROPERTIES STATIC_LIBRARY_OPTIONS "/NOLOGO;/NODEFAULTLIB;/IGNORE:4006;$;$;$;$;$") set_target_properties(msvcp${D_SUFFIX}_implib PROPERTIES ARCHIVE_OUTPUT_NAME "msvcprt${D_SUFFIX}") endfunction() diff --git a/stl/inc/atomic b/stl/inc/atomic index 80377e2c72..633280f5d5 100644 --- a/stl/inc/atomic +++ b/stl/inc/atomic @@ -17,6 +17,9 @@ #include #include #include +#if _HAS_CXX20 +#include +#endif // _HAS_CXX20 #pragma pack(push, _CRT_PACKING) #pragma warning(push, _STL_WARNING_LEVEL) @@ -357,6 +360,83 @@ template #else // ^^^ don't break ABI / break ABI vvv 
 template <class _Ty, size_t = _Atomic_storage_traits<_Ty>::_Storage_size>
 #endif // TRANSITION, ABI
+struct _Atomic_storage;
+
+#if _HAS_CXX20
+template <class _Ty, class _Value_type>
+void _Atomic_wait_direct(
+    const _Atomic_storage<_Ty>* const _This, _Value_type _Expected_bytes, const memory_order _Order) noexcept {
+    const auto _Storage_ptr = _STD addressof(_This->_Storage);
+    for (;;) {
+        const _Value_type _Observed_bytes = _Atomic_reinterpret_as<_Value_type>(_This->load(_Order));
+        if (_Expected_bytes != _Observed_bytes) {
+#if _CMPXCHG_MASK_OUT_PADDING_BITS
+            if constexpr (_Might_have_non_value_bits<_Ty>) {
+                _Storage_for<_Ty> _Mask{_Form_mask};
+                const _Value_type _Mask_val = _Atomic_reinterpret_as<_Value_type>(_Mask._Ref());
+
+                if (((_Expected_bytes ^ _Observed_bytes) & _Mask_val) == 0) {
+                    _Expected_bytes = _Observed_bytes;
+                    continue;
+                }
+            }
+#endif // _CMPXCHG_MASK_OUT_PADDING_BITS
+
+            return;
+        }
+
+        __std_atomic_wait_direct(_Storage_ptr, &_Expected_bytes, sizeof(_Value_type), _Atomic_wait_no_timeout);
+    }
+}
+#endif // _HAS_CXX20
+
+#if 1 // TRANSITION, ABI
+inline void _Atomic_lock_spinlock(long& _Spinlock) noexcept {
+    while (_InterlockedExchange(&_Spinlock, 1)) {
+        _YIELD_PROCESSOR();
+    }
+}
+
+inline void _Atomic_unlock_spinlock(long& _Spinlock) noexcept {
+#if defined(_M_ARM) || defined(_M_ARM64)
+    _Memory_barrier();
+    __iso_volatile_store32(reinterpret_cast<volatile int*>(&_Spinlock), 0);
+    _Memory_barrier();
+#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv
+    _InterlockedExchange(&_Spinlock, 0);
+#endif // hardware
+}
+
+class _Spinlock_guard {
+public:
+    explicit _Spinlock_guard(long& _Spinlock_) noexcept : _Spinlock(_Spinlock_) {
+        _Atomic_lock_spinlock(_Spinlock);
+    }
+
+    ~_Spinlock_guard() {
+        _Atomic_unlock_spinlock(_Spinlock);
+    }
+
+    _Spinlock_guard(const _Spinlock_guard&) = delete;
+    _Spinlock_guard& operator=(const _Spinlock_guard&) = delete;
+
+private:
+    long& _Spinlock;
+};
+
+#if _HAS_CXX20
+inline bool __stdcall _Atomic_wait_compare_non_lock_free(
+    const void* _Storage, void* _Comparand, size_t _Size, void* _Spinlock_raw) noexcept {
+    long& _Spinlock = *static_cast<long*>(_Spinlock_raw);
+    _Atomic_lock_spinlock(_Spinlock);
+    const auto _Cmp_result = _CSTD memcmp(_Storage, _Comparand, _Size);
+    _Atomic_unlock_spinlock(_Spinlock);
+    return _Cmp_result == 0;
+}
+#endif // _HAS_CXX20
+#endif // TRANSITION, ABI
+
+template <class _Ty, size_t>
 struct _Atomic_storage {
     // Provides operations common to all specializations of std::atomic, load, store, exchange, and CAS.
     // Locking version used when hardware has no atomic operations for sizeof(_Ty).
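Below is a minimal, self-contained sketch (illustrative only, not the STL's actual code) of the retry loop that _Atomic_wait_direct implements above. wait() is specified to compare value representations, hence the memcmp; and because the loop always re-loads and re-compares, spurious wakeups from the underlying blocking call are harmless. std::this_thread::yield() stands in here for the blocking __std_atomic_wait_direct call (WaitOnAddress or the SRWLOCK fallback).

    #include <atomic>
    #include <cstring>
    #include <thread>

    template <class T>
    void wait_sketch(const std::atomic<T>& a, const T expected) {
        for (;;) {
            const T observed = a.load(); // seq_cst load, like wait()'s default
            if (std::memcmp(&observed, &expected, sizeof(T)) != 0) {
                return; // value representation changed; the wait is satisfied
            }
            std::this_thread::yield(); // stand-in for the blocking wait
        }
    }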
@@ -427,21 +507,59 @@ struct _Atomic_storage { return _Result; } +#if _HAS_CXX20 + void wait(_Ty _Expected, memory_order = memory_order_seq_cst) const noexcept { + const auto _Storage_ptr = _STD addressof(_Storage); + const auto _Expected_ptr = _STD addressof(_Expected); + for (;;) { + { + _Spinlock_guard _Lock{_Spinlock}; + if (_CSTD memcmp(_Storage_ptr, _Expected_ptr, sizeof(_Ty)) != 0) { + // contents differed, we might be done, check for padding +#if _CMPXCHG_MASK_OUT_PADDING_BITS + if constexpr (_Might_have_non_value_bits<_Ty>) { + _Storage_for<_Ty> _Local; + const auto _Local_ptr = _Local._Ptr(); + _CSTD memcpy(_Local_ptr, _Storage_ptr, sizeof(_Ty)); + __builtin_zero_non_value_bits(_Local_ptr); + __builtin_zero_non_value_bits(_Expected_ptr); + if (_CSTD memcmp(_Local_ptr, _Expected_ptr, sizeof(_Ty)) == 0) { + // _Storage differs from _Expected only by padding; copy the padding from _Storage into + // _Expected + _CSTD memcpy(_Expected_ptr, _Storage_ptr, sizeof(_Ty)); + } else { + // truly different, we're done + return; + } + } else +#endif // #if _CMPXCHG_MASK_OUT_PADDING_BITS + { + return; + } + } + } // unlock + + __std_atomic_wait_indirect(_Storage_ptr, _Expected_ptr, sizeof(_Ty), &_Spinlock, + &_Atomic_wait_compare_non_lock_free, _Atomic_wait_no_timeout); + } + } + + void notify_one() noexcept { + __std_atomic_notify_one_indirect(_STD addressof(_Storage)); + } + + void notify_all() noexcept { + __std_atomic_notify_all_indirect(_STD addressof(_Storage)); + } +#endif // _HAS_CXX20 + #if 1 // TRANSITION, ABI void _Lock() const noexcept { // lock the spinlock - while (_InterlockedExchange(&_Spinlock, 1)) { - _YIELD_PROCESSOR(); - } + _Atomic_lock_spinlock(_Spinlock); } void _Unlock() const noexcept { // unlock the spinlock -#if defined(_M_ARM) || defined(_M_ARM64) - _Memory_barrier(); - __iso_volatile_store32(reinterpret_cast(&_Spinlock), 0); - _Memory_barrier(); -#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv - _InterlockedExchange(&_Spinlock, 0); -#endif // hardware + _Atomic_unlock_spinlock(_Spinlock); } private: @@ -452,23 +570,15 @@ public: #else // ^^^ don't break ABI / break ABI vvv void _Lock() const noexcept { // lock the spinlock - while (_InterlockedExchange8(&_Spinlock, 1)) { - _YIELD_PROCESSOR(); - } + _Smtx_lock_exclusive(&_Spinlock); } void _Unlock() const noexcept { // unlock the spinlock -#if defined(_M_ARM) || defined(_M_ARM64) - _Memory_barrier(); - __iso_volatile_store8(&_Spinlock, 0); - _Memory_barrier(); -#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv - _InterlockedExchange8(&_Spinlock, 0); -#endif // hardware + _Smtx_unlock_exclusive(&_Spinlock); } _Ty _Storage; - mutable char _Spinlock = 0; + mutable _Smtx_t _Spinlock = 0; #endif // TRANSITION, ABI }; @@ -572,6 +682,20 @@ struct _Atomic_storage<_Ty, 1> { // lock-free using 1-byte intrinsics return false; } +#if _HAS_CXX20 + void wait(const _Ty _Expected, const memory_order _Order = memory_order_seq_cst) const noexcept { + _Atomic_wait_direct(this, _Atomic_reinterpret_as(_Expected), _Order); + } + + void notify_one() noexcept { + __std_atomic_notify_one_direct(_STD addressof(_Storage)); + } + + void notify_all() noexcept { + __std_atomic_notify_all_direct(_STD addressof(_Storage)); + } +#endif // _HAS_CXX20 + _Atomic_padded<_Ty> _Storage; }; @@ -674,6 +798,20 @@ struct _Atomic_storage<_Ty, 2> { // lock-free using 2-byte intrinsics return false; } +#if _HAS_CXX20 + void wait(const _Ty _Expected, const memory_order _Order = memory_order_seq_cst) const noexcept { + _Atomic_wait_direct(this, 
_Atomic_reinterpret_as(_Expected), _Order); + } + + void notify_one() noexcept { + __std_atomic_notify_one_direct(_STD addressof(_Storage)); + } + + void notify_all() noexcept { + __std_atomic_notify_all_direct(_STD addressof(_Storage)); + } +#endif // _HAS_CXX20 + _Atomic_padded<_Ty> _Storage; }; @@ -774,6 +912,20 @@ struct _Atomic_storage<_Ty, 4> { // lock-free using 4-byte intrinsics return false; } +#if _HAS_CXX20 + void wait(const _Ty _Expected, const memory_order _Order = memory_order_seq_cst) const noexcept { + _Atomic_wait_direct(this, _Atomic_reinterpret_as(_Expected), _Order); + } + + void notify_one() noexcept { + __std_atomic_notify_one_direct(_STD addressof(_Storage)); + } + + void notify_all() noexcept { + __std_atomic_notify_all_direct(_STD addressof(_Storage)); + } +#endif // _HAS_CXX20 + _Atomic_padded<_Ty> _Storage; }; @@ -903,6 +1055,20 @@ struct _Atomic_storage<_Ty, 8> { // lock-free using 8-byte intrinsics return false; } +#if _HAS_CXX20 + void wait(const _Ty _Expected, const memory_order _Order = memory_order_seq_cst) const noexcept { + _Atomic_wait_direct(this, _Atomic_reinterpret_as(_Expected), _Order); + } + + void notify_one() noexcept { + __std_atomic_notify_one_direct(_STD addressof(_Storage)); + } + + void notify_all() noexcept { + __std_atomic_notify_all_direct(_STD addressof(_Storage)); + } +#endif // _HAS_CXX20 + _Atomic_padded<_Ty> _Storage; }; @@ -998,6 +1164,16 @@ struct _Atomic_storage<_Ty, 16> { // lock-free using 16-byte intrinsics return _Result != 0; } +#if _HAS_CXX20 + void notify_one() noexcept { + __std_atomic_notify_one_indirect(_STD addressof(_Storage)); + } + + void notify_all() noexcept { + __std_atomic_notify_all_indirect(_STD addressof(_Storage)); + } +#endif // _HAS_CXX20 + struct _Int128 { alignas(16) long long _Low; long long _High; @@ -1802,6 +1978,23 @@ public: return this->compare_exchange_strong(_Expected, _Desired, _Combine_cas_memory_orders(_Success, _Failure)); } +#if _HAS_CXX20 + using _Base::wait; + void wait(const _Ty _Expected, const memory_order _Order = memory_order_seq_cst) const volatile noexcept { + const_cast(this)->_Base::wait(_Expected, _Order); + } + + using _Base::notify_one; + void notify_one() volatile noexcept { + const_cast(this)->_Base::notify_one(); + } + + using _Base::notify_all; + void notify_all() volatile noexcept { + const_cast(this)->_Base::notify_all(); + } +#endif // _HAS_CXX20 + operator _Ty() const volatile noexcept { static_assert(_Deprecate_non_lock_free_volatile<_Ty>, "Never fails"); return this->load(); @@ -2088,6 +2281,52 @@ _Ty atomic_fetch_xor_explicit( return _Mem->fetch_xor(_Value, _Order); } +#if _HAS_CXX20 +template +void atomic_wait(const volatile atomic<_Ty>* const _Mem, const typename atomic<_Ty>::value_type _Expected) noexcept +/* strengthened */ { + _Mem->wait(_Expected); +} + +template +void atomic_wait(const atomic<_Ty>* const _Mem, const typename atomic<_Ty>::value_type _Expected) noexcept +/* strengthened */ { + _Mem->wait(_Expected); +} + +template +void atomic_wait_explicit(const volatile atomic<_Ty>* const _Mem, const typename atomic<_Ty>::value_type _Expected, + const memory_order _Order) noexcept /* strengthened */ { + _Mem->wait(_Expected, _Order); +} + +template +void atomic_wait_explicit(const atomic<_Ty>* const _Mem, const typename atomic<_Ty>::value_type _Expected, + const memory_order _Order) noexcept /* strengthened */ { + _Mem->wait(_Expected, _Order); +} + +template +void atomic_notify_one(volatile atomic<_Ty>* const _Mem) noexcept /* strengthened */ { + 
_Mem->notify_one(); +} + +template +void atomic_notify_one(atomic<_Ty>* const _Mem) noexcept /* strengthened */ { + _Mem->notify_one(); +} + +template +void atomic_notify_all(volatile atomic<_Ty>* const _Mem) noexcept /* strengthened */ { + _Mem->notify_all(); +} + +template +void atomic_notify_all(atomic<_Ty>* const _Mem) noexcept /* strengthened */ { + _Mem->notify_all(); +} +#endif // _HAS_CXX20 + // ATOMIC TYPEDEFS using atomic_bool = atomic; @@ -2183,6 +2422,32 @@ struct atomic_flag { // flag with test-and-set semantics constexpr atomic_flag() noexcept = default; +#if _HAS_CXX20 + void wait(const bool _Expected, const memory_order _Order = memory_order_seq_cst) const noexcept { + _Storage.wait(static_cast(_Expected), _Order); + } + + void wait(const bool _Expected, const memory_order _Order = memory_order_seq_cst) const volatile noexcept { + _Storage.wait(static_cast(_Expected), _Order); + } + + void notify_one() noexcept { + _Storage.notify_one(); + } + + void notify_one() volatile noexcept { + _Storage.notify_one(); + } + + void notify_all() noexcept { + _Storage.notify_all(); + } + + void notify_all() volatile noexcept { + _Storage.notify_all(); + } +#endif // _HAS_CXX20 + #if 1 // TRANSITION, ABI atomic _Storage; #else // ^^^ don't break ABI / break ABI vvv @@ -2211,38 +2476,74 @@ _NODISCARD inline bool atomic_flag_test_explicit(const atomic_flag* const _Flag, } #endif // _HAS_CXX20 -inline bool atomic_flag_test_and_set(atomic_flag* _Flag) noexcept { +inline bool atomic_flag_test_and_set(atomic_flag* const _Flag) noexcept { return _Flag->test_and_set(); } -inline bool atomic_flag_test_and_set(volatile atomic_flag* _Flag) noexcept { +inline bool atomic_flag_test_and_set(volatile atomic_flag* const _Flag) noexcept { return _Flag->test_and_set(); } -inline bool atomic_flag_test_and_set_explicit(atomic_flag* _Flag, memory_order _Order) noexcept { +inline bool atomic_flag_test_and_set_explicit(atomic_flag* const _Flag, const memory_order _Order) noexcept { return _Flag->test_and_set(_Order); } -inline bool atomic_flag_test_and_set_explicit(volatile atomic_flag* _Flag, memory_order _Order) noexcept { +inline bool atomic_flag_test_and_set_explicit(volatile atomic_flag* const _Flag, const memory_order _Order) noexcept { return _Flag->test_and_set(_Order); } -inline void atomic_flag_clear(atomic_flag* _Flag) noexcept { +inline void atomic_flag_clear(atomic_flag* const _Flag) noexcept { _Flag->clear(); } -inline void atomic_flag_clear(volatile atomic_flag* _Flag) noexcept { +inline void atomic_flag_clear(volatile atomic_flag* const _Flag) noexcept { _Flag->clear(); } -inline void atomic_flag_clear_explicit(atomic_flag* _Flag, memory_order _Order) noexcept { +inline void atomic_flag_clear_explicit(atomic_flag* const _Flag, const memory_order _Order) noexcept { _Flag->clear(_Order); } -inline void atomic_flag_clear_explicit(volatile atomic_flag* _Flag, memory_order _Order) noexcept { +inline void atomic_flag_clear_explicit(volatile atomic_flag* const _Flag, const memory_order _Order) noexcept { _Flag->clear(_Order); } +#if _HAS_CXX20 +inline void atomic_flag_wait(const volatile atomic_flag* const _Flag, const bool _Expected) noexcept { + return _Flag->wait(_Expected); +} + +inline void atomic_flag_wait(const atomic_flag* const _Flag, const bool _Expected) noexcept { + return _Flag->wait(_Expected); +} + +inline void atomic_flag_wait_explicit( + const volatile atomic_flag* const _Flag, const bool _Expected, const memory_order _Order) noexcept { + return _Flag->wait(_Expected, _Order); +} + 
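As a usage sketch of the C++20 surface added here (illustrative only, not part of the patch): one thread blocks in atomic_flag::wait until another thread sets the flag and notifies it.

    #include <atomic>
    #include <thread>

    int main() {
        std::atomic_flag done{}; // C++20: value-initialized to clear (false)

        std::thread worker([&] {
            // ... produce a result ...
            done.test_and_set(); // publish completion
            done.notify_one();   // wake the waiter
        });

        done.wait(false); // blocks while the flag still reads false
        worker.join();
    }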
+inline void atomic_flag_wait_explicit( + const atomic_flag* const _Flag, const bool _Expected, const memory_order _Order) noexcept { + return _Flag->wait(_Expected, _Order); +} + +inline void atomic_flag_notify_one(volatile atomic_flag* const _Flag) noexcept { + return _Flag->notify_one(); +} + +inline void atomic_flag_notify_one(atomic_flag* const _Flag) noexcept { + return _Flag->notify_one(); +} + +inline void atomic_flag_notify_all(volatile atomic_flag* const _Flag) noexcept { + return _Flag->notify_all(); +} + +inline void atomic_flag_notify_all(atomic_flag* const _Flag) noexcept { + return _Flag->notify_all(); +} +#endif // _HAS_CXX20 + _STD_END #undef _CMPXCHG_MASK_OUT_PADDING_BITS diff --git a/stl/inc/execution b/stl/inc/execution index 728cc71f91..7fea657d08 100644 --- a/stl/inc/execution +++ b/stl/inc/execution @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -29,11 +30,6 @@ _STL_DISABLE_CLANG_WARNINGS #undef new _EXTERN_C -// If on Windows XP, returns 1 (disabling parallelism); otherwise, returns the number of hardware threads available. -_NODISCARD unsigned int __stdcall __std_parallel_algorithms_hw_threads() noexcept; - -// Windows Vista thread pool interface; __std_parallel_algorithms_hw_threads must be called on the current -// thread before calling any of the below. #ifdef _M_CEE using __std_TP_WORK = void; using __std_TP_CALLBACK_INSTANCE = void; @@ -1117,7 +1113,7 @@ struct _Static_partitioned_all_of_family2 { // all_of/any_of/none_of task schedu template bool _All_of_family_parallel(_FwdIt _First, const _FwdIt _Last, _Pr _Pred) { // test if all elements in [_First, _Last) satisfy _Pred (or !_Pred if _Invert is true) in parallel - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_First, _Last); if (_Count >= 2) { // ... with at least 2 elements @@ -1228,7 +1224,7 @@ void for_each(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Fn _Func) noexcept /* termi auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { // ... 
with at least 2 elements @@ -1275,7 +1271,7 @@ _FwdIt for_each_n(_ExPo&&, _FwdIt _First, const _Diff _Count_raw, _Fn _Func) noe if (0 < _Count) { auto _UFirst = _Get_unwrapped_n(_First, _Count); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1 && _Count >= 2) { // parallelize on multiprocessor machines with at least 2 elements _TRY_BEGIN auto _Passed_fn = _Pass_fn(_Func); @@ -1351,7 +1347,7 @@ template _FwdIt _Find_parallel_unchecked(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, const _Find_fx _Fx) { // find first matching _Val, potentially in parallel if (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_First, _Last); if (_Count >= 2) { @@ -1566,7 +1562,7 @@ _NODISCARD _FwdIt1 find_end(_ExPo&&, _FwdIt1 _First1, const _FwdIt1 _Last1, cons const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { if constexpr (_Is_bidi_iter_v<_FwdIt1>) { const auto _Partition_start = @@ -1686,7 +1682,7 @@ _NODISCARD _FwdIt adjacent_find(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pred) auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = static_cast<_Iter_diff_t<_FwdIt>>(_STD distance(_UFirst, _ULast) - 1); if (_Count >= 2) { @@ -1747,7 +1743,7 @@ _NODISCARD _Iter_diff_t<_FwdIt> count_if(_ExPo&&, const _FwdIt _First, const _Fw auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { @@ -1911,7 +1907,7 @@ _NODISCARD pair<_FwdIt1, _FwdIt2> mismatch( const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst1, _ULast1); const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); @@ -1956,7 +1952,7 @@ _NODISCARD pair<_FwdIt1, _FwdIt2> mismatch( const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = static_cast<_Iter_diff_t<_FwdIt1>>(_Distance_min(_UFirst1, _ULast1, _UFirst2, _ULast2)); if (_Count >= 2) { @@ -2031,7 +2027,7 @@ _NODISCARD bool equal(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, cons const auto _UFirst1 = 
_Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst1, _ULast1); const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); @@ -2067,7 +2063,7 @@ _NODISCARD bool equal(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, cons const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = _Distance_any(_UFirst1, _ULast1, _UFirst2, _ULast2); if (_Count >= 2) { @@ -2157,7 +2153,7 @@ _NODISCARD _FwdItHaystack search(_ExPo&&, const _FwdItHaystack _First1, _FwdItHa const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { _Iter_diff_t<_FwdItHaystack> _Count; if constexpr (_Is_random_iter_v<_FwdItHaystack> && _Is_random_iter_v<_FwdItPat>) { @@ -2288,7 +2284,7 @@ _NODISCARD _FwdIt search_n(_ExPo&&, const _FwdIt _First, _FwdIt _Last, const _Di auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Haystack_count = _STD distance(_UFirst, _ULast); if (_Count > _Haystack_count) { @@ -2356,7 +2352,7 @@ _FwdIt2 transform(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _D auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_UFirst, _ULast); const auto _UDest = _Get_unwrapped_n(_Dest, _Count); @@ -2436,7 +2432,7 @@ _FwdIt3 transform(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _F const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... 
const auto _Count = _STD distance(_UFirst1, _ULast1); const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); @@ -2616,7 +2612,7 @@ _NODISCARD _FwdIt remove_if(_ExPo&&, _FwdIt _First, const _FwdIt _Last, _Pr _Pre auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { @@ -2758,7 +2754,7 @@ void sort(_ExPo&&, const _RanIt _First, const _RanIt _Last, _Pr _Pred) noexcept const _Iter_diff_t<_RanIt> _Ideal = _ULast - _UFirst; if constexpr (remove_reference_t<_ExPo>::_Parallelize) { size_t _Threads; - if (_Ideal > _ISORT_MAX && (_Threads = __std_parallel_algorithms_hw_threads()) > 1) { + if (_Ideal > _ISORT_MAX && (_Threads = thread::hardware_concurrency()) > 1) { // parallelize when input is large enough and we aren't on a uniprocessor machine _TRY_BEGIN _Sort_operation _Operation(_UFirst, _Pass_fn(_Pred), _Threads, _Ideal); // throws @@ -3022,7 +3018,7 @@ void stable_sort(_ExPo&&, const _BidIt _First, const _BidIt _Last, _Pr _Pred) no size_t _Hw_threads; bool _Attempt_parallelism; if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - _Hw_threads = __std_parallel_algorithms_hw_threads(); + _Hw_threads = thread::hardware_concurrency(); _Attempt_parallelism = _Hw_threads > 1; } else { _Attempt_parallelism = false; @@ -3103,7 +3099,7 @@ _NODISCARD _FwdIt is_sorted_until(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pre const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 3) { // ... with at least 3 elements @@ -3258,7 +3254,7 @@ _NODISCARD bool is_partitioned(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { // ... with at least 2 elements @@ -3331,7 +3327,7 @@ _NODISCARD _RanIt is_heap_until(_ExPo&&, _RanIt _First, _RanIt _Last, _Pr _Pred) const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _ULast - _UFirst; if (_Count >= 3) { // ... 
with at least 3 elements @@ -3580,7 +3576,7 @@ _FwdIt partition(_ExPo&&, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { @@ -3922,7 +3918,7 @@ _FwdIt3 set_intersection(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _Firs if constexpr (remove_reference_t<_ExPo>::_Parallelize && _Is_random_iter_v<_FwdIt1> && _Is_random_iter_v<_FwdIt2> && _Is_random_iter_v<_FwdIt3>) { // only parallelize if desired, and all of the iterators given are random access - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const _Diff _Count1 = _ULast1 - _UFirst1; const _Diff _Count2 = _ULast2 - _UFirst2; @@ -4013,7 +4009,7 @@ _FwdIt3 set_difference(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2 if constexpr (remove_reference_t<_ExPo>::_Parallelize && _Is_random_iter_v<_FwdIt1> && _Is_random_iter_v<_FwdIt2> && _Is_random_iter_v<_FwdIt3>) { // only parallelize if desired, and all of the iterators given are random access - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const _Diff _Count = _ULast1 - _UFirst1; if (_Count >= 2) { // ... with at least 2 elements in [_First1, _Last1) @@ -4107,7 +4103,7 @@ _NODISCARD _Ty reduce(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Ty _Val auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_UFirst, _ULast); const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count); @@ -4209,7 +4205,7 @@ _NODISCARD _Ty transform_reduce(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_UFirst1, _ULast1); auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); @@ -4304,7 +4300,7 @@ _NODISCARD _Ty transform_reduce(_ExPo&&, const _FwdIt _First, const _FwdIt _Last auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... 
const auto _Count = _STD distance(_UFirst, _ULast); const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count); @@ -4456,7 +4452,7 @@ _FwdIt2 exclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdI const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); const auto _UDest = _Get_unwrapped_n(_Dest, _Count); @@ -4601,7 +4597,7 @@ _FwdIt2 inclusive_scan(_ExPo&&, _FwdIt1 _First, _FwdIt1 _Last, _FwdIt2 _Dest, _B const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_First, _Last); auto _UDest = _Get_unwrapped_n(_Dest, _Count); @@ -4644,7 +4640,7 @@ _FwdIt2 inclusive_scan(_ExPo&&, _FwdIt1 _First, _FwdIt1 _Last, _FwdIt2 _Dest, _B const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); auto _UDest = _Get_unwrapped_n(_Dest, _Count); @@ -4789,7 +4785,7 @@ _FwdIt2 transform_exclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _L const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); const auto _UDest = _Get_unwrapped_n(_Dest, _Count); @@ -4936,7 +4932,7 @@ _FwdIt2 transform_inclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _L const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); auto _UDest = _Get_unwrapped_n(_Dest, _Count); @@ -4982,7 +4978,7 @@ _FwdIt2 transform_inclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _L const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); auto _UDest = _Get_unwrapped_n(_Dest, _Count); @@ -5084,7 +5080,7 @@ _FwdIt2 adjacent_difference(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, auto _UFirst = _Get_unwrapped(_First); const auto _ULast = 
_Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { - const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); + const size_t _Hw_threads = thread::hardware_concurrency(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines auto _Count = _STD distance(_UFirst, _ULast); const auto _UDest = _Get_unwrapped_n(_Dest, _Count); diff --git a/stl/inc/memory b/stl/inc/memory index 5b65f83450..67b1cc410d 100644 --- a/stl/inc/memory +++ b/stl/inc/memory @@ -3100,6 +3100,15 @@ _CXX20_DEPRECATE_OLD_SHARED_PTR_ATOMIC_SUPPORT bool atomic_compare_exchange_stro template class alignas(2 * sizeof(void*)) _Atomic_ptr_base { // overalignment is to allow potential future use of cmpxchg16b + + static_assert(alignof(_Ref_count_base) >= (1 << 2), "Two bits don't fit as low bits"); + + static constexpr uintptr_t _Lock_mask = 3; + static constexpr uintptr_t _Not_locked = 0; + static constexpr uintptr_t _Locked_notify_not_needed = 1; + static constexpr uintptr_t _Locked_notify_needed = 2; + static constexpr uintptr_t _Ptr_value_mask = ~_Lock_mask; + protected: constexpr _Atomic_ptr_base() noexcept = default; @@ -3107,20 +3116,42 @@ protected: : _Ptr(_Px), _Repptr(reinterpret_cast(_Ref)) {} _NODISCARD _Ref_count_base* _Lock_and_load() const noexcept { - constexpr uintptr_t _Low_bit = 1; - uintptr_t _Rep = _Repptr.load(memory_order::relaxed); + uintptr_t _Rep = _Repptr.load(memory_order::relaxed); for (;;) { - _Rep &= ~_Low_bit; - if (_Repptr.compare_exchange_weak(_Rep, _Rep | _Low_bit)) { - return reinterpret_cast<_Ref_count_base*>(_Rep); + switch (_Rep & _Lock_mask) { + case _Not_locked: // Can try to lock now + if (_Repptr.compare_exchange_weak(_Rep, _Rep | _Locked_notify_not_needed)) { + return reinterpret_cast<_Ref_count_base*>(_Rep); + } + _YIELD_PROCESSOR(); + break; + + case _Locked_notify_not_needed: // Try to set "notify needed" and wait + if (!_Repptr.compare_exchange_weak(_Rep, (_Rep & _Ptr_value_mask) | _Locked_notify_needed)) { + // Failed to put notify needed flag on, try again + _YIELD_PROCESSOR(); + break; + } + _Rep = (_Rep & _Ptr_value_mask) | _Locked_notify_needed; + [[fallthrough]]; + + case _Locked_notify_needed: // "Notify needed" is already set, just wait + _Repptr.wait(_Rep, memory_order::relaxed); + _Rep = _Repptr.load(memory_order::relaxed); + break; + + default: // Unrecognized bit pattern + _CSTD abort(); } - - _YIELD_PROCESSOR(); } } void _Store_and_unlock(_Ref_count_base* const _Value) const noexcept { - _Repptr.store(reinterpret_cast(_Value)); + uintptr_t _Rep = _Repptr.exchange(reinterpret_cast(_Value)); + if ((_Rep & _Lock_mask) == _Locked_notify_needed) { + // As we don't count waiters, every waiter is notified, and then some may re-request notification + _Repptr.notify_all(); + } } _Ty* _Ptr = nullptr; diff --git a/stl/inc/xatomic_wait.h b/stl/inc/xatomic_wait.h new file mode 100644 index 0000000000..266aae0c9f --- /dev/null +++ b/stl/inc/xatomic_wait.h @@ -0,0 +1,72 @@ +// xatomic_wait.h internal header + +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+#ifndef _XATOMIC_WAIT_H
+#define _XATOMIC_WAIT_H
+#include <yvals.h>
+#if _STL_COMPILER_PREPROCESSOR
+
+#include <cstdlib>
+#include <xatomic.h>
+
+#pragma pack(push, _CRT_PACKING)
+#pragma warning(push, _STL_WARNING_LEVEL)
+#pragma warning(disable : _STL_DISABLED_WARNINGS)
+_STL_DISABLE_CLANG_WARNINGS
+#pragma push_macro("new")
+#undef new
+
+_INLINE_VAR constexpr unsigned long long _Atomic_wait_no_deadline = 0xFFFF'FFFF'FFFF'FFFF;
+_INLINE_VAR constexpr unsigned long _Atomic_wait_no_timeout = 0xFFFF'FFFF; // Pass as partial timeout
+
+_EXTERN_C
+enum class __std_atomic_api_level : unsigned long {
+    __not_set,
+    __detecting,
+    __has_srwlock,
+    __has_wait_on_address,
+};
+
+// This function allows testing the atomic wait support while always using the APIs for a platform with fewer
+// capabilities; it attempts to lock the APIs used to the level `_Requested_api_level`, and returns the actual API
+// level in use. Once the API level has been set by calling this function (or detected by a call to one of the atomic
+// wait functions), it can no longer be changed.
+__std_atomic_api_level __stdcall __std_atomic_set_api_level(__std_atomic_api_level _Requested_api_level) noexcept;
+
+// Support for atomic waits.
+// The "direct" functions are used when the underlying infrastructure can use WaitOnAddress directly; that is, _Size
+// is 1, 2, 4, or 8. The contract is the same as that of the WaitOnAddress function from the Windows SDK. If
+// WaitOnAddress is not available on the current platform, this falls back to a similar solution based on SRWLOCK and
+// CONDITION_VARIABLE.
+int __stdcall __std_atomic_wait_direct(
+    const void* _Storage, void* _Comparand, size_t _Size, unsigned long _Remaining_timeout) noexcept;
+void __stdcall __std_atomic_notify_one_direct(const void* _Storage) noexcept;
+void __stdcall __std_atomic_notify_all_direct(const void* _Storage) noexcept;
+
+// The "indirect" functions are used when the size is not 1, 2, 4, or 8; these notionally wait on another value that
+// is of one of those sizes and whose value changes upon notify, hence "indirect". (As of 2020-07-24, this always
+// uses the fallback SRWLOCK and CONDITION_VARIABLE implementation, but that is not contractual.)
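For concreteness, here is a sketch of a callback matching the _Atomic_wait_indirect_equal_callback_t shape declared just below, modeled on the _Atomic_wait_compare_non_lock_free helper shown in <atomic> above (illustrative only): _Param carries the spinlock that guards the non-lock-free atomic's storage, and the callback reports whether storage and comparand are still equal, i.e. whether the wait should continue.

    #include <cstring>
    #include <intrin.h>

    bool __stdcall _Are_equal_sketch(
        const void* _Storage, void* _Comparand, size_t _Size, void* _Param) noexcept {
        long& _Spinlock = *static_cast<long*>(_Param);
        while (_InterlockedExchange(&_Spinlock, 1)) { // acquire the spinlock
        }
        const bool _Equal = std::memcmp(_Storage, _Comparand, _Size) == 0;
        _InterlockedExchange(&_Spinlock, 0); // release the spinlock
        return _Equal; // true => unchanged, keep waiting
    }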
+using _Atomic_wait_indirect_equal_callback_t = bool(__stdcall*)(
+    const void* _Storage, void* _Comparand, size_t _Size, void* _Param) noexcept;
+
+int __stdcall __std_atomic_wait_indirect(const void* _Storage, void* _Comparand, size_t _Size, void* _Param,
+    _Atomic_wait_indirect_equal_callback_t _Are_equal, unsigned long _Remaining_timeout) noexcept;
+void __stdcall __std_atomic_notify_one_indirect(const void* _Storage) noexcept;
+void __stdcall __std_atomic_notify_all_indirect(const void* _Storage) noexcept;
+
+// These functions convert a duration into a time point in order to tolerate spurious wakes in atomic wait, and then
+// convert back from the time point to individual wait attempts (each of which is limited by its DWORD milliseconds
+// parameter to a length of ~49 days).
+unsigned long long __stdcall __std_atomic_wait_get_deadline(unsigned long long _Timeout) noexcept;
+unsigned long __stdcall __std_atomic_wait_get_remaining_timeout(unsigned long long _Deadline) noexcept;
+
+_END_EXTERN_C
+
+#pragma pop_macro("new")
+_STL_RESTORE_CLANG_WARNINGS
+#pragma warning(pop)
+#pragma pack(pop)
+#endif // _STL_COMPILER_PREPROCESSOR
+#endif // _XATOMIC_WAIT_H
diff --git a/stl/inc/yvals.h b/stl/inc/yvals.h
index 1a70aa17ea..ea0b53f457 100644
--- a/stl/inc/yvals.h
+++ b/stl/inc/yvals.h
@@ -306,6 +306,22 @@ _STL_DISABLE_CLANG_WARNINGS
 #define _LOCK_DEBUG 3
 #define _LOCK_AT_THREAD_EXIT 4
 
+#ifndef _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
+#if _STL_WIN32_WINNT >= _STL_WIN32_WINNT_WIN8
+#define _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE 1
+#else // ^^^ _STL_WIN32_WINNT >= _STL_WIN32_WINNT_WIN8 / _STL_WIN32_WINNT < _STL_WIN32_WINNT_WIN8 vvv
+#define _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE 0
+#endif // ^^^ _STL_WIN32_WINNT < _STL_WIN32_WINNT_WIN8 ^^^
+#endif // _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
+
+#ifndef _ALLOW_ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE_MISMATCH
+#if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
+#pragma detect_mismatch("_ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE", "1")
+#else
+#pragma detect_mismatch("_ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE", "0")
+#endif
+#endif // !_ALLOW_ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE_MISMATCH
+
 #ifdef __cplusplus
 _STD_BEGIN
 enum _Uninitialized { // tag for suppressing initialization
diff --git a/stl/inc/yvals_core.h b/stl/inc/yvals_core.h
index bc55f3dcc2..526ae51d63 100644
--- a/stl/inc/yvals_core.h
+++ b/stl/inc/yvals_core.h
@@ -1138,6 +1138,7 @@
 #define __cpp_lib_atomic_float 201711L
 #define __cpp_lib_atomic_lock_free_type_aliases 201907L
 #define __cpp_lib_atomic_shared_ptr 201711L
+#define __cpp_lib_atomic_wait 201907L
 #define __cpp_lib_bind_front 201907L
 #define __cpp_lib_bit_cast 201806L
 #define __cpp_lib_bitops 201907L
@@ -1261,5 +1262,20 @@ compiler option, or define _ALLOW_RTCc_IN_STL to acknowledge that you have recei
 #error In yvals_core.h, defined(MRTDLL) implies defined(_M_CEE_PURE); !defined(_M_CEE_PURE) implies !defined(MRTDLL)
 #endif // defined(MRTDLL) && !defined(_M_CEE_PURE)
 
+#define _STL_WIN32_WINNT_WINXP 0x0501 // _WIN32_WINNT_WINXP from sdkddkver.h
+#define _STL_WIN32_WINNT_VISTA 0x0600 // _WIN32_WINNT_VISTA from sdkddkver.h
+#define _STL_WIN32_WINNT_WIN8 0x0602 // _WIN32_WINNT_WIN8 from sdkddkver.h
+
+// Note that the STL DLL builds will set this to XP for ABI compatibility with VS2015, which supported XP.
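Taken together, these two headers work as follows: _STL_WIN32_WINNT selects the minimum OS the binary must run on, and when that minimum is Windows 8 or later, _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE lets the wait machinery call WaitOnAddress directly instead of probing for it at run time. A sketch of how the separately compiled code consumes the macro (illustrative only; the real dispatch lives in stl/src/atomic_wait.cpp below, where __crtWaitOnAddress is the GetProcAddress-backed fallback shim):

    #include <Windows.h>

    BOOL wait_dispatch_sketch(volatile long& _Value, long _Undesired) {
    #if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
        // Win8+ minimum: WaitOnAddress is imported statically (Synchronization.lib)
        return WaitOnAddress(&_Value, &_Undesired, sizeof(long), INFINITE);
    #else
        // downlevel: go through a GetProcAddress-populated function pointer,
        // here represented by the atomic_wait.cpp shim of the same name
        extern BOOL __crtWaitOnAddress(volatile VOID*, PVOID, SIZE_T, DWORD);
        return __crtWaitOnAddress(&_Value, &_Undesired, sizeof(long), INFINITE);
    #endif
    }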
+#ifndef _STL_WIN32_WINNT +#if defined(_M_ARM) || defined(_M_ARM64) || defined(_ONECORE) || defined(_CRT_APP) +// The first ARM or OneCore or App Windows was Windows 8 +#define _STL_WIN32_WINNT _STL_WIN32_WINNT_WIN8 +#else // ^^^ default to Win8 // default to Vista vvv +// The earliest Windows supported by this implementation is Windows Vista +#define _STL_WIN32_WINNT _STL_WIN32_WINNT_VISTA +#endif // ^^^ !defined(_M_ARM) && !defined(_M_ARM64) && !defined(_ONECORE) && !defined(_CRT_APP) ^^^ +#endif // _STL_WIN32_WINNT + #endif // _STL_COMPILER_PREPROCESSOR #endif // _YVALS_CORE_H_ diff --git a/stl/msbuild/stl_atomic_wait/dirs.proj b/stl/msbuild/stl_atomic_wait/dirs.proj new file mode 100644 index 0000000000..7d349d37ff --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/dirs.proj @@ -0,0 +1,15 @@ + + + + + + + + + + + + diff --git a/stl/msbuild/stl_atomic_wait/md/dirs.proj b/stl/msbuild/stl_atomic_wait/md/dirs.proj new file mode 100644 index 0000000000..8376c0a191 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/md/dirs.proj @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + diff --git a/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj new file mode 100644 index 0000000000..e22fbab9c1 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj @@ -0,0 +1,15 @@ + + + + + + md + app + + + + + diff --git a/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj new file mode 100644 index 0000000000..e009997783 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj @@ -0,0 +1,15 @@ + + + + + + md + kernel32 + + + + + diff --git a/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj new file mode 100644 index 0000000000..5b3f631fc0 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj @@ -0,0 +1,15 @@ + + + + + + md + netfx + + + + + diff --git a/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj new file mode 100644 index 0000000000..774d8aa1b5 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/md/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj @@ -0,0 +1,15 @@ + + + + + + md + onecore + + + + + diff --git a/stl/msbuild/stl_atomic_wait/msvcp_atomic_wait.settings.targets b/stl/msbuild/stl_atomic_wait/msvcp_atomic_wait.settings.targets new file mode 100644 index 0000000000..9590f00030 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/msvcp_atomic_wait.settings.targets @@ -0,0 +1,90 @@ + + + + + + p_atomic_wait + + true + true + true + + DYNLINK + + + + + + _clr + + _app + _clr$(NetFxVerX) + + msvcp$(VCToolsProdVerSuffix)$(BuildSuffix)_atomic_wait$(MsvcpFlavorSuffix) + msvcprt$(BuildSuffix)_atomic_wait$(ClrLibSuffix) + $(LibOutputFileName).lib + + _VCRT_ALLOW_INTERNALS;$(ClDefines) + $(ClDefines);_CRT_APP + + false + true + $(IntermediateOutputDirectory) + $(CrtBuildDir)\msvcprt_atomic_wait$(BuildSuffix).$(MsvcpFlavor).import_only.lib + $(LibOutputFileName).$(MsvcpFlavor) + $(IntermediateOutputDirectory)\$(DllDefName).def + + true + 
$(OutputPath)\$(OutputName)$(_PDB_VER_NAME_)$(DllPdbFlavorSuffix) + + -debugtype:cv,fixup $(LinkAdditionalOptions) + -opt:ref,icf=3 $(LinkAdditionalOptions) + -opt:ref,noicf $(LinkAdditionalOptions) + -nodefaultlib:libcpmt$(BuildSuffix).lib $(LinkAdditionalOptions) + -nodefaultlib:$(LibOutputFile) $(LinkAdditionalOptions) + + true + true + + + + + LIBRARYNAME=$(OutputName.ToUpper()) + + + $(IntermediateOutputDirectory) + $(IntermediateOutputDirectory) + $(DllDefName) + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/stl/msbuild/stl_atomic_wait/msvcprt_atomic_wait.rc b/stl/msbuild/stl_atomic_wait/msvcprt_atomic_wait.rc new file mode 100644 index 0000000000..b43a7e1238 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/msvcprt_atomic_wait.rc @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +////////////////////////////////////////////////////////////////////////////////////// +// +// msvcprt_atomic_wait.rc : Defines the version resource for the C++ Runtime Library "_atomic_wait" DLL +// +////////////////////////////////////////////////////////////////////////////////////// + +#include "winver.h" // extract from windows header +#include "verstamp.h" + +#define MKARGSTR2(X) #X +#define MKARGSTR(X) MKARGSTR2(X) + +#define VER_FILETYPE VFT_DLL +#define VER_FILESUBTYPE VFT_UNKNOWN + +#define VER_FILEDESCRIPTION_STR "Microsoft\256 C Runtime Library _atomic_wait\0" +#define VER_INTERNALNAME_STR MKARGSTR(SXS_TARGET) +#define VER_ORIGINALFILENAME_STR MKARGSTR(SXS_TARGET) + +#include diff --git a/stl/msbuild/stl_atomic_wait/stl_atomic_wait.files.settings.targets b/stl/msbuild/stl_atomic_wait/stl_atomic_wait.files.settings.targets new file mode 100644 index 0000000000..0de6759e9c --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/stl_atomic_wait.files.settings.targets @@ -0,0 +1,15 @@ + + + + + + nativecpp + + + diff --git a/stl/msbuild/stl_atomic_wait/xmd/dirs.proj b/stl/msbuild/stl_atomic_wait/xmd/dirs.proj new file mode 100644 index 0000000000..c6bf75b366 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/xmd/dirs.proj @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + diff --git a/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj new file mode 100644 index 0000000000..6d99c3ab36 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_app/msvcp_atomic_wait.nativeproj @@ -0,0 +1,15 @@ + + + + + + xmd + app + + + + + diff --git a/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj new file mode 100644 index 0000000000..9fe52b880d --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_kernel32/msvcp_atomic_wait.nativeproj @@ -0,0 +1,15 @@ + + + + + + xmd + kernel32 + + + + + diff --git a/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj new file mode 100644 index 0000000000..e0629c9e50 --- /dev/null +++ b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_netfx/msvcp_atomic_wait.nativeproj @@ -0,0 +1,15 @@ + + + + + + xmd + netfx + + + + + diff --git a/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj new file mode 
new file mode 100644
index 0000000000..54f964b787
--- /dev/null
+++ b/stl/msbuild/stl_atomic_wait/xmd/msvcp_atomic_wait_onecore/msvcp_atomic_wait.nativeproj
@@ -0,0 +1,15 @@
+
+
+
+
+
+    xmd
+    onecore
+
+
+
+
+
diff --git a/stl/msbuild/stl_base/stl.files.settings.targets b/stl/msbuild/stl_base/stl.files.settings.targets
index 4c371170a5..5fd1b26310
--- a/stl/msbuild/stl_base/stl.files.settings.targets
+++ b/stl/msbuild/stl_base/stl.files.settings.targets
@@ -12,8 +12,10 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
     nativecpp
@@ -170,7 +172,6 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
     $(CrtRoot)\github\stl\src\filesystem.cpp;
     $(CrtRoot)\github\stl\src\locale0_implib.cpp;
     $(CrtRoot)\github\stl\src\nothrow.cpp;
-    $(CrtRoot)\github\stl\src\parallel_algorithms.cpp;
     $(CrtRoot)\github\stl\src\sharedmutex.cpp;
     $(CrtRoot)\github\stl\src\syserror_import_lib.cpp;
     $(CrtRoot)\github\stl\src\vector_algorithms.cpp;
diff --git a/stl/msbuild/stl_post/msvcp_post.settings.targets b/stl/msbuild/stl_post/msvcp_post.settings.targets
index 6d09f6244f..3271bca0e9
--- a/stl/msbuild/stl_post/msvcp_post.settings.targets
+++ b/stl/msbuild/stl_post/msvcp_post.settings.targets
@@ -50,10 +50,13 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
              Include="$(CrtRoot)\github\stl\msbuild\stl_2\$(CrtBuildModel)\msvcp_2_$(MsvcpFlavor)\msvcp_2.nativeproj"/>
+
+
diff --git a/stl/src/atomic_wait.cpp b/stl/src/atomic_wait.cpp
new file mode 100644
index 0000000000..d16f53f219
--- /dev/null
+++ b/stl/src/atomic_wait.cpp
@@ -0,0 +1,334 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// implement atomic wait / notify_one / notify_all
+
+// clang-format off
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <new>
+#include <xatomic_wait.h>
+
+#include <Windows.h>
+// clang-format on
+
+namespace {
+
+    constexpr size_t _Wait_table_size_power = 8;
+    constexpr size_t _Wait_table_size = 1 << _Wait_table_size_power;
+    constexpr size_t _Wait_table_index_mask = _Wait_table_size - 1;
+
+    struct _Wait_context {
+        const void* _Storage; // Pointer to wait on
+        _Wait_context* _Next;
+        _Wait_context* _Prev;
+        CONDITION_VARIABLE _Condition;
+    };
+
+    struct _Guarded_wait_context : _Wait_context {
+        _Guarded_wait_context(const void* _Storage_, _Wait_context* const _Head) noexcept
+            : _Wait_context{_Storage_, _Head, _Head->_Prev, CONDITION_VARIABLE_INIT} {
+            _Prev->_Next = this;
+            _Next->_Prev = this;
+        }
+
+        ~_Guarded_wait_context() {
+            const auto _Next_local = _Next;
+            const auto _Prev_local = _Prev;
+            _Next->_Prev = _Prev_local;
+            _Prev->_Next = _Next_local;
+        }
+
+        _Guarded_wait_context(const _Guarded_wait_context&) = delete;
+        _Guarded_wait_context& operator=(const _Guarded_wait_context&) = delete;
+    };
+
+    class _SrwLock_guard {
+    public:
+        explicit _SrwLock_guard(SRWLOCK& _Locked_) noexcept : _Locked(&_Locked_) {
+            AcquireSRWLockExclusive(_Locked);
+        }
+
+        ~_SrwLock_guard() {
+            ReleaseSRWLockExclusive(_Locked);
+        }
+
+        _SrwLock_guard(const _SrwLock_guard&) = delete;
+        _SrwLock_guard& operator=(const _SrwLock_guard&) = delete;
+
+    private:
+        SRWLOCK* _Locked;
+    };
+
+
+#pragma warning(push)
+#pragma warning(disable : 4324) // structure was padded due to alignment specifier
+    struct alignas(_STD hardware_destructive_interference_size) _Wait_table_entry {
+        SRWLOCK _Lock = SRWLOCK_INIT;
+        _Wait_context _Wait_list_head = {nullptr, &_Wait_list_head, &_Wait_list_head, CONDITION_VARIABLE_INIT};
+
+        constexpr _Wait_table_entry() noexcept = default;
+    };
+#pragma warning(pop)
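The _Guarded_wait_context above is an RAII node in a circular, intrusive, doubly-linked list whose sentinel head points at itself when empty; construction splices the node in, destruction splices it out, both in O(1). A standalone sketch of that discipline with illustrative names:

#include <cassert>

struct Node {
    Node* next;
    Node* prev;
};

struct LinkGuard : Node {
    explicit LinkGuard(Node& head) noexcept : Node{&head, head.prev} {
        prev->next = this; // splice in just before the sentinel
        next->prev = this;
    }
    ~LinkGuard() {
        next->prev = prev; // splice out; no traversal needed
        prev->next = next;
    }
};

int main() {
    Node head{&head, &head}; // empty list: sentinel points at itself
    {
        LinkGuard g{head};
        assert(head.next == &g && head.prev == &g);
    }
    assert(head.next == &head && head.prev == &head); // empty again
}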
+
+    [[nodiscard]] _Wait_table_entry& _Atomic_wait_table_entry(const void* const _Storage) noexcept {
+        static _Wait_table_entry wait_table[_Wait_table_size];
+        auto index = reinterpret_cast<_STD uintptr_t>(_Storage);
+        index ^= index >> (_Wait_table_size_power * 2);
+        index ^= index >> _Wait_table_size_power;
+        return wait_table[index & _Wait_table_index_mask];
+    }
+
+    void _Assume_timeout() noexcept {
+#ifdef _DEBUG
+        if (GetLastError() != ERROR_TIMEOUT) {
+            _CSTD abort();
+        }
+#endif // _DEBUG
+    }
+
+#if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
+
+#define __crtWaitOnAddress WaitOnAddress
+#define __crtWakeByAddressSingle WakeByAddressSingle
+#define __crtWakeByAddressAll WakeByAddressAll
+
+#else // ^^^ _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE / !_ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE vvv
+
+
+    struct _Wait_functions_table {
+        _STD atomic<decltype(&::WaitOnAddress)> _Pfn_WaitOnAddress{nullptr};
+        _STD atomic<decltype(&::WakeByAddressSingle)> _Pfn_WakeByAddressSingle{nullptr};
+        _STD atomic<decltype(&::WakeByAddressAll)> _Pfn_WakeByAddressAll{nullptr};
+        _STD atomic<__std_atomic_api_level> _Api_level{__std_atomic_api_level::__not_set};
+    };
+
+    _Wait_functions_table _Wait_functions;
+
+    void _Force_wait_functions_srwlock_only() noexcept {
+        auto _Local = _Wait_functions._Api_level.load(_STD memory_order_acquire);
+        if (_Local <= __std_atomic_api_level::__detecting) {
+            while (!_Wait_functions._Api_level.compare_exchange_weak(_Local, __std_atomic_api_level::__has_srwlock)) {
+                if (_Local > __std_atomic_api_level::__detecting) {
+                    return;
+                }
+            }
+        }
+    }
+
+    [[nodiscard]] __std_atomic_api_level _Init_wait_functions(__std_atomic_api_level _Level) {
+        while (!_Wait_functions._Api_level.compare_exchange_weak(_Level, __std_atomic_api_level::__detecting)) {
+            if (_Level > __std_atomic_api_level::__detecting) {
+                return _Level;
+            }
+        }
+
+        _Level = __std_atomic_api_level::__has_srwlock;
+
+        const HMODULE _Sync_module = GetModuleHandleW(L"api-ms-win-core-synch-l1-2-0.dll");
+        if (_Sync_module != nullptr) {
+            const auto _Wait_on_address =
+                reinterpret_cast<decltype(&::WaitOnAddress)>(GetProcAddress(_Sync_module, "WaitOnAddress"));
+            const auto _Wake_by_address_single =
+                reinterpret_cast<decltype(&::WakeByAddressSingle)>(GetProcAddress(_Sync_module, "WakeByAddressSingle"));
+            const auto _Wake_by_address_all =
+                reinterpret_cast<decltype(&::WakeByAddressAll)>(GetProcAddress(_Sync_module, "WakeByAddressAll"));
+
+            if (_Wait_on_address != nullptr && _Wake_by_address_single != nullptr && _Wake_by_address_all != nullptr) {
+                _Wait_functions._Pfn_WaitOnAddress.store(_Wait_on_address, _STD memory_order_relaxed);
+                _Wait_functions._Pfn_WakeByAddressSingle.store(_Wake_by_address_single, _STD memory_order_relaxed);
+                _Wait_functions._Pfn_WakeByAddressAll.store(_Wake_by_address_all, _STD memory_order_relaxed);
+                _Level = __std_atomic_api_level::__has_wait_on_address;
+            }
+        }
+
+        // for __has_srwlock, relaxed would have been enough, not distinguishing for consistency
+        _Wait_functions._Api_level.store(_Level, _STD memory_order_release);
+        return _Level;
+    }
+
+    [[nodiscard]] __std_atomic_api_level _Acquire_wait_functions() noexcept {
+        auto _Level = _Wait_functions._Api_level.load(_STD memory_order_acquire);
+        if (_Level <= __std_atomic_api_level::__detecting) {
+            _Level = _Init_wait_functions(_Level);
+        }
+
+        return _Level;
+    }
+
+    [[nodiscard]] BOOL __crtWaitOnAddress(
+        volatile VOID* Address, PVOID CompareAddress, SIZE_T AddressSize, DWORD dwMilliseconds) {
+        const auto _Wait_on_address = _Wait_functions._Pfn_WaitOnAddress.load(_STD memory_order_relaxed);
+        return _Wait_on_address(Address, CompareAddress, AddressSize, dwMilliseconds);
+    }
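The detection code above is a one-time, many-thread probe: the api level is published with a release store and consumed with an acquire load, and racing probes are benign because every racer would store identical values. A condensed sketch of the same pattern (the probe result is hard-coded where the real code calls GetProcAddress):

#include <atomic>

enum class level { not_set, detecting, srwlock_only, wait_on_address };

std::atomic<level> g_level{level::not_set};

level acquire_level() noexcept {
    level cur = g_level.load(std::memory_order_acquire);
    if (cur > level::detecting) {
        return cur; // fast path: already detected
    }
    // claim the right to probe; a loser either observes the published result
    // or probes redundantly, which is harmless because the stores are idempotent
    while (!g_level.compare_exchange_weak(cur, level::detecting)) {
        if (cur > level::detecting) {
            return cur;
        }
    }
    const level detected = level::wait_on_address; // stand-in for the real probe
    g_level.store(detected, std::memory_order_release);
    return detected;
}

int main() {
    return acquire_level() == level::wait_on_address ? 0 : 1;
}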
+
+    VOID __crtWakeByAddressSingle(PVOID Address) {
+        const auto _Wake_by_address_single = _Wait_functions._Pfn_WakeByAddressSingle.load(_STD memory_order_relaxed);
+        _Wake_by_address_single(Address);
+    }
+
+    VOID __crtWakeByAddressAll(PVOID Address) {
+        const auto _Wake_by_address_all = _Wait_functions._Pfn_WakeByAddressAll.load(_STD memory_order_relaxed);
+        _Wake_by_address_all(Address);
+    }
+
+    bool __stdcall _Atomic_wait_are_equal_direct_fallback(
+        const void* _Storage, void* _Comparand, size_t _Size, void*) noexcept {
+        switch (_Size) {
+        case 1:
+            return __iso_volatile_load8(static_cast<const char*>(_Storage)) == *static_cast<const char*>(_Comparand);
+        case 2:
+            return __iso_volatile_load16(static_cast<const short*>(_Storage)) == *static_cast<const short*>(_Comparand);
+        case 4:
+            return __iso_volatile_load32(static_cast<const int*>(_Storage)) == *static_cast<const int*>(_Comparand);
+        case 8:
+            return __iso_volatile_load64(static_cast<const long long*>(_Storage))
+                == *static_cast<const long long*>(_Comparand);
+        default:
+            _CSTD abort();
+        }
+    }
+#endif // _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
+} // unnamed namespace
+
+
+_EXTERN_C
+int __stdcall __std_atomic_wait_direct(const void* const _Storage, void* const _Comparand, const size_t _Size,
+    const unsigned long _Remaining_timeout) noexcept {
+#if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0
+    if (_Acquire_wait_functions() < __std_atomic_api_level::__has_wait_on_address) {
+        return __std_atomic_wait_indirect(
+            _Storage, _Comparand, _Size, nullptr, &_Atomic_wait_are_equal_direct_fallback, _Remaining_timeout);
+    }
+#endif // _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0
+
+    const auto _Result = __crtWaitOnAddress(
+        const_cast<volatile void*>(_Storage), const_cast<void*>(_Comparand), _Size, _Remaining_timeout);
+
+    if (!_Result) {
+        _Assume_timeout();
+    }
+    return _Result;
+}
+
+void __stdcall __std_atomic_notify_one_direct(const void* const _Storage) noexcept {
+#if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0
+    if (_Acquire_wait_functions() < __std_atomic_api_level::__has_wait_on_address) {
+        __std_atomic_notify_one_indirect(_Storage);
+        return;
+    }
+#endif // _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0
+
+    __crtWakeByAddressSingle(const_cast<void*>(_Storage));
+}
+
+void __stdcall __std_atomic_notify_all_direct(const void* const _Storage) noexcept {
+#if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0
+    if (_Acquire_wait_functions() < __std_atomic_api_level::__has_wait_on_address) {
+        __std_atomic_notify_all_indirect(_Storage);
+        return;
+    }
+#endif // _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0
+
+    __crtWakeByAddressAll(const_cast<void*>(_Storage));
+}
+
+void __stdcall __std_atomic_notify_one_indirect(const void* const _Storage) noexcept {
+    auto& _Entry = _Atomic_wait_table_entry(_Storage);
+    _SrwLock_guard _Guard(_Entry._Lock);
+    _Wait_context* _Context = _Entry._Wait_list_head._Next;
+    for (; _Context != &_Entry._Wait_list_head; _Context = _Context->_Next) {
+        if (_Context->_Storage == _Storage) {
+            // Can't move wake outside SRWLOCKed section: SRWLOCK also protects the _Context itself
+            WakeAllConditionVariable(&_Context->_Condition);
+            break;
+        }
+    }
+}
+
+void __stdcall __std_atomic_notify_all_indirect(const void* const _Storage) noexcept {
+    auto& _Entry = _Atomic_wait_table_entry(_Storage);
+    _SrwLock_guard _Guard(_Entry._Lock);
+    _Wait_context* _Context = _Entry._Wait_list_head._Next;
+    for (; _Context != &_Entry._Wait_list_head; _Context = _Context->_Next) {
+        if (_Context->_Storage == _Storage) {
+            // Can't move wake outside SRWLOCKed section: SRWLOCK also protects the _Context itself
+            WakeAllConditionVariable(&_Context->_Condition);
+        }
+    }
+}
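The function below hinges on its "under lock to prevent lost wakes" comment: the waiter must test the value and go to sleep atomically with respect to the notifier's store. The same protocol expressed with portable std primitives (illustrative only; the DLL uses SRWLOCK and CONDITION_VARIABLE directly):

#include <condition_variable>
#include <mutex>
#include <thread>

std::mutex m;
std::condition_variable cv;
int value = 0;

void wait_until_changed(int old_value) {
    std::unique_lock<std::mutex> lock{m};
    // test-and-block happen under m, so a notify cannot slip in between them
    cv.wait(lock, [&] { return value != old_value; });
}

void change_and_notify(int new_value) {
    {
        std::lock_guard<std::mutex> lock{m};
        value = new_value; // publish under the same lock the waiter tests under
    }
    cv.notify_all();
}

int main() {
    std::thread t{[] { wait_until_changed(0); }};
    change_and_notify(1);
    t.join();
}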
+
+int __stdcall __std_atomic_wait_indirect(const void* _Storage, void* _Comparand, size_t _Size, void* _Param,
+    _Atomic_wait_indirect_equal_callback_t _Are_equal, unsigned long _Remaining_timeout) noexcept {
+    auto& _Entry = _Atomic_wait_table_entry(_Storage);
+
+    _SrwLock_guard _Guard(_Entry._Lock);
+    _Guarded_wait_context _Context{_Storage, &_Entry._Wait_list_head};
+    for (;;) {
+        if (!_Are_equal(_Storage, _Comparand, _Size, _Param)) { // note: under lock to prevent lost wakes
+            return TRUE;
+        }
+
+        if (!SleepConditionVariableSRW(&_Context._Condition, &_Entry._Lock, _Remaining_timeout, 0)) {
+            _Assume_timeout();
+            return FALSE;
+        }
+
+        if (_Remaining_timeout != _Atomic_wait_no_timeout) {
+            // spurious wake to recheck the clock
+            return TRUE;
+        }
+    }
+}
+
+unsigned long long __stdcall __std_atomic_wait_get_deadline(const unsigned long long _Timeout) noexcept {
+    if (_Timeout == _Atomic_wait_no_deadline) {
+        return _Atomic_wait_no_deadline;
+    } else {
+        return GetTickCount64() + _Timeout;
+    }
+}
+
+unsigned long __stdcall __std_atomic_wait_get_remaining_timeout(unsigned long long _Deadline) noexcept {
+    static_assert(_Atomic_wait_no_timeout == INFINITE,
+        "_Atomic_wait_no_timeout is passed directly to underlying API, so should match it");
+
+    if (_Deadline == _Atomic_wait_no_deadline) {
+        return INFINITE;
+    }
+
+    const unsigned long long _Current_time = GetTickCount64();
+    if (_Current_time >= _Deadline) {
+        return 0;
+    }
+
+    unsigned long long _Remaining = _Deadline - _Current_time;
+    constexpr unsigned long _Ten_days = 864'000'000;
+    if (_Remaining > _Ten_days) {
+        return _Ten_days;
+    }
+    return static_cast<unsigned long>(_Remaining);
+}
+
+__std_atomic_api_level __stdcall __std_atomic_set_api_level(__std_atomic_api_level _Requested_api_level) noexcept {
+#if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
+    (void) _Requested_api_level;
+    return __std_atomic_api_level::__has_wait_on_address;
+#else // ^^^ _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE / !_ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE vvv
+    switch (_Requested_api_level) {
+    case __std_atomic_api_level::__not_set:
+    case __std_atomic_api_level::__detecting:
+        _CSTD abort();
+    case __std_atomic_api_level::__has_srwlock:
+        _Force_wait_functions_srwlock_only();
+        break;
+    case __std_atomic_api_level::__has_wait_on_address:
+    default: // future compat: new header using an old DLL will get the highest requested level supported
+        break;
+    }
+
+    return _Acquire_wait_functions();
+#endif // !_ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
+}
+_END_EXTERN_C
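The deadline/remaining-timeout pair above is built for re-derivation after every wake: compute an absolute deadline once, then convert back to a 32-bit relative timeout before each underlying wait, clamping to ten days so the value always fits a DWORD. A small sketch of that arithmetic (same constants; the main harness is mine):

#include <cstdio>
#include <windows.h>

using ull = unsigned long long;
constexpr ull no_deadline = ~0ull;

ull deadline_from(ull timeout_ms) { // mirrors __std_atomic_wait_get_deadline
    return timeout_ms == no_deadline ? no_deadline : GetTickCount64() + timeout_ms;
}

unsigned long remaining_from(ull deadline) { // mirrors __std_atomic_wait_get_remaining_timeout
    if (deadline == no_deadline) {
        return INFINITE;
    }
    const ull now = GetTickCount64();
    if (now >= deadline) {
        return 0; // deadline already passed
    }
    const ull rem = deadline - now;
    constexpr unsigned long ten_days = 864'000'000; // clamp; the caller simply waits again
    return rem > ten_days ? ten_days : static_cast<unsigned long>(rem);
}

int main() {
    const ull d = deadline_from(1500);
    std::printf("remaining ~%lu ms\n", remaining_from(d)); // ~1500
}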
diff --git a/stl/src/msvcp_atomic_wait.src b/stl/src/msvcp_atomic_wait.src
new file mode 100644
index 0000000000..ec335cc161
--- /dev/null
+++ b/stl/src/msvcp_atomic_wait.src
@@ -0,0 +1,25 @@
+; Copyright (c) Microsoft Corporation.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; atomic wait satellite DLL definition
+
+LIBRARY LIBRARYNAME
+
+EXPORTS
+    __std_atomic_wait_get_deadline
+    __std_atomic_wait_get_remaining_timeout
+    __std_atomic_notify_all_direct
+    __std_atomic_notify_all_indirect
+    __std_atomic_notify_one_direct
+    __std_atomic_notify_one_indirect
+    __std_atomic_set_api_level
+    __std_atomic_wait_direct
+    __std_atomic_wait_indirect
+    __std_bulk_submit_threadpool_work
+    __std_close_threadpool_work
+    __std_create_threadpool_work
+    __std_execution_wait_on_uchar
+    __std_execution_wake_by_address_all
+    __std_parallel_algorithms_hw_threads
+    __std_submit_threadpool_work
+    __std_wait_for_threadpool_work_callbacks
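These exports are flat extern "C" entry points, so code built against the import library consumes them through plain declarations. A sketch of what such declarations look like, derived from the definitions in atomic_wait.cpp above (the STL's real declarations live in the new xatomic_wait.h header, whose contents are not shown in this section):

#include <cstddef>

extern "C" {
int __stdcall __std_atomic_wait_direct(
    const void* _Storage, void* _Comparand, size_t _Size, unsigned long _Remaining_timeout) noexcept;
void __stdcall __std_atomic_notify_one_direct(const void* _Storage) noexcept;
void __stdcall __std_atomic_notify_all_direct(const void* _Storage) noexcept;
unsigned long long __stdcall __std_atomic_wait_get_deadline(unsigned long long _Timeout) noexcept;
unsigned long __stdcall __std_atomic_wait_get_remaining_timeout(unsigned long long _Deadline) noexcept;
}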
diff --git a/stl/src/parallel_algorithms.cpp b/stl/src/parallel_algorithms.cpp
index c59577b7d5..b661116d31
--- a/stl/src/parallel_algorithms.cpp
+++ b/stl/src/parallel_algorithms.cpp
@@ -3,297 +3,60 @@
 // support for <execution>

-#include
 #include
-#include
+#include
+#include

-// This must be as small as possible, because its contents are
-// injected into the msvcprt.lib and msvcprtd.lib import libraries.
-// Do not include or define anything else here.
-// In particular, basic_string must not be included here.
-
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8
-#pragma comment(lib, "synchronization") // for WaitOnAddress family
-#endif // _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8
-
-#if _STL_WIN32_WINNT < _WIN32_WINNT_WIN8
 namespace {
-    struct _Parallel_init_info {
-        unsigned int _Hw_threads;
-#if _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-        decltype(CreateThreadpoolWork)* _Pfn_CreateThreadpoolWork;
-        decltype(SubmitThreadpoolWork)* _Pfn_SubmitThreadpoolWork;
-        decltype(CloseThreadpoolWork)* _Pfn_CloseThreadpoolWork;
-        decltype(WaitForThreadpoolWorkCallbacks)* _Pfn_WaitForThreadpoolWorkCallbacks;
-        decltype(AcquireSRWLockExclusive)* _Pfn_AcquireSRWLockExclusive; // nullptr if _Pfn_WaitOnAddress is non-nullptr
-        decltype(ReleaseSRWLockExclusive)* _Pfn_ReleaseSRWLockExclusive; // ditto
-        decltype(SleepConditionVariableSRW)* _Pfn_SleepConditionVariableSRW; // ditto
-        decltype(WakeAllConditionVariable)* _Pfn_WakeAllConditionVariable; // ditto
-#endif // _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-        decltype(WaitOnAddress)* _Pfn_WaitOnAddress;
-        decltype(WakeByAddressAll)* _Pfn_WakeByAddressAll;
-    };
-
-    _Parallel_init_info _Parallel_info;
-
-    struct _Wait_semaphore {
-        SRWLOCK _Mtx;
-        CONDITION_VARIABLE _Cv;
-    };
-
-    constexpr int _Wait_table_size = 256; // one 4k page
-    constexpr int _Wait_table_max_index = _Wait_table_size - 1;
-    _Wait_semaphore _Wait_table[_Wait_table_size]{};
-    size_t _Choose_wait_entry(const volatile void* _Target) noexcept {
-        auto _Num = reinterpret_cast<uintptr_t>(_Target);
-#ifdef _WIN64
-        _Num = (_Num & ((1ull << 32) - 1ull)) ^ (_Num >> 32); // down to 32 bits
-#endif // _WIN64
-        _Num = (_Num & ((1u << 16) - 1u)) ^ (_Num >> 16); // to 16 bits
-        _Num = (_Num & ((1u << 8) - 1u)) ^ (_Num >> 8); // to 8 bits
-        static_assert(_Wait_table_max_index == (1 << 8) - 1, "Bad wait table size assumption");
-        return _Num;
-    }
-
     unsigned char _Atomic_load_uchar(const volatile unsigned char* _Ptr) noexcept {
         // atomic load of unsigned char, copied from <atomic> except ARM and ARM64 bits
         unsigned char _Value;
-#if defined(_M_IX86) || defined(_M_X64)
-        _Value = *_Ptr;
+#if defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+        _Value = __iso_volatile_load8(reinterpret_cast<const volatile char*>(_Ptr));
         _ReadWriteBarrier();
-#else // architecture, no ARM support as this is guarded by _STL_WIN32_WINNT < _WIN32_WINNT_WIN8
+#else
 #error Unsupported architecture
-#endif // architecture
+#endif
         return _Value;
     }
-
-    unsigned int _Atomic_load_uint(const volatile unsigned int* _Ptr) noexcept {
-        // atomic load of unsigned int, copied from <atomic> except ARM and ARM64 bits
-        unsigned int _Value;
-#if defined(_M_IX86) || defined(_M_X64)
-        _Value = *_Ptr;
-        _ReadWriteBarrier();
-#else // architecture, ditto no ARM support
-#error Unsupported architecture
-#endif // architecture
-        return _Value;
-    }
-
-    void _Atomic_store_uint(volatile unsigned int* _Tgt, unsigned int _Value) {
-        // atomic store of unsigned int, copied from <atomic>
-#if defined(_M_IX86) || defined(_M_X64)
-        _InterlockedExchange(reinterpret_cast<volatile long*>(_Tgt), static_cast<long>(_Value));
-#else // architecture, ditto no ARM support
-#error Unsupported architecture
-#endif // architecture
-    }
-
-    bool _Initialize_parallel_init_info() { // try to fill in _Parallel_info
-#if !(defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
-#error Check hardware assumption: Assumes that write races of identical values to pointer-sized variables are benign
-#endif // !(defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
-
-        HMODULE _Kernel32 = GetModuleHandleW(L"kernel32.dll");
-#if _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-        _Parallel_info._Pfn_CreateThreadpoolWork =
-            reinterpret_cast<decltype(CreateThreadpoolWork)*>(GetProcAddress(_Kernel32, "CreateThreadpoolWork"));
-        _Parallel_info._Pfn_SubmitThreadpoolWork =
-            reinterpret_cast<decltype(SubmitThreadpoolWork)*>(GetProcAddress(_Kernel32, "SubmitThreadpoolWork"));
-        _Parallel_info._Pfn_CloseThreadpoolWork =
-            reinterpret_cast<decltype(CloseThreadpoolWork)*>(GetProcAddress(_Kernel32, "CloseThreadpoolWork"));
-        _Parallel_info._Pfn_WaitForThreadpoolWorkCallbacks =
-            reinterpret_cast<decltype(WaitForThreadpoolWorkCallbacks)*>(
-                GetProcAddress(_Kernel32, "WaitForThreadpoolWorkCallbacks"));
-        if (!_Parallel_info._Pfn_CreateThreadpoolWork || !_Parallel_info._Pfn_SubmitThreadpoolWork
-            || !_Parallel_info._Pfn_CloseThreadpoolWork || !_Parallel_info._Pfn_WaitForThreadpoolWorkCallbacks) {
-            // don't parallelize without the Windows Vista threadpool
-            return false;
-        }
-#endif // _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-
-        HMODULE _KernelBase = GetModuleHandleW(L"kernelbase.dll");
-        if (_KernelBase) {
-            _Parallel_info._Pfn_WaitOnAddress =
-                reinterpret_cast<decltype(WaitOnAddress)*>(GetProcAddress(_KernelBase, "WaitOnAddress"));
-            _Parallel_info._Pfn_WakeByAddressAll =
-                reinterpret_cast<decltype(WakeByAddressAll)*>(GetProcAddress(_KernelBase, "WakeByAddressAll"));
-            if ((_Parallel_info._Pfn_WaitOnAddress == nullptr) != (_Parallel_info._Pfn_WakeByAddressAll == nullptr)) {
-                // if we don't have both we can use neither
-                _Parallel_info._Pfn_WaitOnAddress = nullptr;
-                _Parallel_info._Pfn_WakeByAddressAll = nullptr;
-            }
-        }
-
-#if _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-        if (_Parallel_info._Pfn_WaitOnAddress) { // no need for SRWLOCK or CONDITION_VARIABLE if we have WaitOnAddress
-            return true;
-        }
-
-        _Parallel_info._Pfn_AcquireSRWLockExclusive =
-            reinterpret_cast<decltype(AcquireSRWLockExclusive)*>(GetProcAddress(_Kernel32, "AcquireSRWLockExclusive"));
-        _Parallel_info._Pfn_ReleaseSRWLockExclusive =
-            reinterpret_cast<decltype(ReleaseSRWLockExclusive)*>(GetProcAddress(_Kernel32, "ReleaseSRWLockExclusive"));
-        _Parallel_info._Pfn_SleepConditionVariableSRW = reinterpret_cast<decltype(SleepConditionVariableSRW)*>(
-            GetProcAddress(_Kernel32, "SleepConditionVariableSRW"));
-        _Parallel_info._Pfn_WakeAllConditionVariable = reinterpret_cast<decltype(WakeAllConditionVariable)*>(
-            GetProcAddress(_Kernel32, "WakeAllConditionVariable"));
-
-        if (!_Parallel_info._Pfn_AcquireSRWLockExclusive || !_Parallel_info._Pfn_ReleaseSRWLockExclusive
-            || !_Parallel_info._Pfn_SleepConditionVariableSRW || !_Parallel_info._Pfn_WakeAllConditionVariable) {
-            // no fallback for WaitOnAddress; shouldn't be possible as these
-            // APIs were added at the same time as the Windows Vista threadpool API
-            return false;
-        }
-#endif // _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-
-        return true;
-    }
 } // unnamed namespace
-#endif // _STL_WIN32_WINNT < _WIN32_WINNT_WIN8
-
-static DWORD _Get_number_of_processors() noexcept {
-    SYSTEM_INFO _Info;
-    GetNativeSystemInfo(&_Info);
-    return _Info.dwNumberOfProcessors;
-}

 extern "C" {

+// TRANSITION, ABI
 _NODISCARD unsigned int __stdcall __std_parallel_algorithms_hw_threads() noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8
-    return _Get_number_of_processors();
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8 ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_WIN8 vvv
-    // _Atomic_load_uint enforces memory ordering in _Initialize_parallel_init_info:
-    unsigned int _Result = _Atomic_load_uint(&_Parallel_info._Hw_threads);
-    if (_Result == 0) {
-        if (_Initialize_parallel_init_info()) {
-            _Result = _Get_number_of_processors();
-        } else {
-            _Result = 1;
-        }
-
-        // _Atomic_store_uint enforces memory ordering in _Initialize_parallel_init_info:
-        _Atomic_store_uint(&_Parallel_info._Hw_threads, _Result);
-    }
-
-    return _Result;
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_WIN8 ^^^
+    return _STD thread::hardware_concurrency();
 }

-// Relaxed reads of _Parallel_info below because __std_parallel_algorithms_hw_threads must be called
-// before any of these on each thread.
-
 _NODISCARD PTP_WORK __stdcall __std_create_threadpool_work(
     PTP_WORK_CALLBACK _Callback, void* _Context, PTP_CALLBACK_ENVIRON _Callback_environ) noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA
     return CreateThreadpoolWork(_Callback, _Context, _Callback_environ);
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_VISTA vvv
-    return _Parallel_info._Pfn_CreateThreadpoolWork(_Callback, _Context, _Callback_environ);
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_VISTA ^^^
 }

 void __stdcall __std_submit_threadpool_work(PTP_WORK _Work) noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA
     SubmitThreadpoolWork(_Work);
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_VISTA vvv
-    _Parallel_info._Pfn_SubmitThreadpoolWork(_Work);
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_VISTA ^^^
 }

 void __stdcall __std_bulk_submit_threadpool_work(PTP_WORK _Work, const size_t _Submissions) noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA
     for (size_t _Idx = 0; _Idx < _Submissions; ++_Idx) {
         SubmitThreadpoolWork(_Work);
     }
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_VISTA vvv
-    const auto _Fn = _Parallel_info._Pfn_SubmitThreadpoolWork;
-    for (size_t _Idx = 0; _Idx < _Submissions; ++_Idx) {
-        _Fn(_Work);
-    }
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_VISTA ^^^
 }

 void __stdcall __std_close_threadpool_work(PTP_WORK _Work) noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA
     CloseThreadpoolWork(_Work);
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_VISTA vvv
-    _Parallel_info._Pfn_CloseThreadpoolWork(_Work);
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_VISTA ^^^
 }

 void __stdcall __std_wait_for_threadpool_work_callbacks(PTP_WORK _Work, BOOL _Cancel) noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA
     WaitForThreadpoolWorkCallbacks(_Work, _Cancel);
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_VISTA vvv
-    _Parallel_info._Pfn_WaitForThreadpoolWorkCallbacks(_Work, _Cancel);
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_VISTA ^^^
 }

 void __stdcall __std_execution_wait_on_uchar(const volatile unsigned char* _Address, unsigned char _Compare) noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8
-    if (WaitOnAddress(const_cast<volatile unsigned char*>(_Address), &_Compare, 1, INFINITE) == FALSE) {
-        // this API failing should only be possible with a timeout, and we asked for INFINITE
-        ::terminate();
-    }
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8 ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_WIN8 vvv
-    if (_Parallel_info._Pfn_WaitOnAddress) {
-        if (_Parallel_info._Pfn_WaitOnAddress(const_cast<volatile unsigned char*>(_Address), &_Compare, 1, INFINITE)
-            == FALSE) {
-            ::terminate();
-        }
-
-        return;
-    }
-
-    // fake WaitOnAddress via SRWLOCK and CONDITION_VARIABLE
-    for (int _Idx = 0; _Idx < 4096; ++_Idx) { // optimistic non-backoff spin
-        if (_Atomic_load_uchar(_Address) == _Compare) {
-            return;
-        }
-    }
-
-    auto& _Wait_entry = _Wait_table[_Choose_wait_entry(_Address)];
-#if _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-    _Parallel_info._Pfn_AcquireSRWLockExclusive(&_Wait_entry._Mtx);
-    while (_Atomic_load_uchar(_Address) == _Compare) {
-        if (_Parallel_info._Pfn_SleepConditionVariableSRW(&_Wait_entry._Cv, &_Wait_entry._Mtx, INFINITE, 0) == 0) {
-            ::terminate();
-        }
-    }
-
-    _Parallel_info._Pfn_ReleaseSRWLockExclusive(&_Wait_entry._Mtx);
-#else // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_VISTA ^^^ / vvv _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA vvv
-    AcquireSRWLockExclusive(&_Wait_entry._Mtx);
-    while (_Atomic_load_uchar(_Address) == _Compare) {
-        if (SleepConditionVariableSRW(&_Wait_entry._Cv, &_Wait_entry._Mtx, INFINITE, 0) == 0) {
-            ::terminate();
-        }
-    }
-
-    ReleaseSRWLockExclusive(&_Wait_entry._Mtx);
-#endif // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA ^^^
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_WIN8 ^^^
+    __std_atomic_wait_direct(const_cast<const unsigned char*>(_Address), &_Compare, 1, _Atomic_wait_no_timeout);
 }

 void __stdcall __std_execution_wake_by_address_all(const volatile void* _Address) noexcept {
-#if _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8
-    WakeByAddressAll(const_cast<void*>(_Address));
-#else // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_WIN8 ^^^ / vvv _STL_WIN32_WINNT < _WIN32_WINNT_WIN8 vvv
-    if (_Parallel_info._Pfn_WakeByAddressAll) {
-        _Parallel_info._Pfn_WakeByAddressAll(const_cast<void*>(_Address));
-    } else {
-        auto& _Wait_entry = _Wait_table[_Choose_wait_entry(_Address)];
-#if _STL_WIN32_WINNT < _WIN32_WINNT_VISTA
-        _Parallel_info._Pfn_AcquireSRWLockExclusive(&_Wait_entry._Mtx);
-        _Parallel_info._Pfn_ReleaseSRWLockExclusive(&_Wait_entry._Mtx);
-        _Parallel_info._Pfn_WakeAllConditionVariable(&_Wait_entry._Cv);
-#else // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_VISTA ^^^ / vvv _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA vvv
-        AcquireSRWLockExclusive(&_Wait_entry._Mtx);
-        ReleaseSRWLockExclusive(&_Wait_entry._Mtx);
-        WakeAllConditionVariable(&_Wait_entry._Cv);
-#endif // ^^^ _STL_WIN32_WINNT >= _WIN32_WINNT_VISTA ^^^
-    }
-#endif // ^^^ _STL_WIN32_WINNT < _WIN32_WINNT_WIN8 ^^^
+    __std_atomic_notify_all_direct(const_cast<const void*>(_Address));
 }

 } // extern "C"
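After this rewrite the <execution> helpers funnel into the same machinery that backs C++20 std::atomic waits. The user-visible equivalent of __std_execution_wait_on_uchar / __std_execution_wake_by_address_all is simply (illustrative, C++20):

#include <atomic>
#include <thread>

std::atomic<unsigned char> stage{0};

int main() {
    std::thread worker{[] {
        stage.store(1, std::memory_order_release);
        stage.notify_all(); // wakes every thread blocked in stage.wait(0)
    }};
    stage.wait(0); // blocks only while the stored value still compares equal to 0
    worker.join();
}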
diff --git a/tests/libcxx/expected_results.txt b/tests/libcxx/expected_results.txt
index 88a11df330..366bc6b380
--- a/tests/libcxx/expected_results.txt
+++ b/tests/libcxx/expected_results.txt
@@ -472,7 +472,6 @@ std/language.support/support.limits/support.limits.general/iterator.version.pass.cpp FAIL
 std/language.support/support.limits/support.limits.general/memory.version.pass.cpp FAIL

 # C++20 P1135R6 "The C++20 Synchronization Library"
-std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp FAIL
 std/thread/thread.barrier/arrive.pass.cpp FAIL
 std/thread/thread.barrier/arrive_and_drop.pass.cpp FAIL
 std/thread/thread.barrier/arrive_and_wait.pass.cpp FAIL
diff --git a/tests/libcxx/skipped_tests.txt b/tests/libcxx/skipped_tests.txt
index 133b8b06e8..ca2297c46b
--- a/tests/libcxx/skipped_tests.txt
+++ b/tests/libcxx/skipped_tests.txt
@@ -472,7 +472,6 @@ language.support\support.limits\support.limits.general\iterator.version.pass.cpp
 language.support\support.limits\support.limits.general\memory.version.pass.cpp

 # C++20 P1135R6 "The C++20 Synchronization Library"
-atomics\atomics.types.operations\atomics.types.operations.wait\atomic_wait.pass.cpp
 thread\thread.barrier\arrive.pass.cpp
 thread\thread.barrier\arrive_and_drop.pass.cpp
 thread\thread.barrier\arrive_and_wait.pass.cpp
diff --git a/tests/std/include/test_atomic_wait.hpp b/tests/std/include/test_atomic_wait.hpp
new file mode 100644
index 0000000000..248615cdf4
--- /dev/null
+++ b/tests/std/include/test_atomic_wait.hpp
@@ -0,0 +1,203 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <cstring>
+#include <thread>
+
+template <class UnderlyingType>
+void test_atomic_wait_func(const UnderlyingType old_value, const UnderlyingType new_value,
+    const std::chrono::steady_clock::duration waiting_duration) {
+    constexpr int seq_max_size = 10;
+    char seq[seq_max_size + 1];
+    std::atomic<char*> base = seq;
+    auto add_seq = [&](char ch) {
+        char* p = base.fetch_add(1, std::memory_order_relaxed);
+        assert(p - seq < seq_max_size);
+        *p = ch;
+    };
+
+    std::atomic<UnderlyingType> a{old_value};
+    a.wait(new_value);
+
+    add_seq('1');
+
+    std::thread thd([&] {
+        std::this_thread::sleep_for(waiting_duration);
+        add_seq('2');
+        a.notify_all();
+        std::this_thread::sleep_for(waiting_duration);
+        add_seq('3');
+        a.store(old_value);
+        a.notify_one();
+        std::this_thread::sleep_for(waiting_duration);
+        add_seq('4');
+        a.store(new_value);
+        a.notify_one();
+        // timing assumption that the main thread evaluates the `wait(old_value)` before this timeout expires
+        std::this_thread::sleep_for(waiting_duration);
+        add_seq('6');
+    });
+
+    a.wait(old_value);
+    const auto loaded = a.load();
+    assert(memcmp(&loaded, &new_value, sizeof(UnderlyingType)) == 0);
+
+    add_seq('5');
+
+    thd.join();
+
+    add_seq('\0');
+    assert(strcmp(seq, "123456") == 0);
+}
+
+template <class UnderlyingType>
+void test_notify_all_notifies_all(const UnderlyingType old_value, const UnderlyingType new_value,
+    const std::chrono::steady_clock::duration waiting_duration) {
+    std::atomic<UnderlyingType> c{old_value};
+    const auto waitFn = [&c, old_value] { c.wait(old_value); };
+
+    std::thread w1{waitFn};
+    std::thread w2{waitFn};
+    std::thread w3{waitFn};
+
+    std::this_thread::sleep_for(waiting_duration);
+    c.store(new_value);
+    c.notify_all(); // if this doesn't really notify all, the following joins will deadlock
+
+    w1.join();
+    w2.join();
+    w3.join();
+}
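Note why the test types used here delete operator==: atomic waiting compares object representations, as if by memcmp, never user-defined equality — which is also why the function above asserts with memcmp rather than ==. A freestanding C++20 illustration:

#include <atomic>
#include <cassert>
#include <cstring>

struct pair16 { // trivially copyable, deliberately not equality-comparable
    short a;
    short b;
    friend bool operator==(pair16, pair16) = delete;
};

int main() {
    std::atomic<pair16> x{pair16{1, 1}};
    pair16 expected{1, 1};
    x.store(pair16{1, 2});
    x.wait(expected); // returns immediately: representations already differ
    pair16 loaded = x.load();
    assert(std::memcmp(&loaded, &expected, sizeof(pair16)) != 0);
}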
+
+template <class UnderlyingType>
+void test_pad_bits(const std::chrono::steady_clock::duration waiting_duration) {
+    UnderlyingType old_value;
+    memset(&old_value, 0x66, sizeof(UnderlyingType));
+    old_value.set(1);
+
+    UnderlyingType same_old_value;
+    memset(&same_old_value, 0x99, sizeof(UnderlyingType));
+    same_old_value.set(1);
+
+    std::atomic<UnderlyingType> c(old_value);
+
+    bool trigger = false;
+    const auto waitFn = [&c, same_old_value, &trigger] {
+        c.wait(same_old_value);
+        trigger = true;
+    };
+
+    std::thread w1{waitFn};
+
+    std::this_thread::sleep_for(waiting_duration);
+    assert(!trigger);
+
+    c.store(old_value);
+    c.notify_one();
+
+    std::this_thread::sleep_for(waiting_duration);
+    assert(!trigger);
+
+    UnderlyingType new_value;
+    memset(&new_value, 0x99, sizeof(UnderlyingType));
+    new_value.set(2);
+    c.store(new_value);
+    c.notify_one();
+
+    std::this_thread::sleep_for(waiting_duration);
+    assert(trigger);
+
+    w1.join();
+}
+
+struct two_shorts {
+    short a;
+    short b;
+
+    friend bool operator==(two_shorts, two_shorts) = delete;
+};
+
+struct three_chars {
+    char a;
+    char b;
+    char c;
+
+    friend bool operator==(three_chars, three_chars) = delete;
+};
+
+struct big_char_like {
+    char value;
+    char unused[16];
+
+    explicit big_char_like(char value_) : value(value_), unused{} {}
+
+    friend bool operator==(big_char_like, big_char_like) = delete;
+};
+
+template <size_t size>
+struct with_padding_bits {
+    alignas(size) char value;
+
+    void set(const char value_) {
+        value = value_;
+    }
+
+    friend bool operator==(with_padding_bits, with_padding_bits) = delete;
+};
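with_padding_bits is the interesting case: under alignas, the single value byte leaves trailing padding, so two objects that hold "the same value" can differ byte-for-byte — exactly the hazard test_pad_bits probes (a representation-based wait must not sleep forever on a comparand whose only difference is padding). A tiny demonstration of the underlying effect (illustrative):

#include <cstdio>
#include <cstring>

struct padded { // 1 value byte plus 1 padding byte under alignas(2)
    alignas(2) char value;
};

int main() {
    padded a, b;
    std::memset(&a, 0x66, sizeof(padded)); // scribble the padding first...
    std::memset(&b, 0x99, sizeof(padded));
    a.value = 1; // ...then set the same value in both
    b.value = 1;
    // same value, but the padding bytes (0x66 vs 0x99) make the representations differ
    std::printf("memcmp equal? %s\n", std::memcmp(&a, &b, sizeof(padded)) == 0 ? "yes" : "no");
}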
+
+inline void test_atomic_wait() {
+    // wait for all the threads to be waiting; if this value is too small the test might be ineffective but should not
+    // fail due to timing assumptions except where otherwise noted; if it is too large the test will only take longer
+    // than necessary
+    constexpr std::chrono::milliseconds waiting_duration{100};
+    test_atomic_wait_func<char>(1, 2, waiting_duration);
+    test_atomic_wait_func<signed char>(1, 2, waiting_duration);
+    test_atomic_wait_func<unsigned char>(1, 2, waiting_duration);
+    test_atomic_wait_func<short>(1, 2, waiting_duration);
+    test_atomic_wait_func<unsigned short>(1, 2, waiting_duration);
+    test_atomic_wait_func<int>(1, 2, waiting_duration);
+    test_atomic_wait_func<unsigned int>(1, 2, waiting_duration);
+    test_atomic_wait_func<long>(1, 2, waiting_duration);
+    test_atomic_wait_func<unsigned long>(1, 2, waiting_duration);
+    test_atomic_wait_func<long long>(1, 2, waiting_duration);
+    test_atomic_wait_func<unsigned long long>(1, 2, waiting_duration);
+    test_atomic_wait_func<float>(1, 2, waiting_duration);
+    test_atomic_wait_func<double>(1, 2, waiting_duration);
+    test_atomic_wait_func<long double>(1, 2, waiting_duration);
+    test_atomic_wait_func("1", "2", waiting_duration);
+    test_atomic_wait_func(two_shorts{1, 1}, two_shorts{1, 2}, waiting_duration);
+    test_atomic_wait_func(three_chars{1, 1, 3}, three_chars{1, 2, 3}, waiting_duration);
+    test_atomic_wait_func(big_char_like{'a'}, big_char_like{'b'}, waiting_duration);
+
+    test_notify_all_notifies_all<char>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<signed char>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<unsigned char>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<short>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<unsigned short>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<int>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<unsigned int>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<long>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<unsigned long>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<long long>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<unsigned long long>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<float>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<double>(1, 2, waiting_duration);
+    test_notify_all_notifies_all<long double>(1, 2, waiting_duration);
+    test_notify_all_notifies_all("1", "2", waiting_duration);
+    test_notify_all_notifies_all(two_shorts{1, 1}, two_shorts{1, 2}, waiting_duration);
+    test_notify_all_notifies_all(three_chars{1, 1, 3}, three_chars{1, 2, 3}, waiting_duration);
+    test_notify_all_notifies_all(big_char_like{'a'}, big_char_like{'b'}, waiting_duration);
+
+#ifndef __clang__ // TRANSITION, LLVM-46685
+    test_pad_bits<with_padding_bits<2>>(waiting_duration);
+    test_pad_bits<with_padding_bits<4>>(waiting_duration);
+    test_pad_bits<with_padding_bits<8>>(waiting_duration);
+    test_pad_bits<with_padding_bits<16>>(waiting_duration);
+    test_pad_bits<with_padding_bits<32>>(waiting_duration);
+#endif // __clang__, TRANSITION, LLVM-46685
+}
diff --git a/tests/std/test.lst b/tests/std/test.lst
index a8a70ccb76..f6eda1e96f
--- a/tests/std/test.lst
+++ b/tests/std/test.lst
@@ -311,6 +311,8 @@ tests\P0966R1_string_reserve_should_not_shrink
 tests\P1023R0_constexpr_for_array_comparisons
 tests\P1032R1_miscellaneous_constexpr
 tests\P1135R6_atomic_flag_test
+tests\P1135R6_atomic_wait
+tests\P1135R6_atomic_wait_vista
 tests\P1165R1_consistently_propagating_stateful_allocators
 tests\P1423R3_char8_t_remediation
 tests\P1645R1_constexpr_numeric
diff --git a/tests/std/tests/P1135R6_atomic_wait/env.lst b/tests/std/tests/P1135R6_atomic_wait/env.lst
new file mode 100644
index 0000000000..642f530ffa
--- /dev/null
+++ b/tests/std/tests/P1135R6_atomic_wait/env.lst
@@ -0,0 +1,4 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+RUNALL_INCLUDE ..\usual_latest_matrix.lst
diff --git a/tests/std/tests/P1135R6_atomic_wait/test.cpp b/tests/std/tests/P1135R6_atomic_wait/test.cpp
new file mode 100644
index 0000000000..7b7e0ea1fd
--- /dev/null
+++ b/tests/std/tests/P1135R6_atomic_wait/test.cpp
@@ -0,0 +1,10 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "test_atomic_wait.hpp"
+
+int main() {
+    assert(__std_atomic_set_api_level(__std_atomic_api_level::__has_wait_on_address)
+           == __std_atomic_api_level::__has_wait_on_address);
+    test_atomic_wait();
+}
diff --git a/tests/std/tests/P1135R6_atomic_wait_vista/env.lst b/tests/std/tests/P1135R6_atomic_wait_vista/env.lst
new file mode 100644
index 0000000000..642f530ffa
--- /dev/null
+++ b/tests/std/tests/P1135R6_atomic_wait_vista/env.lst
@@ -0,0 +1,4 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+RUNALL_INCLUDE ..\usual_latest_matrix.lst
diff --git a/tests/std/tests/P1135R6_atomic_wait_vista/test.cpp b/tests/std/tests/P1135R6_atomic_wait_vista/test.cpp
new file mode 100644
index 0000000000..bc42736559
--- /dev/null
+++ b/tests/std/tests/P1135R6_atomic_wait_vista/test.cpp
@@ -0,0 +1,9 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "test_atomic_wait.hpp"
+
+int main() {
+    assert(__std_atomic_set_api_level(__std_atomic_api_level::__has_srwlock) == __std_atomic_api_level::__has_srwlock);
+    test_atomic_wait();
+}
diff --git a/tests/std/tests/VSO_0157762_feature_test_macros/test.cpp b/tests/std/tests/VSO_0157762_feature_test_macros/test.cpp
index 880ccf1135..8f3dc6c3d3
--- a/tests/std/tests/VSO_0157762_feature_test_macros/test.cpp
+++ b/tests/std/tests/VSO_0157762_feature_test_macros/test.cpp
@@ -161,6 +161,20 @@ STATIC_ASSERT(__cpp_lib_atomic_shared_ptr == 201711L);
 STATIC_ASSERT(__cpp_lib_atomic_value_initialization == 201911L);
 #endif

+#if _HAS_CXX20
+#ifndef __cpp_lib_atomic_wait
+#error __cpp_lib_atomic_wait is not defined
+#elif __cpp_lib_atomic_wait != 201907L
+#error __cpp_lib_atomic_wait is not 201907L
+#else
+STATIC_ASSERT(__cpp_lib_atomic_wait == 201907L);
+#endif
+#else
+#ifdef __cpp_lib_atomic_wait
+#error __cpp_lib_atomic_wait is defined
+#endif
+#endif
+
 #if _HAS_CXX20
 #ifndef __cpp_lib_bind_front
 #error __cpp_lib_bind_front is not defined
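Downstream code can gate on the macro verified above in the usual way; a guarded use of the C++20 API (illustrative):

#include <atomic>
#include <version>

void spin_or_block(std::atomic<int>& flag) {
#if defined(__cpp_lib_atomic_wait) && __cpp_lib_atomic_wait >= 201907L
    flag.wait(0); // efficient blocking wait while the value is still 0
#else
    while (flag.load(std::memory_order_acquire) == 0) { /* spin */
    }
#endif
}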