bpo-30747: Attempt to fix atomic load/store #2383

Merged 1 commit on Aug 12, 2017

305 changes: 295 additions & 10 deletions Include/pyatomic.h
@@ -10,6 +10,12 @@
#include <stdatomic.h>
#endif


#if defined(_MSC_VER)
#include <intrin.h>
#include <immintrin.h>
#endif

/* This is modeled after the atomics interface from C1x, according to
* the draft at
* http://www.open-std.org/JTC1/SC22/wg14/www/docs/n1425.pdf.
@@ -87,8 +93,9 @@ typedef struct _Py_atomic_int {
|| (ORDER) == __ATOMIC_CONSUME), \
__atomic_load_n(&(ATOMIC_VAL)->_value, ORDER))

#else

/* Only support GCC (for expression statements) and x86 (for simple
Member: Perhaps remove this comment?

Contributor Author: I was thinking of updating it to reflect the current state.

Member: Looks like you forgot to update it :-)

Contributor Author: Fixed now.

* atomic semantics) and MSVC x86/x64/ARM */
#elif defined(__GNUC__) && (defined(__i386__) || defined(__amd64))
typedef enum _Py_memory_order {
_Py_memory_order_relaxed,
_Py_memory_order_acquire,
@@ -105,9 +112,6 @@ typedef struct _Py_atomic_int {
int _value;
} _Py_atomic_int;

/* Only support GCC (for expression statements) and x86 (for simple
* atomic semantics) for now */
#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64))

static __inline__ void
_Py_atomic_signal_fence(_Py_memory_order order)
@@ -127,7 +131,7 @@ _Py_atomic_thread_fence(_Py_memory_order order)
static __inline__ void
_Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
{
(void)address; /* shut up -Wunused-parameter */
(void)address; /* shut up -Wunused-parameter */
switch(order) {
case _Py_memory_order_release:
case _Py_memory_order_acq_rel:
@@ -219,7 +223,291 @@ _Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
result; \
})

#else /* !gcc x86 */
#elif defined(_MSC_VER)
/* _Interlocked* functions provide a full memory barrier and are therefore
enough for acq_rel and seq_cst. If the HLE variants aren't available
in hardware they will fall back to a full memory barrier as well.

This might affect performance, but likely only in some very specific and
hard to measure scenarios.
*/
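
A minimal standalone sketch of the guarantee this comment relies on (hypothetical example, not part of the patch): on MSVC a plain _InterlockedExchange both performs the store and acts as a full memory barrier, so it can stand in for a sequentially consistent store.

/* Hypothetical sketch: _InterlockedExchange as a seq_cst store. */
#include <intrin.h>

static volatile long ready;

void publish(void)
{
    /* The intrinsic writes the value and issues a full barrier, so no
       separate fence is needed around it. */
    _InterlockedExchange(&ready, 1);
}
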
#if defined(_M_IX86) || defined(_M_X64)
typedef enum _Py_memory_order {
_Py_memory_order_relaxed,
_Py_memory_order_acquire,
_Py_memory_order_release,
_Py_memory_order_acq_rel,
_Py_memory_order_seq_cst
} _Py_memory_order;

typedef struct _Py_atomic_address {
volatile uintptr_t _value;
} _Py_atomic_address;

typedef struct _Py_atomic_int {
volatile int _value;
} _Py_atomic_int;


#if defined(_M_X64)
#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) \
Member: Ok, I think the load and store macros are ok as defined, but that's because MSDN says "Most of the interlocked functions provide full memory barriers on all Windows platforms". Perhaps you can add a comment to that effect?

Contributor Author: Will do.

switch (ORDER) { \
case _Py_memory_order_acquire: \
_InterlockedExchange64_HLEAcquire((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
Member: Is it a runtime check whether HLE is available or not, or is it a compile-time check? If it's compile-time, it means we may mistakenly produce builds that won't run on non-HLE processors. @zooba

Contributor Author: According to the documentation, these will use HLEAcquire if the CPU supports it, or use a regular _InterlockedExchange64 otherwise. Docs here: https://msdn.microsoft.com/en-us/library/1s26w950.aspx, specifically this bit:

On Intel platforms that support Hardware Lock Elision (HLE) instructions, the intrinsics with _HLEAcquire and _HLERelease suffixes include a hint to the processor that can accelerate performance by eliminating a lock write step in hardware. If these intrinsics are called on platforms that do not support HLE, the hint is ignored.

Member: I understand that. My question is whether the CPU detection happens at compile time or at run time. I can't find any information online...

Contributor Author (@Paxxi, Jun 30, 2017): There's no detection happening. Intel seems to have found a way to make it backwards compatible, so older CPUs will ignore the extra hint.

I tested this quickly and here's the generated assembly for the HLE and non-HLE versions:

; 12   :   auto a = _InterlockedExchange_HLEAcquire(&test, 1);

mov	 eax, 1
lea	 ecx, DWORD PTR _test$[ebp]
xacquire  xchg	   DWORD PTR [ecx], eax
mov	 DWORD PTR _a$[ebp], eax

; 13   :   auto b = _InterlockedExchange(&test, 2);

mov	 eax, 2
lea	 ecx, DWORD PTR _test$[ebp]
xchg	 DWORD PTR [ecx], eax
mov	 DWORD PTR _b$[ebp], eax
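
A plausible reconstruction of that quick test (hypothetical; only the two statements quoted in the assembly listing are known):

/* Hypothetical reconstruction of the test that produced the listing above. */
#include <intrin.h>
#include <immintrin.h>

int main(void)
{
    volatile long test = 0;
    long a = _InterlockedExchange_HLEAcquire(&test, 1); /* xacquire xchg */
    long b = _InterlockedExchange(&test, 2);            /* plain xchg    */
    return (int)(a + b);
}
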

Member: That's great, thank you!

break; \
case _Py_memory_order_release: \
_InterlockedExchange64_HLERelease((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
break; \
default: \
_InterlockedExchange64((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
break; \
}
#else
#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) ((void)0);
#endif

#define _Py_atomic_store_32bit(ATOMIC_VAL, NEW_VAL, ORDER) \
switch (ORDER) { \
case _Py_memory_order_acquire: \
_InterlockedExchange_HLEAcquire((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
break; \
case _Py_memory_order_release: \
_InterlockedExchange_HLERelease((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
break; \
default: \
_InterlockedExchange((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
break; \
}

#if defined(_M_X64)
/* This has to be an intptr_t for now.
gil_created() uses -1 as a sentinel value; if this returned
a uintptr_t it would do an unsigned compare and crash.
*/
inline intptr_t _Py_atomic_load_64bit(volatile uintptr_t* value, int order) {
uintptr_t old;
switch (order) {
case _Py_memory_order_acquire:
{
do {
old = *value;
} while(_InterlockedCompareExchange64_HLEAcquire(value, old, old) != old);
break;
}
case _Py_memory_order_release:
{
do {
old = *value;
} while(_InterlockedCompareExchange64_HLERelease(value, old, old) != old);
break;
}
case _Py_memory_order_relaxed:
old = *value;
break;
default:
{
do {
old = *value;
} while(_InterlockedCompareExchange64(value, old, old) != old);
break;
}
}
return old;
}

#else
#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *ATOMIC_VAL
#endif
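
A small standalone illustration of the sentinel issue described in the comment above (hypothetical, not part of the patch): gil_created() detects the -1 sentinel with a signed comparison, which only works if the loaded value is signed.

/* Hypothetical illustration: why the load must return a signed intptr_t. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    intptr_t  signed_load   = -1;            /* what the patch returns */
    uintptr_t unsigned_load = (uintptr_t)-1; /* what a uintptr_t return would be */

    /* Signed: the ">= 0" style check used by gil_created() sees the sentinel. */
    printf("%d\n", signed_load >= 0);               /* prints 0 */
    /* Unsigned: -1 wraps to UINTPTR_MAX, so the same check always passes. */
    printf("%d\n", unsigned_load >= (uintptr_t)0);  /* prints 1 */
    return 0;
}
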

inline int _Py_atomic_load_32bit(volatile int* value, int order) {
int old;
switch (order) {
case _Py_memory_order_acquire:
{
do {
old = *value;
} while(_InterlockedCompareExchange_HLEAcquire(value, old, old) != old);
break;
}
case _Py_memory_order_release:
{
do {
old = *value;
} while(_InterlockedCompareExchange_HLERelease(value, old, old) != old);
break;
}
case _Py_memory_order_relaxed:
old = *value;
break;
default:
{
do {
old = *value;
} while(_InterlockedCompareExchange(value, old, old) != old);
break;
}
}
return old;
}

#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
if (sizeof(*ATOMIC_VAL._value) == 8) { \
_Py_atomic_store_64bit(ATOMIC_VAL._value, NEW_VAL, ORDER) } else { \
_Py_atomic_store_32bit(ATOMIC_VAL._value, NEW_VAL, ORDER) }

#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
( \
sizeof(*(ATOMIC_VAL._value)) == 8 ? \
_Py_atomic_load_64bit(ATOMIC_VAL._value, ORDER) : \
_Py_atomic_load_32bit(ATOMIC_VAL._value, ORDER) \
)
#elif defined(_M_ARM) || defined(_M_ARM64)
typedef enum _Py_memory_order {
_Py_memory_order_relaxed,
_Py_memory_order_acquire,
_Py_memory_order_release,
_Py_memory_order_acq_rel,
_Py_memory_order_seq_cst
} _Py_memory_order;

typedef struct _Py_atomic_address {
volatile uintptr_t _value;
} _Py_atomic_address;

typedef struct _Py_atomic_int {
volatile int _value;
} _Py_atomic_int;


#if defined(_M_ARM64)
#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) \
switch (ORDER) { \
case _Py_memory_order_acquire: \
_InterlockedExchange64_acq((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
break; \
case _Py_memory_order_release: \
_InterlockedExchange64_rel((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
break; \
default: \
_InterlockedExchange64((__int64 volatile*)ATOMIC_VAL, (__int64)NEW_VAL); \
break; \
}
#else
#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) ((void)0);
#endif

#define _Py_atomic_store_32bit(ATOMIC_VAL, NEW_VAL, ORDER) \
switch (ORDER) { \
case _Py_memory_order_acquire: \
_InterlockedExchange_acq((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
break; \
case _Py_memory_order_release: \
_InterlockedExchange_rel((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
break; \
default: \
_InterlockedExchange((volatile long*)ATOMIC_VAL, (int)NEW_VAL); \
break; \
}

#if defined(_M_ARM64)
/* This has to be an intptr_t for now.
gil_created() uses -1 as a sentinel value; if this returned
a uintptr_t it would do an unsigned compare and crash.
*/
inline intptr_t _Py_atomic_load_64bit(volatile uintptr_t* value, int order) {
uintptr_t old;
switch (order) {
case _Py_memory_order_acquire:
{
do {
old = *value;
} while(_InterlockedCompareExchange64_acq(value, old, old) != old);
break;
}
case _Py_memory_order_release:
{
do {
old = *value;
} while(_InterlockedCompareExchange64_rel(value, old, old) != old);
break;
}
case _Py_memory_order_relaxed:
old = *value;
break;
default:
{
do {
old = *value;
} while(_InterlockedCompareExchange64(value, old, old) != old);
break;
}
}
return old;
}

#else
#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) *ATOMIC_VAL
#endif

inline int _Py_atomic_load_32bit(volatile int* value, int order) {
int old;
switch (order) {
case _Py_memory_order_acquire:
{
do {
old = *value;
} while(_InterlockedCompareExchange_acq(value, old, old) != old);
break;
}
case _Py_memory_order_release:
{
do {
old = *value;
} while(_InterlockedCompareExchange_rel(value, old, old) != old);
break;
}
case _Py_memory_order_relaxed:
old = *value;
break;
default:
{
do {
old = *value;
} while(_InterlockedCompareExchange(value, old, old) != old);
break;
}
}
return old;
}

#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
if (sizeof(*ATOMIC_VAL._value) == 8) { \
_Py_atomic_store_64bit(ATOMIC_VAL._value, NEW_VAL, ORDER) } else { \
_Py_atomic_store_32bit(ATOMIC_VAL._value, NEW_VAL, ORDER) }

#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
( \
sizeof(*(ATOMIC_VAL._value)) == 8 ? \
_Py_atomic_load_64bit(ATOMIC_VAL._value, ORDER) : \
_Py_atomic_load_32bit(ATOMIC_VAL._value, ORDER) \
)
#endif
#else /* !gcc x86 !_msc_ver */
typedef enum _Py_memory_order {
_Py_memory_order_relaxed,
_Py_memory_order_acquire,
_Py_memory_order_release,
_Py_memory_order_acq_rel,
_Py_memory_order_seq_cst
} _Py_memory_order;

typedef struct _Py_atomic_address {
uintptr_t _value;
} _Py_atomic_address;

typedef struct _Py_atomic_int {
int _value;
} _Py_atomic_int;
/* Fall back to other compilers and processors by assuming that simple
volatile accesses are atomic. This is false, so people should port
this. */
@@ -229,8 +517,6 @@ _Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
((ATOMIC_VAL)->_value = NEW_VAL)
#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
((ATOMIC_VAL)->_value)

#endif /* !gcc x86 */
#endif

/* Standardized shortcuts. */
@@ -245,6 +531,5 @@ _Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
_Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, _Py_memory_order_relaxed)
#define _Py_atomic_load_relaxed(ATOMIC_VAL) \
_Py_atomic_load_explicit(ATOMIC_VAL, _Py_memory_order_relaxed)

#endif /* Py_BUILD_CORE */
#endif /* Py_ATOMIC_H */
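
For context, a minimal sketch of how the macros shown above are typically used from runtime code (hypothetical variable names; not part of the patch):

/* Hypothetical usage sketch of the pyatomic.h macros. */
#include "pyatomic.h"

static _Py_atomic_int pending_flag;
static _Py_atomic_address shared_state;

static void
set_state(void *p)
{
    /* Release store: everything written to *p happens-before this store,
       so an acquire load of shared_state will see the data. */
    _Py_atomic_store_explicit(&shared_state, (uintptr_t)p,
                              _Py_memory_order_release);
    /* Relaxed store: the flag is only a cheap hint; ordering is carried
       by the release/acquire pair on shared_state. */
    _Py_atomic_store_relaxed(&pending_flag, 1);
}

static void *
get_state(void)
{
    if (_Py_atomic_load_relaxed(&pending_flag))
        return (void *)_Py_atomic_load_explicit(&shared_state,
                                                _Py_memory_order_acquire);
    return NULL;
}
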
@@ -0,0 +1,2 @@
Add a non-dummy implementation of _Py_atomic_store and _Py_atomic_load on
MSVC.