From 3e87c259e5847b536478ac2a47346c33a704e7cb Mon Sep 17 00:00:00 2001
From: Andrey Semashev
Date: Mon, 28 Dec 2020 23:37:46 +0300
Subject: [PATCH 1/3] Added optimized x86 atomic_fence for gcc-compatible compilers.

On x86 (32 and 64-bit) any lock-prefixed instruction provides sequential
consistency guarantees for atomic operations and is more efficient than
mfence.

We are choosing a "lock not" on a dummy byte on the stack for the following
reasons:

- The "not" instruction does not affect flags or clobber any registers. The
  memory operand is presumably accessible through esp/rsp.
- The dummy byte variable is at the top of the stack, which is likely hot in
  cache.
- The dummy variable does not alias any other data on the stack, which means
  the "lock not" instruction won't introduce any false data dependencies with
  prior or following instructions.

In order to avoid various sanitizers and valgrind complaining, we have to
initialize the dummy variable to zero prior to the operation.

Additionally, for memory orders weaker than seq_cst there is no need for any
special instructions, and we only need a compiler fence. For the relaxed
memory order we don't even need that.

This optimization is only enabled for gcc versions below 11. Starting with
gcc 11, the compiler implements a similar optimization for
std::atomic_thread_fence internally. Compilers compatible with gcc (namely,
clang up to 13 and icc up to 2021.3.0, inclusive) identify themselves as
gcc < 11 and also benefit from this optimization, as they otherwise generate
mfence for std::atomic_thread_fence(std::memory_order_seq_cst).

Signed-off-by: Andrey Semashev
---
 include/oneapi/tbb/detail/_machine.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/oneapi/tbb/detail/_machine.h b/include/oneapi/tbb/detail/_machine.h
index c85442fa3e..87b3520b24 100644
--- a/include/oneapi/tbb/detail/_machine.h
+++ b/include/oneapi/tbb/detail/_machine.h
@@ -84,6 +84,17 @@ using std::this_thread::yield;
 #endif
 
 static inline void atomic_fence(std::memory_order order) {
+#if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11
+    if (order == std::memory_order_seq_cst)
+    {
+        unsigned char dummy = 0u;
+        __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory");
+    }
+    else if (order != std::memory_order_relaxed)
+    {
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+#else
 #if _MSC_VER && (__TBB_x86_64 || __TBB_x86_32)
     if (order == std::memory_order_seq_cst ||
         order == std::memory_order_acq_rel ||
@@ -95,6 +106,7 @@ static inline void atomic_fence(std::memory_order order) {
     }
 #endif /*_MSC_VER && (__TBB_x86_64 || __TBB_x86_32)*/
     std::atomic_thread_fence(order);
+#endif
 }
 
 //--------------------------------------------------------------------------------------------------
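
A note on the choice of instruction: since any lock-prefixed read-modify-write
acts as a full barrier on x86, the classic alternative is a lock-prefixed add
of zero to the top of the stack. The standalone sketch below (illustrative
only, not part of the patch; the function name is made up) shows that idiom
next to a portable fallback and notes why the patch prefers a dedicated dummy
byte instead.

    #include <atomic>

    // Illustrative alternative: adding zero to the word at the top of the stack
    // is also a full memory barrier on x86. The downside is that it touches
    // whatever data currently sits at (%rsp)/(%esp) and can create a false
    // store dependency; the patch's dedicated dummy byte does not alias
    // anything else.
    static inline void full_fence_via_lock_add() {
    #if defined(__GNUC__) && defined(__x86_64__)
        __asm__ __volatile__ ("lock; addl $0, (%%rsp)" ::: "memory", "cc");
    #elif defined(__GNUC__) && defined(__i386__)
        __asm__ __volatile__ ("lock; addl $0, (%%esp)" ::: "memory", "cc");
    #else
        std::atomic_thread_fence(std::memory_order_seq_cst);  // portable fallback
    #endif
    }
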
From de51c6067d84daba0bd5fc47dff5c5731a85d0d8 Mon Sep 17 00:00:00 2001
From: Andrey Semashev
Date: Mon, 28 Dec 2020 23:58:25 +0300
Subject: [PATCH 2/3] Removed explicit mfence in atomic_fence on Windows.

std::atomic_thread_fence should already generate the instructions required by
the memory order argument, so the explicit _mm_mfence() call is redundant.

Signed-off-by: Andrey Semashev
---
 include/oneapi/tbb/detail/_machine.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/include/oneapi/tbb/detail/_machine.h b/include/oneapi/tbb/detail/_machine.h
index 87b3520b24..301e95e676 100644
--- a/include/oneapi/tbb/detail/_machine.h
+++ b/include/oneapi/tbb/detail/_machine.h
@@ -79,10 +79,6 @@ using std::this_thread::yield;
 // atomic_fence implementation
 //--------------------------------------------------------------------------------------------------
 
-#if _MSC_VER && (__TBB_x86_64 || __TBB_x86_32)
-#pragma intrinsic(_mm_mfence)
-#endif
-
 static inline void atomic_fence(std::memory_order order) {
 #if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11
     if (order == std::memory_order_seq_cst)
@@ -95,16 +91,6 @@ static inline void atomic_fence(std::memory_order order) {
         __asm__ __volatile__ ("" ::: "memory");
     }
 #else
-#if _MSC_VER && (__TBB_x86_64 || __TBB_x86_32)
-    if (order == std::memory_order_seq_cst ||
-        order == std::memory_order_acq_rel ||
-        order == std::memory_order_acquire ||
-        order == std::memory_order_release )
-    {
-        _mm_mfence();
-        return;
-    }
-#endif /*_MSC_VER && (__TBB_x86_64 || __TBB_x86_32)*/
     std::atomic_thread_fence(order);
 #endif
 }
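
The claim that std::atomic_thread_fence already emits the right barrier can be
checked with a minimal translation unit such as the sketch below (illustrative
only, not part of the patch; function names are made up): compile it with the
target compiler, e.g. MSVC with /O2, and inspect the disassembly to see which
instruction, if any, is generated for each memory order.

    #include <atomic>

    // Illustrative probe: each function isolates one memory order so the
    // generated barrier instruction (if any) is easy to spot in the disassembly.
    void fence_seq_cst() { std::atomic_thread_fence(std::memory_order_seq_cst); }
    void fence_acq_rel() { std::atomic_thread_fence(std::memory_order_acq_rel); }
    void fence_acquire() { std::atomic_thread_fence(std::memory_order_acquire); }
    void fence_release() { std::atomic_thread_fence(std::memory_order_release); }
    void fence_relaxed() { std::atomic_thread_fence(std::memory_order_relaxed); }
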
From 8feefce3cdbf6c4b16f06b71397f7af1b9be4d8e Mon Sep 17 00:00:00 2001
From: Andrey Semashev
Date: Fri, 26 Nov 2021 01:18:34 +0300
Subject: [PATCH 3/3] Removed memory order argument from atomic_fence.

The code uses memory_order_seq_cst in all call sites of atomic_fence, so
remove the argument and simplify the implementation a bit. Also, renamed the
function to make the memory order it implements apparent.

Signed-off-by: Andrey Semashev
---
 include/oneapi/tbb/detail/_machine.h | 17 +++++------------
 src/tbb/arena.h                      |  4 ++--
 src/tbb/concurrent_monitor.h         | 10 +++++-----
 3 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/include/oneapi/tbb/detail/_machine.h b/include/oneapi/tbb/detail/_machine.h
index 301e95e676..994d418ec1 100644
--- a/include/oneapi/tbb/detail/_machine.h
+++ b/include/oneapi/tbb/detail/_machine.h
@@ -76,22 +76,15 @@ using std::this_thread::yield;
 #endif
 
 //--------------------------------------------------------------------------------------------------
-// atomic_fence implementation
+// atomic_fence_seq_cst implementation
 //--------------------------------------------------------------------------------------------------
 
-static inline void atomic_fence(std::memory_order order) {
+static inline void atomic_fence_seq_cst() {
 #if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11
-    if (order == std::memory_order_seq_cst)
-    {
-        unsigned char dummy = 0u;
-        __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory");
-    }
-    else if (order != std::memory_order_relaxed)
-    {
-        __asm__ __volatile__ ("" ::: "memory");
-    }
+    unsigned char dummy = 0u;
+    __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory");
 #else
-    std::atomic_thread_fence(order);
+    std::atomic_thread_fence(std::memory_order_seq_cst);
 #endif
 }
 
diff --git a/src/tbb/arena.h b/src/tbb/arena.h
index 0479cf53e8..da125398fb 100644
--- a/src/tbb/arena.h
+++ b/src/tbb/arena.h
@@ -494,7 +494,7 @@ void arena::advertise_new_work() {
     };
 
     if( work_type == work_enqueued ) {
-        atomic_fence(std::memory_order_seq_cst);
+        atomic_fence_seq_cst();
 #if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
         if ( my_market->my_num_workers_soft_limit.load(std::memory_order_acquire) == 0 &&
              my_global_concurrency_mode.load(std::memory_order_acquire) == false )
@@ -508,7 +508,7 @@ void arena::advertise_new_work() {
         // Starvation resistant tasks require concurrency, so missed wakeups are unacceptable.
     }
     else if( work_type == wakeup ) {
-        atomic_fence(std::memory_order_seq_cst);
+        atomic_fence_seq_cst();
     }
     // Double-check idiom that, in case of spawning, is deliberately sloppy about memory fences.
diff --git a/src/tbb/concurrent_monitor.h b/src/tbb/concurrent_monitor.h
index b67158eed9..3d20ef5b98 100644
--- a/src/tbb/concurrent_monitor.h
+++ b/src/tbb/concurrent_monitor.h
@@ -220,7 +220,7 @@ class concurrent_monitor_base {
 
         // Prepare wait guarantees Write Read memory barrier.
         // In C++ only full fence covers this type of barrier.
-        atomic_fence(std::memory_order_seq_cst);
+        atomic_fence_seq_cst();
     }
 
     //! Commit wait if event count has not changed; otherwise, cancel wait.
@@ -272,7 +272,7 @@ class concurrent_monitor_base {
 
     //! Notify one thread about the event
    void notify_one() {
-        atomic_fence(std::memory_order_seq_cst);
+        atomic_fence_seq_cst();
         notify_one_relaxed();
     }
 
@@ -301,7 +301,7 @@ class concurrent_monitor_base {
 
     //! Notify all waiting threads of the event
     void notify_all() {
-        atomic_fence(std::memory_order_seq_cst);
+        atomic_fence_seq_cst();
         notify_all_relaxed();
     }
 
@@ -337,7 +337,7 @@ class concurrent_monitor_base {
 
     //! Notify waiting threads of the event that satisfies the given predicate
     template <typename P>
     void notify( const P& predicate ) {
-        atomic_fence(std::memory_order_seq_cst);
+        atomic_fence_seq_cst();
         notify_relaxed( predicate );
     }
 
@@ -409,7 +409,7 @@ class concurrent_monitor_base {
 
     //! Abort any sleeping threads at the time of the call
     void abort_all() {
-        atomic_fence( std::memory_order_seq_cst );
+        atomic_fence_seq_cst();
         abort_all_relaxed();
     }
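
All remaining call sites pair the fence with a double-check (Dekker-style)
pattern: one thread writes a flag and then reads the other thread's flag, so
only a full store-load barrier, i.e. a seq_cst fence, prevents a lost wakeup.
The self-contained sketch below is illustrative only, not TBB code; the flag
names are made up, and a local stand-in replaces the renamed helper.

    #include <atomic>

    // Local stand-in for the renamed TBB helper so this sketch compiles on its own.
    static inline void atomic_fence_seq_cst() {
        std::atomic_thread_fence(std::memory_order_seq_cst);
    }

    // Illustrative flags standing in for the monitor's real state.
    std::atomic<bool> waiter_registered{false};
    std::atomic<bool> work_available{false};

    // Waiter side (cf. prepare_wait): publish the intent to sleep, then re-check for work.
    bool should_sleep() {
        waiter_registered.store(true, std::memory_order_relaxed);
        atomic_fence_seq_cst();  // order the store above before the load below
        return !work_available.load(std::memory_order_relaxed);
    }

    // Notifier side (cf. notify_one/notify_all): publish the work, then check for waiters.
    bool must_wake_waiter() {
        work_available.store(true, std::memory_order_relaxed);
        atomic_fence_seq_cst();  // order the store above before the load below
        return waiter_registered.load(std::memory_order_relaxed);
    }

With a seq_cst fence between each thread's store and subsequent load, at least
one of the two loads is guaranteed to observe the other thread's store, so a
waiter that decides to sleep cannot be missed by a notifier that saw no
waiters.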