x86_64: Use lock or instead of mfence

Based on x86_32 64-bit atomic SeqCst store using SSE generated by LLVM. https://godbolt.org/z/9sKEr8YWc Equivalent to mfence, but is 10-35% faster at least in simple cases on Coffee Lake. Below are the results of the microbenchmark on an Intel Core i7-9750H (Coffee Lake) with the ORDERING constant set to SeqCst. ``` bench_portable_atomic_arch/u128_store time: [11.610 ns 11.670 ns 11.738 ns] change: [-36.119% -35.236% -34.348%] (p = 0.00 < 0.05) Performance has improved. Found 4 outliers among 100 measurements (4.00%) 4 (4.00%) high mild bench_portable_atomic_arch/u128_concurrent_load_store time: [202.30 µs 203.54 µs 205.24 µs] change: [-32.313% -31.167% -29.845%] (p = 0.00 < 0.05) Performance has improved. Found 8 outliers among 100 measurements (8.00%) 2 (2.00%) high mild 6 (6.00%) high severe bench_portable_atomic_arch/u128_concurrent_store time: [395.74 µs 397.37 µs 398.98 µs] change: [-18.517% -17.560% -16.582%] (p = 0.00 < 0.05) Performance has improved. Found 15 outliers among 100 measurements (15.00%) 1 (1.00%) low mild 2 (2.00%) high mild 12 (12.00%) high severe bench_portable_atomic_arch/u128_concurrent_store_swap time: [791.21 µs 793.43 µs 795.69 µs] change: [-10.682% -10.197% -9.6789%] (p = 0.00 < 0.05) Performance has improved. Found 7 outliers among 100 measurements (7.00%) 2 (2.00%) low severe 2 (2.00%) low mild 2 (2.00%) high mild 1 (1.00%) high severe ```
taiki-e · Mar 21, 2024 · 6267661 · 6267661
1 parent 573e025
commit 6267661
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 8 deletions.
diff --git a/bench/benches/bench.rs b/bench/benches/bench.rs
@@ -51,6 +51,8 @@ mod spinlock_fallback;
 
 const THREADS: usize = 2;
 const N: u32 = 5000;
+const ORDERING: Ordering = Ordering::AcqRel;
+// const ORDERING: Ordering = Ordering::SeqCst;
 
 trait AtomicInt<T: Copy>: Sized + Send + Sync {
     fn new(v: T) -> Self;
@@ -69,24 +71,32 @@ macro_rules! impl_atomic {
             }
             #[inline]
             fn load_(&self) -> $int_type {
-                self.load(Ordering::Acquire)
+                self.load(if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING })
             }
             #[inline]
             fn store_(&self, val: $int_type) {
-                self.store(val, Ordering::Release);
+                self.store(
+                    val,
+                    if ORDERING == Ordering::AcqRel { Ordering::Release } else { ORDERING },
+                );
             }
             #[inline]
             fn swap_(&self, val: $int_type) -> $int_type {
-                self.swap(val, Ordering::AcqRel)
+                self.swap(val, ORDERING)
             }
             #[inline]
             fn compare_exchange_(&self, old: $int_type, new: $int_type) -> $int_type {
-                self.compare_exchange(old, new, Ordering::AcqRel, Ordering::Acquire)
-                    .unwrap_or_else(|x| x)
+                self.compare_exchange(
+                    old,
+                    new,
+                    ORDERING,
+                    if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING },
+                )
+                .unwrap_or_else(|x| x)
             }
             #[inline]
             fn fetch_add_(&self, val: $int_type) -> $int_type {
-                self.fetch_add(val, Ordering::AcqRel)
+                self.fetch_add(val, ORDERING)
             }
         }
     };
@@ -97,6 +107,7 @@ macro_rules! impl_atomic_no_order {
         impl AtomicInt<$int_type> for $atomic_type {
             #[inline]
             fn new(v: $int_type) -> Self {
+                assert_eq!(ORDERING, Ordering::AcqRel);
                 Self::new(v)
             }
             #[inline]

diff --git a/src/imp/atomic128/x86_64.rs b/src/imp/atomic128/x86_64.rs
@@ -68,6 +68,23 @@ macro_rules! ptr_modifier {
     };
 }
 
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[cfg(target_pointer_width = "32")]
+macro_rules! sp {
+    () => {
+        "esp"
+    };
+}
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[cfg(target_pointer_width = "64")]
+macro_rules! sp {
+    () => {
+        "rsp"
+    };
+}
+
 // Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
 // requirements for the currently enabled target features. In the first place,
 // there is no option in the x86 assembly for such case, like ARM .arch_extension,
@@ -185,10 +202,14 @@ unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
             Ordering::SeqCst => {
                 asm!(
                     concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
-                    "mfence",
+                    // Equivalent to mfence, but is 10-35% faster at least in simple cases on Coffee Lake (https://github.com/taiki-e/portable-atomic/pull/156).
+                    // Based on x86_32 64-bit atomic SeqCst store using SSE generated by LLVM.
+                    // https://godbolt.org/z/9sKEr8YWc
+                    concat!("lock or dword ptr [", sp!(), "], 0"),
                     dst = in(reg) dst,
                     val = in(xmm_reg) val,
-                    options(nostack, preserves_flags),
+                    // Do not use `preserves_flags` because OR modifies the OF, CF, SF, ZF, and PF flags.
+                    options(nostack),
                 );
             }
             _ => unreachable!(),