Skip to content

Commit

Permalink
x86_64: Use lock or instead of mfence
Browse files Browse the repository at this point in the history
Based on x86_32 64-bit atomic SeqCst store using SSE generated by LLVM.
https://godbolt.org/z/9sKEr8YWc

Equivalent to mfence, but is 10-35% faster at least in simple cases on
Coffee Lake.

Below are the results of the microbenchmark on an Intel Core i7-9750H
(Coffee Lake) with the ORDERING constant set to SeqCst.

```
bench_portable_atomic_arch/u128_store
                        time:   [11.610 ns 11.670 ns 11.738 ns]
                        change: [-36.119% -35.236% -34.348%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild
bench_portable_atomic_arch/u128_concurrent_load_store
                        time:   [202.30 µs 203.54 µs 205.24 µs]
                        change: [-32.313% -31.167% -29.845%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  2 (2.00%) high mild
  6 (6.00%) high severe
bench_portable_atomic_arch/u128_concurrent_store
                        time:   [395.74 µs 397.37 µs 398.98 µs]
                        change: [-18.517% -17.560% -16.582%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 15 outliers among 100 measurements (15.00%)
  1 (1.00%) low mild
  2 (2.00%) high mild
  12 (12.00%) high severe
bench_portable_atomic_arch/u128_concurrent_store_swap
                        time:   [791.21 µs 793.43 µs 795.69 µs]
                        change: [-10.682% -10.197% -9.6789%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) low severe
  2 (2.00%) low mild
  2 (2.00%) high mild
  1 (1.00%) high severe
```
  • Loading branch information
taiki-e committed Mar 21, 2024
1 parent 573e025 commit 6267661
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 8 deletions.
23 changes: 17 additions & 6 deletions bench/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ mod spinlock_fallback;

const THREADS: usize = 2;
const N: u32 = 5000;
const ORDERING: Ordering = Ordering::AcqRel;
// const ORDERING: Ordering = Ordering::SeqCst;

trait AtomicInt<T: Copy>: Sized + Send + Sync {
fn new(v: T) -> Self;
Expand All @@ -69,24 +71,32 @@ macro_rules! impl_atomic {
}
#[inline]
fn load_(&self) -> $int_type {
self.load(Ordering::Acquire)
self.load(if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING })
}
#[inline]
fn store_(&self, val: $int_type) {
self.store(val, Ordering::Release);
self.store(
val,
if ORDERING == Ordering::AcqRel { Ordering::Release } else { ORDERING },
);
}
#[inline]
fn swap_(&self, val: $int_type) -> $int_type {
self.swap(val, Ordering::AcqRel)
self.swap(val, ORDERING)
}
#[inline]
fn compare_exchange_(&self, old: $int_type, new: $int_type) -> $int_type {
self.compare_exchange(old, new, Ordering::AcqRel, Ordering::Acquire)
.unwrap_or_else(|x| x)
self.compare_exchange(
old,
new,
ORDERING,
if ORDERING == Ordering::AcqRel { Ordering::Acquire } else { ORDERING },
)
.unwrap_or_else(|x| x)
}
#[inline]
fn fetch_add_(&self, val: $int_type) -> $int_type {
self.fetch_add(val, Ordering::AcqRel)
self.fetch_add(val, ORDERING)
}
}
};
Expand All @@ -97,6 +107,7 @@ macro_rules! impl_atomic_no_order {
impl AtomicInt<$int_type> for $atomic_type {
#[inline]
fn new(v: $int_type) -> Self {
assert_eq!(ORDERING, Ordering::AcqRel);
Self::new(v)
}
#[inline]
Expand Down
25 changes: 23 additions & 2 deletions src/imp/atomic128/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,23 @@ macro_rules! ptr_modifier {
};
}

#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! sp {
() => {
"esp"
};
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! sp {
() => {
"rsp"
};
}

// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements for the currently enabled target features. In the first place,
// there is no option in the x86 assembly for such case, like ARM .arch_extension,
Expand Down Expand Up @@ -185,10 +202,14 @@ unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
Ordering::SeqCst => {
asm!(
concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
"mfence",
// Equivalent to mfence, but is 10-35% faster at least in simple cases on Coffee Lake (https://github.com/taiki-e/portable-atomic/pull/156).
// Based on x86_32 64-bit atomic SeqCst store using SSE generated by LLVM.
// https://godbolt.org/z/9sKEr8YWc
concat!("lock or dword ptr [", sp!(), "], 0"),
dst = in(reg) dst,
val = in(xmm_reg) val,
options(nostack, preserves_flags),
// Do not use `preserves_flags` because OR modifies the OF, CF, SF, ZF, and PF flags.
options(nostack),
);
}
_ => unreachable!(),
Expand Down

0 comments on commit 6267661

Please sign in to comment.