diff --git a/bwosqueue/Cargo.toml b/bwosqueue/Cargo.toml new file mode 100644 index 00000000000..317d375011a --- /dev/null +++ b/bwosqueue/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "bwosqueue" +version = "1.0.0" +edition = "2018" + +[dependencies] +# msrv currently 1.31 -> okay! +crossbeam-utils = { version = "0.8", default-features = false} +# MSRV # 1.51 +array-init = "2.1" + + +[dev-dependencies] +criterion = { version = "0.4.0", features = ["html_reports"] } +core_affinity = "0.7.6" +rand = "0.8.5" +tracing = "0.1.37" + + +[target.'cfg(loom)'.dependencies] +loom = "0.5" + +[features] +default = ["std"] +std = [] +unstable = [] +stats = [] + + +[[bench]] +name = "bench" +harness = false + +[profile.bench] +debug-assertions = false +lto = true +opt-level = 3 diff --git a/bwosqueue/benches/bench.rs b/bwosqueue/benches/bench.rs new file mode 100644 index 00000000000..cbc9b87aa14 --- /dev/null +++ b/bwosqueue/benches/bench.rs @@ -0,0 +1,616 @@ +//! Microbenchmarks to benchmark the BWoS queue and compare to the original queue in tokio. +//! +//! Please note that the tokio queue stores `task::Notified` items, which boils down to a +//! `NonNull` pointer, so we benchmark with a u64 as the queue item. + +use core::sync::atomic::Ordering::{Acquire, Relaxed, Release, SeqCst}; +use std::arch::asm; +use std::sync::atomic::fence; +use std::time::{Duration, Instant}; +use std::{ + sync::{ + atomic::{AtomicBool, AtomicUsize}, + Arc, + }, + thread::{self}, +}; + +#[path = "support/original_tokio_queue.rs"] +mod original_tokio_queue; + +#[path = "support/original_bwos.rs"] +mod original_bwos; + +use bwosqueue::{Owner, Stealer}; +use criterion::{ + black_box, criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, BenchmarkId, + Criterion, Throughput, +}; + +fn bench_stealing(c: &mut Criterion) { + let mut stealer_group = c.benchmark_group("Stealing"); + let bwos_single = QueueType::BwosStealSingleItems; + let bwos_block = QueueType::BwosStealBlocks; + let tokio_q_single = QueueType::TokioStealSingleItems; + let tokio_q_batch = QueueType::TokioStealHalf; + + bench_steal::<8, 32>(&mut stealer_group, tokio_q_single, 0); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_single, 1); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_single, 2); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_batch, 1); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_batch, 2); + + bench_steal::<8, 32>(&mut stealer_group, bwos_single, 0); + bench_steal::<8, 32>(&mut stealer_group, bwos_single, 1); + bench_steal::<8, 32>(&mut stealer_group, bwos_block, 0); + bench_steal::<8, 32>(&mut stealer_group, bwos_block, 1); +} + +fn simple_enqueue_dequeue(c: &mut Criterion) { + let mut group = c.benchmark_group("Simple Enqueue Dequeue"); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 32 }>(&mut group); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 128 }>(&mut group); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 512 }>(&mut group); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 1024 }>(&mut group); + simple_enqueue_dequeue_inner::<8, 32>(&mut group); + simple_enqueue_dequeue_inner::<8, 128>(&mut group); + simple_enqueue_dequeue_inner::<8, 512>(&mut group); + simple_enqueue_dequeue_inner::<8, 1024>(&mut group); + simple_enqueue_dequeue_inner::<32, 8>(&mut group); + simple_enqueue_dequeue_inner::<128, 2>(&mut group); + simple_enqueue_dequeue_inner::<256, 1>(&mut group); + simple_enqueue_dequeue_original_bwos_queue(&mut group); +} + +#[inline(never)] +fn bwos_enq_deq(owner: &mut 
bwosqueue::Owner) { + while owner.enqueue(black_box(5)).is_ok() {} + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(black_box(val), 5_u64); + } else { + break; + }; + } +} + +fn simple_enqueue_dequeue_inner( + group: &mut BenchmarkGroup, +) { + let (mut owner, _) = bwosqueue::new::(); + let num_elements = NE; + + group.throughput(Throughput::Elements((NB * NE * 2) as u64)); + group.bench_with_input( + BenchmarkId::new( + format!("BWoS {NE} Elems per Block"), + format!("{} Total size", NB * NE), + ), + &num_elements, + |b, _num_elements| { + b.iter(|| { + bwos_enq_deq(&mut owner); + }); + #[cfg(feature = "stats")] + assert!(!owner.can_consume()) + }, + ); +} + +fn simple_enqueue_dequeue_original_queue_inner( + group: &mut BenchmarkGroup, +) { + let (_, mut owner) = original_tokio_queue::local::(); + group.throughput(Throughput::Elements((SIZE * 2) as u64)); + // todo: we could do a binary search here by doing dry runs to determine how much + // idle time we need to reach a certain stealing percentage + group.bench_with_input( + BenchmarkId::new("Original tokio queue", format!("{SIZE} Total size")), + &SIZE, + |b, _num_elements| { + b.iter(|| { + while owner.push_back(black_box(5)).is_ok() {} + loop { + if let Some(val) = owner.pop() { + assert_eq!(black_box(val), 5_u64); + } else { + break; + }; + } + }); + assert!(!owner.has_tasks()) + }, + ); + + // one full enqueue + one full dequeue +} + +#[inline(never)] +fn simple_enqueue_dequeue_original_bwos_queue(group: &mut BenchmarkGroup) { + #[inline(never)] + fn enq_deq(prod: &mut original_bwos::Producer, cons: &mut original_bwos::Consumer) { + while prod.enqueue(black_box(5)) {} + loop { + if let Some(val) = cons.dequeue() { + assert_eq!(black_box(val), 5_u64); + } else { + break; + }; + } + } + let (mut prod, mut cons, _stealer) = original_bwos::new(); + + const SIZE: u64 = (1024 * 8) as u64; + group.throughput(Throughput::Elements(SIZE * 2)); + group.bench_with_input( + BenchmarkId::new("Unsafe Rust: enq-deq", format!("{SIZE} Total size")), + &SIZE, + |b, _num_elements| { + b.iter(|| { + enq_deq(&mut prod, &mut cons); + }); + // use owner outside of iter to control drop + assert!(cons.dequeue().is_none()) + }, + ); + + // one full enqueue + one full dequeue +} + +enum StealKind { + BwosSingleSteal(Stealer), + BwosBlockSteal(Stealer), + // Just use 8K for the tokio queue, since the size doesn't really matter for this queue and we can't + // use generic const expressions here. + TokioSingleSteal(original_tokio_queue::Steal), + TokioBatchSteal(original_tokio_queue::Steal), + + // This version also currently has 8K hardcoded, and it's not really worth it to modify it, since it is + // only used as a baseline for benchmarks. + BwosUnsafeSingleSteal(original_bwos::Stealer), +} + +struct StealTest { + owner: QueueOwner, + stealer: StealKind, + params: StealTestParams, +} + +#[derive(Clone)] +struct StealTestParams { + num_stealers: usize, + num_ready_stealers: Arc, + start: Arc, + stop: Arc, + stealer_idle_ops: usize, +} + +fn bwos_steal_block_thread( + stealer: Stealer, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut idle_iterations: u64 = 0; + loop { + if let Some(stolen_iter) = stealer.steal_block() { + for element in stolen_iter { + assert_eq!(element, 5); + } + // Stealing should be a "rare" operation, so configure the stealing frequency by waiting for + // a while after a successfully stolen item. 
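+ // The `nop` busy-wait below throttles the stealer without touching the queue or
+ // yielding to the OS scheduler, so the idle time stays roughly constant per steal.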
+ for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + // Only check the atomic variable once in a while to reduce the overhead. + idle_iterations = idle_iterations.wrapping_add(1); + if (idle_iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn bwos_steal_single_item_thread( + stealer: Stealer, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if let Some(val) = stealer.steal() { + assert_eq!(val, 5); + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn bwos_unsafe_steal_single_item_thread( + mut stealer: original_bwos::Stealer, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if let Some(val) = stealer.steal() { + assert_eq!(val, 5); + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn tokio_q_steal_block_thread( + stealer: original_tokio_queue::Steal, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if stealer.bench_tokio_q_steal(1024) != 0 { + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn tokio_q_steal_single_item_thread( + stealer: original_tokio_queue::Steal, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if let Some(val) = stealer.bench_tokio_steal_single() { + assert_eq!(val, 5); + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +/// Sets up stealers to only steals items, without enqueuing them into a different queue. +/// This allows us to measure only the overhead of the stealing operation, without +/// any side effects from an enqueue into a different queue. +fn setup_stealers(steal_test: &StealTest) { + let params = &steal_test.params; + // ensure any remaining stealer threads from previous run have shutdown. 
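+ // Each stealer thread decrements `num_ready_stealers` when it observes `stop`, so
+ // spinning until the counter reaches zero guarantees no stale stealer is still active.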
+ while params.num_ready_stealers.load(SeqCst) != 0 {} + params.stop.store(false, SeqCst); + params.start.store(0, SeqCst); + + assert_eq!(params.num_ready_stealers.load(SeqCst), 0); + for _ in 0..params.num_stealers { + let l_params = params.clone(); + + match &steal_test.stealer { + StealKind::BwosSingleSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| bwos_steal_single_item_thread(l_stealer, l_params)); + } + StealKind::BwosBlockSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| bwos_steal_block_thread(l_stealer, l_params)); + } + StealKind::TokioSingleSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| tokio_q_steal_single_item_thread(l_stealer, l_params)); + } + StealKind::TokioBatchSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| tokio_q_steal_block_thread(l_stealer, l_params)); + } + StealKind::BwosUnsafeSingleSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| bwos_unsafe_steal_single_item_thread(l_stealer, l_params)); + } + } + } + while params.num_ready_stealers.load(Acquire) != params.num_stealers {} +} + +// Owner thread implementation which enqueues for a configurable amount of items +// as fast as possible, dequeuing until empty once the queue is full. +fn bwos_owner_thread( + owner: &mut Owner, + num_enqueues: u64, + total_enqueues: &mut u64, + total_dequeues: &mut u64, +) -> Duration { + let mut enq_count: u64 = 0; + let mut deq_count: u64 = 0; + let start = Instant::now(); + + while enq_count < num_enqueues { + while owner.enqueue(black_box(5)).is_ok() { + enq_count += 1; + if enq_count >= num_enqueues { + break; + } + } + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(black_box(val), 5); + deq_count += 1; + } else { + break; + }; + } + } + // This adds some additional overhead even with 0 stealers compared to the simple enq/deq benchmark. + while owner.has_stealers() {} + let duration = start.elapsed(); + + debug_assert!( + enq_count >= deq_count, + "enq: {}, deq: {}", + enq_count, + deq_count + ); + *total_enqueues += enq_count; + *total_dequeues += deq_count; + duration +} + +fn original_bwos_owner_thread( + producer: &mut original_bwos::Producer, + consumer: &mut original_bwos::Consumer, + num_enqueues: u64, + total_enqueues: &mut u64, + total_dequeues: &mut u64, +) -> Duration { + let mut enq_count: u64 = 0; + let mut deq_count: u64 = 0; + let start = Instant::now(); + + while enq_count < num_enqueues { + while producer.enqueue(black_box(5)) { + enq_count += 1; + if enq_count >= num_enqueues { + break; + } + } + loop { + if let Some(val) = consumer.dequeue() { + assert_eq!(black_box(val), 5); + deq_count += 1; + } else { + break; + }; + } + } + // No implementation to check for stealers, so just skip this here. 
+ //while owner.has_stealers() {} + let duration = start.elapsed(); + + debug_assert!( + enq_count >= deq_count, + "enq: {}, deq: {}", + enq_count, + deq_count + ); + *total_enqueues += enq_count; + *total_dequeues += deq_count; + duration +} + +fn tokio_q_owner_thread( + owner: &mut original_tokio_queue::Local, + num_enqueues: u64, + total_enqueues: &mut u64, + total_dequeues: &mut u64, +) -> Duration { + let mut enq_count: u64 = 0; + let mut deq_count: u64 = 0; + let start = Instant::now(); + + while enq_count < num_enqueues { + while owner.push_back(black_box(5)).is_ok() { + enq_count += 1; + if enq_count >= num_enqueues { + break; + } + } + loop { + if let Some(val) = owner.pop() { + assert_eq!(black_box(val), 5_u64); + deq_count += 1; + } else { + break; + }; + } + } + while owner.has_stealers() {} + + let duration = start.elapsed(); + + debug_assert!( + enq_count >= deq_count, + "enq: {}, deq: {}", + enq_count, + deq_count + ); + *total_enqueues += enq_count; + *total_dequeues += deq_count; + duration +} + +#[derive(Copy, Clone, Debug)] +enum QueueType { + BwosStealSingleItems, + BwosStealBlocks, + TokioStealSingleItems, + // Default tokio configuration + TokioStealHalf, + BwosUnsafe, +} + +enum QueueOwner { + Bwos(bwosqueue::Owner), + Tokio(original_tokio_queue::Local), + BwosUnsafe((original_bwos::Producer, original_bwos::Consumer)), +} + +fn bench_steal( + group: &mut BenchmarkGroup, + queue_type: QueueType, + num_stealers: usize, +) { + let setup_params = StealTestParams { + num_stealers, + num_ready_stealers: Arc::new(AtomicUsize::new(0)), + stealer_idle_ops: 5000, + start: Arc::new(AtomicUsize::new(0)), + stop: Arc::new(AtomicBool::new(false)), + }; + let mut test_configuration = match queue_type { + QueueType::BwosStealBlocks => { + let (owner, stealer) = bwosqueue::new::(); + StealTest { + owner: QueueOwner::Bwos(owner), + stealer: StealKind::BwosBlockSteal(stealer), + params: setup_params, + } + } + QueueType::BwosStealSingleItems => { + let (owner, stealer) = bwosqueue::new::(); + StealTest { + owner: QueueOwner::Bwos(owner), + stealer: StealKind::BwosSingleSteal(stealer), + params: setup_params, + } + } + QueueType::TokioStealSingleItems => { + let (stealer, owner) = original_tokio_queue::local(); + StealTest { + owner: QueueOwner::Tokio(owner), + stealer: StealKind::TokioSingleSteal(stealer), + params: setup_params, + } + } + QueueType::TokioStealHalf => { + let (stealer, owner) = original_tokio_queue::local(); + StealTest { + owner: QueueOwner::Tokio(owner), + stealer: StealKind::TokioBatchSteal(stealer), + params: setup_params, + } + } + QueueType::BwosUnsafe => { + let (producer, consumer, stealer) = original_bwos::new(); + StealTest { + owner: QueueOwner::BwosUnsafe((producer, consumer)), + stealer: StealKind::BwosUnsafeSingleSteal(stealer), + params: setup_params, + } + } + }; + + let enqueue_iterations = 1; + // One enqueue + One dequeue is the base throughput. This is scaled up by the number of iterations + // criterion determines and additional enqueue_iterations, since otherwise the time span is too short + group.throughput(Throughput::Elements(enqueue_iterations * 2)); + let mut total_enqueues: u64 = 0; + let mut total_dequeues: u64 = 0; + group.bench_with_input( + // todo: precalculate expected stealing percentage and make that the parameter! 
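+ // For now the benchmark parameter only encodes the stealer count; the measured steal
+ // percentage is printed via `eprintln!` after the run instead.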
+ BenchmarkId::new( + format!("{queue_type:?}"), + format!("{num_stealers} stealers"), + ), + &(), + |b, _| { + setup_stealers(&test_configuration); + test_configuration.params.start.store(1, Release); + fence(SeqCst); + match &mut test_configuration.owner { + QueueOwner::Bwos(owner) => { + b.iter_custom(|num_iters| { + bwos_owner_thread( + owner, + num_iters * enqueue_iterations, + &mut total_enqueues, + &mut total_dequeues, + ) + }); + } + QueueOwner::BwosUnsafe((producer, consumer)) => { + b.iter_custom(|num_iters| { + original_bwos_owner_thread( + producer, + consumer, + num_iters * enqueue_iterations, + &mut total_enqueues, + &mut total_dequeues, + ) + }); + } + QueueOwner::Tokio(owner) => { + b.iter_custom(|num_iters| { + tokio_q_owner_thread( + owner, + num_iters * enqueue_iterations, + &mut total_enqueues, + &mut total_dequeues, + ) + }); + } + } + + test_configuration.params.stop.store(true, Relaxed); + test_configuration.params.start.store(2, Release); + }, + ); + let steal_percentage = if total_enqueues == total_dequeues { + "0%".to_string() + } else { + let p = ((total_enqueues - total_dequeues) as f64 / total_enqueues as f64) * 100.0; + format!("{:.1}%", p) + }; + eprintln!("Steal percentage: {steal_percentage}"); +} + +criterion_group! {name = benches; +config = Criterion::default(); +targets = simple_enqueue_dequeue, bench_stealing} +criterion_main!(benches); diff --git a/bwosqueue/benches/support/loom/atomic_u16.rs b/bwosqueue/benches/support/loom/atomic_u16.rs new file mode 100644 index 00000000000..c1c531208c2 --- /dev/null +++ b/bwosqueue/benches/support/loom/atomic_u16.rs @@ -0,0 +1,44 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU16` providing an additional `load_unsync` function. +pub(crate) struct AtomicU16 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU16 {} +unsafe impl Sync for AtomicU16 {} + +impl AtomicU16 { + pub(crate) const fn new(val: u16) -> AtomicU16 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU16::new(val)); + AtomicU16 { inner } + } + + /// Performs an unsynchronized load. + /// + /// # Safety + /// + /// All mutations must have happened before the unsynchronized load. + /// Additionally, there must be no concurrent mutations. + pub(crate) unsafe fn unsync_load(&self) -> u16 { + *(*self.inner.get()).get_mut() + } +} + +impl Deref for AtomicU16 { + type Target = std::sync::atomic::AtomicU16; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU16 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/benches/support/loom/atomic_u32.rs b/bwosqueue/benches/support/loom/atomic_u32.rs new file mode 100644 index 00000000000..61f95fb30ce --- /dev/null +++ b/bwosqueue/benches/support/loom/atomic_u32.rs @@ -0,0 +1,34 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU32` providing an additional `load_unsync` function. 
+pub(crate) struct AtomicU32 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU32 {} +unsafe impl Sync for AtomicU32 {} + +impl AtomicU32 { + pub(crate) const fn new(val: u32) -> AtomicU32 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU32::new(val)); + AtomicU32 { inner } + } +} + +impl Deref for AtomicU32 { + type Target = std::sync::atomic::AtomicU32; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU32 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/benches/support/loom/mod.rs b/bwosqueue/benches/support/loom/mod.rs new file mode 100644 index 00000000000..36c46375602 --- /dev/null +++ b/bwosqueue/benches/support/loom/mod.rs @@ -0,0 +1,3 @@ +pub mod atomic_u16; +pub mod atomic_u32; +pub mod unsafe_cell; diff --git a/bwosqueue/benches/support/loom/unsafe_cell.rs b/bwosqueue/benches/support/loom/unsafe_cell.rs new file mode 100644 index 00000000000..66c1d7943e0 --- /dev/null +++ b/bwosqueue/benches/support/loom/unsafe_cell.rs @@ -0,0 +1,16 @@ +#[derive(Debug)] +pub(crate) struct UnsafeCell(std::cell::UnsafeCell); + +impl UnsafeCell { + pub(crate) const fn new(data: T) -> UnsafeCell { + UnsafeCell(std::cell::UnsafeCell::new(data)) + } + + pub(crate) fn with(&self, f: impl FnOnce(*const T) -> R) -> R { + f(self.0.get()) + } + + pub(crate) fn with_mut(&self, f: impl FnOnce(*mut T) -> R) -> R { + f(self.0.get()) + } +} diff --git a/bwosqueue/benches/support/original_bwos.rs b/bwosqueue/benches/support/original_bwos.rs new file mode 100644 index 00000000000..921d76b8409 --- /dev/null +++ b/bwosqueue/benches/support/original_bwos.rs @@ -0,0 +1,368 @@ +#![allow(dead_code)] +use array_init::array_init; +use crossbeam_utils::CachePadded; +use std::cell::UnsafeCell; +use std::cmp::max; +use std::marker::{Send, Sync}; +use std::mem::MaybeUninit; +use std::ptr::null_mut; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering::{Acquire, Relaxed, Release, SeqCst}; +use std::sync::Arc; + +const NB: usize = 8; +const NE: usize = 1024; +const NB_LOG: usize = 3; +const NE_LOG: usize = 11; + +#[inline(always)] +fn wsq_global_idx(v: u64) -> u64 { + return v & ((1 << NB_LOG) - 1); +} + +#[inline(always)] +fn wsq_local_idx(v: u64) -> u64 { + return v & ((1 << NE_LOG) - 1); +} + +#[inline(always)] +fn wsq_local_vsn(v: u64) -> u64 { + return v >> NE_LOG; +} + +#[inline(always)] +fn wsq_local_compose(h: u64, l: u64) -> u64 { + return (h << NE_LOG) | l; +} + +#[inline(always)] +fn advance(v: &AtomicU64, old_v: u64) { + let _ = v.compare_exchange_weak(old_v, old_v + 1, Relaxed, Relaxed); +} + +struct BlockConfig { + beginning: u8, + prev: *mut Block, + next: *mut Block, +} + +struct Block { + /// producer + committed: CachePadded, + /// consumer + consumed: CachePadded, + /// stealer-head + reserved: CachePadded, + /// stealer-tail + stealed: CachePadded, + conf: CachePadded>, + entries: CachePadded; NE]>>, +} + +struct BwsQueue { + pcache: CachePadded<*mut Block>, + spos: CachePadded, + ccache: CachePadded<*mut Block>, + blocks: CachePadded<[UnsafeCell>; NB]>, +} + +unsafe impl Send for BwsQueue {} +unsafe impl Sync for BwsQueue {} + +impl BlockConfig { + fn new(idx: usize) -> BlockConfig { + BlockConfig { + beginning: if idx == 0 { 1 } else { 0 }, + prev: null_mut(), + next: null_mut(), + } + } +} + +impl Block { + fn new(idx: usize) -> Block { 
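+ // Block 0 starts at version 1 with empty producer/consumer cursors (index 0) and
+ // exhausted stealer cursors (index NE). All other blocks start at version 0 with every
+ // cursor at NE, i.e. fully used up, so the producer must first advance their version.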
+ let empty_val: u64 = if idx != 0 { + NE as u64 + } else { + wsq_local_compose(1, 0) + }; + let full_val: u64 = if idx != 0 { + NE as u64 + } else { + wsq_local_compose(1, NE as u64) + }; + Block { + committed: CachePadded::new(AtomicU64::new(empty_val)), + consumed: CachePadded::new(AtomicU64::new(empty_val)), + reserved: CachePadded::new(AtomicU64::new(full_val)), + stealed: CachePadded::new(AtomicU64::new(full_val)), + conf: CachePadded::new(BlockConfig::::new(idx)), + entries: CachePadded::new(MaybeUninit::uninit()), + } + } + + #[inline(always)] + fn is_consumed(&mut self, vsn: u64) -> bool { + let consumed: u64 = self.consumed.load(SeqCst); + return (wsq_local_idx(consumed) == NE as u64 && wsq_local_vsn(consumed) == vsn) + || wsq_local_vsn(consumed) > vsn; + } + + #[inline(always)] + fn is_stealed(&mut self) -> bool { + let stealed: u64 = self.stealed.load(SeqCst); + return wsq_local_idx(stealed) == NE as u64; + } +} + +impl BwsQueue { + fn new() -> BwsQueue { + BwsQueue { + pcache: CachePadded::new(null_mut()), + spos: CachePadded::new(AtomicU64::new(0)), + ccache: CachePadded::new(null_mut()), + blocks: CachePadded::new(array_init(|idx| UnsafeCell::new(Block::new(idx)))), + } + } + + #[inline(always)] + #[allow(dead_code)] + fn is_empty(&self) -> bool { + unsafe { + /* fast check */ + let blk = self.ccache.into_inner(); + let consumed: u64 = (*blk).consumed.load(SeqCst); + let committed: u64 = (*blk).committed.load(SeqCst); + return committed == consumed; + } + } +} + +pub struct Producer { + queue: Arc>, +} + +impl Clone for Producer { + fn clone(&self) -> Self { + Producer { + queue: self.queue.clone(), + } + } +} + +impl Producer { + #[inline(always)] + pub fn enqueue(&mut self, t: E) -> bool { + unsafe { + loop { + /* get the address of the alloc block */ + let blk: *mut Block = self.queue.pcache.into_inner(); + + /* precheck once */ + let committed: u64 = (*blk).committed.load(Relaxed); + let committed_idx: u64 = wsq_local_idx(committed); + + /* if out of bound, we don't add the space, but help to move the block */ + if committed_idx < NE as u64 { + /* copy the data into the entry and commit it */ + std::ptr::write( + (*(*blk).entries.as_mut_ptr())[committed_idx as usize].get(), + t, + ); + (*blk).committed.store(committed + 1, Release); + return true; + } + + /* slow path, all writers help to move to next block */ + let nblk: *mut Block = (*blk).conf.next; + let next_vsn: u64 = wsq_local_vsn(committed) + (*nblk).conf.beginning as u64; + + /* check if next block is ready */ + if !(*nblk).is_consumed(next_vsn - 1) { + return false; + }; + if !(*nblk).is_stealed() { + return false; + }; + + /* reset cursor and advance block */ + let new_cursor: u64 = wsq_local_compose(next_vsn, 0); + (*nblk).committed.store(new_cursor, Relaxed); + (*nblk).stealed.store(new_cursor, Relaxed); + (*nblk).reserved.store(new_cursor, Release); + let q: *mut BwsQueue = Arc::as_ptr(&self.queue) as *mut _; + (*q).pcache = CachePadded::new(nblk); + } + } + } +} + +pub struct Consumer { + queue: Arc>, +} + +impl Clone for Consumer { + fn clone(&self) -> Self { + Consumer { + queue: self.queue.clone(), + } + } +} + +impl Consumer { + #[inline(always)] + pub fn dequeue(&mut self) -> Option { + unsafe { + loop { + /* get the current block */ + let blk: *mut Block = self.queue.ccache.into_inner(); + + /* check if the block is fully consumed */ + let consumed: u64 = (*blk).consumed.load(Relaxed); + let consumed_idx: u64 = wsq_local_idx(consumed); + + if consumed_idx < NE as u64 { + /* check if we have an entry 
to occupy */ + let committed: u64 = (*blk).committed.load(Relaxed); + let committed_idx: u64 = wsq_local_idx(committed); + if consumed_idx == committed_idx { + return None; + } + + /* we got the entry */ + let t = + std::ptr::read((*(*blk).entries.as_mut_ptr())[consumed_idx as usize].get()); + (*blk).consumed.store(consumed + 1, Relaxed); + return Some(t); + } + + /* r_head never pass the w_head and r_tail */ + let nblk: *mut Block = (*blk).conf.next; + let next_cons_vsn: u64 = wsq_local_vsn(consumed) + (*nblk).conf.beginning as u64; + let next_steal_vsn: u64 = wsq_local_vsn((*nblk).reserved.load(Relaxed)); + if next_steal_vsn != next_cons_vsn { + return None; + } + + /* stop stealers */ + let reserved_new: u64 = wsq_local_compose(next_cons_vsn, NE as u64); + let reserved_old: u64 = (*nblk).reserved.swap(reserved_new, Relaxed); + + /* pre-steal reserved */ + let reserved_idx: u64 = wsq_local_idx(reserved_old); + let pre_stealed: u64 = max(0, NE as u64 - reserved_idx); + (*nblk).stealed.fetch_add(pre_stealed, Relaxed); + + /* advance the block and try again */ + let new_cursor: u64 = wsq_local_compose(next_cons_vsn, reserved_idx); + (*nblk).consumed.store(new_cursor, Relaxed); + let q: *mut BwsQueue = Arc::as_ptr(&self.queue) as *mut _; + (*q).ccache = CachePadded::new(nblk); + } + } + } +} + +pub struct Stealer { + queue: Arc>, +} + +impl Clone for Stealer { + fn clone(&self) -> Self { + Stealer { + queue: self.queue.clone(), + } + } +} + +impl Stealer { + #[inline(always)] + pub fn steal(&mut self) -> Option { + unsafe { + loop { + /* get the address of the steal block */ + let spos: u64 = self.queue.spos.load(Relaxed); + let bidx: usize = wsq_global_idx(spos) as usize; + let blk: *mut Block = self.queue.blocks[bidx].get(); + + /* check if the block is fully reserved */ + let reserved: u64 = (*blk).reserved.load(Acquire); + let reserved_idx: u64 = wsq_local_idx(reserved); + + if reserved_idx < NE as u64 { + /* check if we have an entry to occupy */ + let committed: u64 = (*blk).committed.load(Acquire); + let committed_idx: u64 = wsq_local_idx(committed); + if reserved_idx == committed_idx { + return None; + } + + if !(*blk) + .reserved + .compare_exchange_weak(reserved, reserved + 1, Release, Relaxed) + .is_ok() + { + return None; + } + + /* we got the entry */ + let t = + std::ptr::read((*(*blk).entries.as_mut_ptr())[reserved_idx as usize].get()); + (*blk).stealed.fetch_add(1, Release); + return Some(t); + } + + /* r_head never pass the w_head and r_tail */ + let nblk: *mut Block = (*blk).conf.next; + let next_except_vsn: u64 = wsq_local_vsn(reserved) + (*nblk).conf.beginning as u64; + let next_actual_vsn: u64 = wsq_local_vsn((*nblk).reserved.load(Relaxed)); + if next_except_vsn != next_actual_vsn { + return None; + } + + /* reset cursor and advance block */ + advance(&self.queue.spos, spos); + } + } + } +} + +pub fn new() -> (Producer, Consumer, Stealer) { + let qa = Arc::new(BwsQueue::::new()); + + let mut blk_start: *mut Block = null_mut(); + let mut blk_pre: *mut Block = null_mut(); + let mut blk: *mut Block; + + for idx in 0..NB { + blk = qa.blocks[idx].get(); + if blk_start.is_null() { + blk_start = blk; + } else { + unsafe { + (*blk_pre).conf.next = blk; + (*blk).conf.prev = blk_pre; + } + } + blk_pre = blk; + if idx == NB - 1 { + unsafe { + (*blk).conf.next = blk_start; + (*blk_start).conf.prev = blk; + } + } + } + unsafe { + let q: *mut BwsQueue = Arc::as_ptr(&qa) as *mut _; + (*q).pcache = CachePadded::new(blk_start); + (*q).ccache = CachePadded::new(blk_start); + } + + let qb 
= qa.clone(); + let qc = qa.clone(); + ( + Producer { queue: qa }, + Consumer { queue: qb }, + Stealer { queue: qc }, + ) +} diff --git a/bwosqueue/benches/support/original_tokio_queue.rs b/bwosqueue/benches/support/original_tokio_queue.rs new file mode 100644 index 00000000000..404e8858e78 --- /dev/null +++ b/bwosqueue/benches/support/original_tokio_queue.rs @@ -0,0 +1,623 @@ +// This is the original tokio queue, modified slightly to use const generics for a configurable queue size +#![allow(dead_code)] + +//! Run-queue structures to support a work-stealing scheduler + +use loom::atomic_u16::AtomicU16; +use loom::atomic_u32::AtomicU32; +use loom::unsafe_cell::UnsafeCell; +use std::sync::Arc; + +use std::mem::MaybeUninit; +use std::ptr; +use std::sync::atomic::Ordering::{AcqRel, Acquire, Release}; + +mod loom; + +/// Producer handle. May only be used from a single thread. +pub(crate) struct Local { + inner: Arc>, +} + +/// Consumer handle. May be used from many threads. +pub(crate) struct Steal(Arc>); + +struct Inner { + /// Concurrently updated by many threads. + /// + /// Contains two `u16` values. The LSB byte is the "real" head of the queue. + /// The `u16` in the MSB is set by a stealer in process of stealing values. + /// It represents the first value being stolen in the batch. `u16` is used + /// in order to distinguish between `head == tail` and `head == tail - + /// capacity`. + /// + /// When both `u16` values are the same, there is no active stealer. + /// + /// Tracking an in-progress stealer prevents a wrapping scenario. + head: AtomicU32, + + /// Only updated by producer thread but read by many threads. + tail: AtomicU16, + + /// Elements + buffer: Box<[UnsafeCell>; N]>, +} + +unsafe impl Send for Inner {} +unsafe impl Sync for Inner {} + +fn make_mask(queue_size: usize) -> usize { + assert!(queue_size.is_power_of_two()); + queue_size - 1 +} + +// Constructing the fixed size array directly is very awkward. The only way to +// do it is to repeat `UnsafeCell::new(MaybeUninit::uninit())` 256 times, as +// the contents are not Copy. The trick with defining a const doesn't work for +// generic types. +fn make_fixed_size(buffer: Box<[T]>) -> Box<[T; N]> { + assert_eq!(buffer.len(), N); + + // safety: We check that the length is correct. + unsafe { Box::from_raw(Box::into_raw(buffer).cast()) } +} + +/// Create a new local run-queue +pub(crate) fn local() -> (Steal, Local) { + let mut buffer = Vec::with_capacity(N); + + for _ in 0..N { + buffer.push(UnsafeCell::new(MaybeUninit::uninit())); + } + + let inner = Arc::new(Inner { + head: AtomicU32::new(0), + tail: AtomicU16::new(0), + buffer: make_fixed_size(buffer.into_boxed_slice()), + }); + + let local = Local { + inner: inner.clone(), + }; + + let remote = Steal(inner); + + (remote, local) +} + +impl Local { + /// Returns true if the queue has entries that can be stolen. + pub(crate) fn is_stealable(&self) -> bool { + !self.inner.is_empty() + } + + /// BwoS bench: Exposed this check from steal_into2 as public to use in the benchmark. + pub(crate) fn has_stealers(&self) -> bool { + let prev_packed = self.inner.head.load(Acquire); + let (src_head_steal, src_head_real) = unpack(prev_packed); + // If these two do not match, another thread is concurrently + // stealing from the queue. 
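+ // (`head` packs two u16 halves, see `unpack`: the low half is the real head, the
+ // high half is set while a batch steal is in flight.)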
+ src_head_steal != src_head_real + } + + /// Returns false if there are any entries in the queue + /// + /// Separate to is_stealable so that refactors of is_stealable to "protect" + /// some tasks from stealing won't affect this + pub(crate) fn has_tasks(&self) -> bool { + !self.inner.is_empty() + } + + /// Pushes a task to the back of the local queue, skipping the LIFO slot. + pub(crate) fn push_back(&mut self, task: T) -> Result<(), T> { + let tail = loop { + let head = self.inner.head.load(Acquire); + let (steal, _real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if tail.wrapping_sub(steal) < N as u16 { + // There is capacity for the task + break tail; + } else { + // Concurrently stealing, this will free up capacity, so only + // push the task onto the inject queue + //inject.push(task); + return Err(task); + } // JS: remove push_pverflow case for micro benchmark + }; + + // Map the position to a slot index. + let idx = tail as usize & make_mask(N); + + self.inner.buffer[idx].with_mut(|ptr| { + // Write the task to the slot + // + // Safety: There is only one producer and the above `if` + // condition ensures we don't touch a cell if there is a + // value, thus no consumer. + unsafe { + ptr::write((*ptr).as_mut_ptr(), task); + } + }); + + // Make the task available. Synchronizes with a load in + // `steal_into2`. + self.inner.tail.store(tail.wrapping_add(1), Release); + Ok(()) + } + + // /// Moves a batch of tasks into the inject queue. + // /// + // /// This will temporarily make some of the tasks unavailable to stealers. + // /// Once `push_overflow` is done, a notification is sent out, so if other + // /// workers "missed" some of the tasks during a steal, they will get + // /// another opportunity. + // #[inline(never)] + // fn push_overflow( + // &mut self, + // task: T, + // head: u16, + // tail: u16, + // ) -> Result<(), T> { + // /// How many elements are we taking from the local queue. + // /// + // /// This is one less than the number of tasks pushed to the inject + // /// queue as we are also inserting the `task` argument. + // const NUM_TASKS_TAKEN: u16 = (LOCAL_QUEUE_CAPACITY / 2) as u16; + + // assert_eq!( + // tail.wrapping_sub(head) as usize, + // LOCAL_QUEUE_CAPACITY, + // "queue is not full; tail = {}; head = {}", + // tail, + // head + // ); + + // let prev = pack(head, head); + + // // Claim a bunch of tasks + // // + // // We are claiming the tasks **before** reading them out of the buffer. + // // This is safe because only the **current** thread is able to push new + // // tasks. + // // + // // There isn't really any need for memory ordering... Relaxed would + // // work. This is because all tasks are pushed into the queue from the + // // current thread (or memory has been acquired if the local queue handle + // // moved). + // if self + // .inner + // .head + // .compare_exchange( + // prev, + // pack( + // head.wrapping_add(NUM_TASKS_TAKEN), + // head.wrapping_add(NUM_TASKS_TAKEN), + // ), + // Release, + // Relaxed, + // ) + // .is_err() + // { + // // We failed to claim the tasks, losing the race. Return out of + // // this function and try the full `push` routine again. The queue + // // may not be full anymore. + // return Err(task); + // } + + // /// An iterator that takes elements out of the run queue. 
+ // struct BatchTaskIter<'a, T: 'static> { + // buffer: &'a [UnsafeCell>>; LOCAL_QUEUE_CAPACITY], + // head: u32, + // i: u32, + // } + // impl<'a, T: 'static> Iterator for BatchTaskIter<'a, T> { + // type Item = task::Notified; + + // #[inline] + // fn next(&mut self) -> Option> { + // if self.i == u32::from(NUM_TASKS_TAKEN) { + // None + // } else { + // let i_idx = self.i.wrapping_add(self.head) as usize & MASK; + // let slot = &self.buffer[i_idx]; + + // // safety: Our CAS from before has assumed exclusive ownership + // // of the task pointers in this range. + // let task = slot.with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + // self.i += 1; + // Some(task) + // } + // } + // } + + // // safety: The CAS above ensures that no consumer will look at these + // // values again, and we are the only producer. + // let batch_iter = BatchTaskIter { + // buffer: &*self.inner.buffer, + // head: head as u32, + // i: 0, + // }; + // inject.push_batch(batch_iter.chain(std::iter::once(task))); + + // // Add 1 to factor in the task currently being scheduled. + // metrics.incr_overflow_count(); + + // Ok(()) + // } + + /// Pops a task from the local queue. + pub(crate) fn pop(&mut self) -> Option { + let mut head = self.inner.head.load(Acquire); + + let idx = loop { + let (steal, real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if real == tail { + // queue is empty + return None; + } + + let next_real = real.wrapping_add(1); + + // If `steal == real` there are no concurrent stealers. Both `steal` + // and `real` are updated. + let next = if steal == real { + pack(next_real, next_real) + } else { + assert_ne!(steal, next_real); + pack(steal, next_real) + }; + + // Attempt to claim a task. + let res = self + .inner + .head + .compare_exchange(head, next, AcqRel, Acquire); + + match res { + Ok(_) => break real as usize & make_mask(N), + Err(actual) => head = actual, + } + }; + + Some(self.inner.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) + } +} + +impl Steal { + // BWoS bench: Taken from steal_into2 - Modified to support benchmarking + // only the steal operation without the additional enqueue into `dst`. + // Don't steal more than max_steal items, otherwise the stealer will + // steal 100% of all items and give the consumer in the benchmark no + // chance, so we can't measure consumer/stealer interference + pub fn bench_tokio_q_steal(&self, max_steal: u16) -> u16 { + let mut prev_packed = self.0.head.load(Acquire); + let mut next_packed; + + let n = loop { + let (src_head_steal, src_head_real) = unpack(prev_packed); + let src_tail = self.0.tail.load(Acquire); + + // If these two do not match, another thread is concurrently + // stealing from the queue. + if src_head_steal != src_head_real { + return 0; + } + + // Number of available tasks to steal + let n = src_tail.wrapping_sub(src_head_real); + // Bench BWoS steal at most + let n = core::cmp::min(max_steal, n - n / 2); + + if n == 0 { + // No tasks available to steal + return 0; + } + + // Update the real head index to acquire the tasks. + let steal_to = src_head_real.wrapping_add(n); + assert_ne!(src_head_steal, steal_to); + next_packed = pack(src_head_steal, steal_to); + + // Claim all those tasks. This is done by incrementing the "real" + // head but not the steal. By doing this, no other thread is able to + // steal from this queue until the current thread completes. 
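+ // The CAS below only advances the `real` half of the packed head; the unchanged
+ // `steal` half marks the in-flight steal until the final loop re-packs both halves.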
+ let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => break n, + Err(actual) => prev_packed = actual, + } + }; + + assert!(n <= N as u16 / 2, "actual = {}", n); + + let (first, _) = unpack(next_packed); + + // Take all the tasks + for i in 0..n { + // Compute the positions + let src_pos = first.wrapping_add(i); + + // Map to slots + let src_idx = src_pos as usize & make_mask(N); + + // Read the task + // + // safety: We acquired the task with the atomic exchange above. + let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + // Use the queue entry so the compiler does not optimize the read away. + assert_eq!(task, 5); + } + + let mut prev_packed = next_packed; + + // Update `src_head_steal` to match `src_head_real` signalling that the + // stealing routine is complete. + loop { + let head = unpack(prev_packed).1; + next_packed = pack(head, head); + + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => return n, + Err(actual) => { + let (actual_steal, actual_real) = unpack(actual); + + assert_ne!(actual_steal, actual_real); + + prev_packed = actual; + } + } + } + } +} + +impl Steal { + // BWoS bench + // Based on Local::pop, with modified memory ordering. + pub(crate) fn bench_tokio_steal_single(&self) -> Option { + let mut head = self.0.head.load(Acquire); + + let idx = loop { + let (steal, real) = unpack(head); + + let tail = self.0.tail.load(Acquire); + + if real == tail { + // queue is empty + return None; + } + + let next_real = real.wrapping_add(1); + + // If `steal == real` there are no concurrent stealers. Both `steal` + // and `real` are updated. + let next = if steal == real { + pack(next_real, next_real) + } else { + assert_ne!(steal, next_real); + pack(steal, next_real) + }; + + // Attempt to claim a task. + let res = self.0.head.compare_exchange(head, next, AcqRel, Acquire); + + match res { + Ok(_) => break real as usize & make_mask(N), + Err(actual) => head = actual, + } + }; + + Some(self.0.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) + } + + pub(crate) fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Steals half the tasks from self and place them into `dst`. + pub(crate) fn steal_into(&self, dst: &mut Local) -> Option { + // Safety: the caller is the only thread that mutates `dst.tail` and + // holds a mutable reference. + let dst_tail = unsafe { dst.inner.tail.unsync_load() }; + + // To the caller, `dst` may **look** empty but still have values + // contained in the buffer. If another thread is concurrently stealing + // from `dst` there may not be enough capacity to steal. + let (steal, _) = unpack(dst.inner.head.load(Acquire)); + + if dst_tail.wrapping_sub(steal) > N as u16 / 2 { + // we *could* try to steal less here, but for simplicity, we're just + // going to abort. + return None; + } + + // Steal the tasks into `dst`'s buffer. This does not yet expose the + // tasks in `dst`. + let mut n = self.steal_into2(dst, dst_tail); + + if n == 0 { + // No tasks were stolen + return None; + } + + // We are returning a task here + n -= 1; + + let ret_pos = dst_tail.wrapping_add(n); + let ret_idx = ret_pos as usize & make_mask(N); + + // safety: the value was written as part of `steal_into2` and not + // exposed to stealers, so no other thread can access it. 
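+ // The last stolen task is returned directly to the caller rather than being made
+ // visible in `dst`, which is why `n` was decremented above.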
+ let ret = dst.inner.buffer[ret_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + if n == 0 { + // The `dst` queue is empty, but a single task was stolen + return Some(ret); + } + + // Make the stolen items available to consumers + dst.inner.tail.store(dst_tail.wrapping_add(n), Release); + + Some(ret) + } + + // Steal tasks from `self`, placing them into `dst`. Returns the number of + // tasks that were stolen. + fn steal_into2(&self, dst: &mut Local, dst_tail: u16) -> u16 { + let mut prev_packed = self.0.head.load(Acquire); + let mut next_packed; + + let n = loop { + let (src_head_steal, src_head_real) = unpack(prev_packed); + let src_tail = self.0.tail.load(Acquire); + + // If these two do not match, another thread is concurrently + // stealing from the queue. + if src_head_steal != src_head_real { + return 0; + } + + // Number of available tasks to steal + let n = src_tail.wrapping_sub(src_head_real); + let n = n - n / 2; + + if n == 0 { + // No tasks available to steal + return 0; + } + + // Update the real head index to acquire the tasks. + let steal_to = src_head_real.wrapping_add(n); + assert_ne!(src_head_steal, steal_to); + next_packed = pack(src_head_steal, steal_to); + + // Claim all those tasks. This is done by incrementing the "real" + // head but not the steal. By doing this, no other thread is able to + // steal from this queue until the current thread completes. + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => break n, + Err(actual) => prev_packed = actual, + } + }; + + assert!(n <= N as u16 / 2, "actual = {}", n); + + let (first, _) = unpack(next_packed); + + // Take all the tasks + for i in 0..n { + // Compute the positions + let src_pos = first.wrapping_add(i); + let dst_pos = dst_tail.wrapping_add(i); + + // Map to slots + let src_idx = src_pos as usize & make_mask(N); + let dst_idx = dst_pos as usize & make_mask(N); + + // Read the task + // + // safety: We acquired the task with the atomic exchange above. + let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + // Write the task to the new slot + // + // safety: `dst` queue is empty and we are the only producer to + // this queue. + dst.inner.buffer[dst_idx] + .with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); + } + + let mut prev_packed = next_packed; + + // Update `src_head_steal` to match `src_head_real` signalling that the + // stealing routine is complete. + loop { + let head = unpack(prev_packed).1; + next_packed = pack(head, head); + + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => return n, + Err(actual) => { + let (actual_steal, actual_real) = unpack(actual); + + assert_ne!(actual_steal, actual_real); + + prev_packed = actual; + } + } + } + } +} + +impl Clone for Steal { + fn clone(&self) -> Steal { + Steal(self.0.clone()) + } +} + +impl Drop for Local { + fn drop(&mut self) { + if !std::thread::panicking() { + assert!(self.pop().is_none(), "queue not empty"); + } + } +} + +impl Inner { + fn len(&self) -> u16 { + let (_, head) = unpack(self.head.load(Acquire)); + let tail = self.tail.load(Acquire); + + tail.wrapping_sub(head) + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// Split the head value into the real head and the index a stealer is working +/// on. 
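+/// For example, `unpack(0x0005_0003)` yields `(steal, real) = (5, 3)`, and
+/// `pack(5, 3)` recomposes the same `0x0005_0003` value.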
+fn unpack(n: u32) -> (u16, u16) { + let real = n & u16::MAX as u32; + let steal = n >> 16; + + (steal as u16, real as u16) +} + +/// Join the two head values +fn pack(steal: u16, real: u16) -> u32 { + (real as u32) | ((steal as u32) << 16) +} + +#[test] +fn test_local_queue_capacity() { + assert!(LOCAL_QUEUE_CAPACITY - 1 <= u8::MAX as usize); +} diff --git a/bwosqueue/src/bwos_queue.rs b/bwosqueue/src/bwos_queue.rs new file mode 100644 index 00000000000..65b8fc8d7bb --- /dev/null +++ b/bwosqueue/src/bwos_queue.rs @@ -0,0 +1,204 @@ +use super::metadata::AtomicIndexAndVersion; +use crate::loom::{cell::UnsafeCell, sync::Arc}; +use array_init::array_init; +use core::{marker::PhantomPinned, mem::MaybeUninit, pin::Pin, ptr::null}; +use crossbeam_utils::CachePadded; + +#[cfg(feature = "stats")] +mod bwsstats { + use crate::loom::sync::atomic::{AtomicU64, Ordering::Relaxed}; + use crossbeam_utils::CachePadded; + + pub(crate) struct BwsStats { + owner_counter: CachePadded, + total_stolen: CachePadded, + } + + impl BwsStats { + pub(crate) const fn new() -> Self { + Self { + owner_counter: CachePadded::new(AtomicU64::new(0)), + total_stolen: CachePadded::new(AtomicU64::new(0)), + } + } + + #[inline] + pub(crate) fn increment_enqueued(&self, rhs: usize) { + let curr = self.owner_counter.load(Relaxed); + let new = curr.wrapping_add(rhs as u64); + self.owner_counter.store(new, Relaxed); + } + #[inline] + pub(crate) fn increment_dequeued(&self, rhs: usize) { + let curr = self.owner_counter.load(Relaxed); + let new = curr.wrapping_sub(rhs as u64); + self.owner_counter.store(new, Relaxed); + } + + #[inline] + pub(crate) fn increment_stolen(&self, rhs: usize) { + self.total_stolen.fetch_add(rhs as u64, Relaxed); + } + + /// Returns the _estimated_ number of currently enqueued items. + /// + /// Assumes a maximum of usize items in the queue. + /// + /// Todo: assumes that we enqueue no more than u64::MAX items. This may not be acceptable for tokio. + #[inline] + pub(crate) fn curr_enqueued(&self) -> usize { + let owner_cnt = self.owner_counter.load(Relaxed); + let total_stolen = self.total_stolen.load(Relaxed); + + // We assume the `u64` total numbers will never overflow. + let num = owner_cnt.saturating_sub(total_stolen); + // The maximum queue size is usize::MAX, so this conversion is safe (with the assumption that the u64 + // counters don't overflow) + num as usize + } + } +} + +#[cfg(feature = "stats")] +pub(crate) use bwsstats::*; + +pub(crate) struct BwsQueue { + pub(crate) blocks: CachePadded<[Block; NUM_BLOCKS]>, + #[cfg(feature = "stats")] + pub(crate) stats: CachePadded, + _pin: PhantomPinned, +} + +pub(crate) struct Block { + /// The index and version of the next writable entry in the block + /// + /// index == NE signals that the producer has already fully written this block. + /// `committed` is only written to by the single producer ([Owner](super::Owner)). + pub(crate) committed: CachePadded>, + /// The index and version of the next readable entry in the block + /// + /// If consumed == committed, then there are not items that can be read in this block. + /// `consumed` is only written by the single consumer ([Owner](super::Owner)). + pub(crate) consumed: CachePadded>, + /// stealer-head - We ensure that consumer and stealer are never on same block + pub(crate) reserved: CachePadded>, + /// stealer-tail - stealing finished + pub(crate) stolen: CachePadded>, + /// Block specific configuration, including a reference to the next block in the bwosqueue. 
+ conf: CachePadded>, + /// The storage for all entries in this block + pub(crate) entries: CachePadded<[UnsafeCell>; NE]>, +} + +struct BlockConfig { + /// true if this Block is the HEAD of the queue. + beginning: bool, + /// Blocks are linked together as a linked list via the `next` pointer to speed up accessing + /// the next block. The pointer is fixed, but needs to be initialized after the Block has + /// been put behind a shared reference in pinned memory, since we can't directly initialize + /// and pin memory on the heap. + next: UnsafeCell<*const Block>, +} + +impl BlockConfig { + fn new(idx: usize) -> BlockConfig { + BlockConfig { + beginning: idx == 0, + next: UnsafeCell::new(null()), + } + } +} + +impl Block { + fn new(idx: usize) -> Block { + let is_queue_head = idx == 0; + Block { + committed: CachePadded::new(AtomicIndexAndVersion::new_owner(is_queue_head)), + consumed: CachePadded::new(AtomicIndexAndVersion::new_owner(is_queue_head)), + reserved: CachePadded::new(AtomicIndexAndVersion::new_stealer(is_queue_head)), + stolen: CachePadded::new(AtomicIndexAndVersion::new_stealer(is_queue_head)), + conf: CachePadded::new(BlockConfig::new(idx)), + entries: CachePadded::new(array_init(|_| UnsafeCell::new(MaybeUninit::uninit()))), + } + } + + /// Returns the next Block in the BWoS queue + #[inline(always)] + pub(crate) fn next(&self) -> *const Self { + // SAFETY: The next pointer is static and valid after initialization of the queue for + // the whole lifetime of the queue. + unsafe { self.conf.next.with(|next| *next) } + } + + /// true if this block is the head of the BWoS queue + #[inline(always)] + pub(crate) fn is_head(&self) -> bool { + self.conf.beginning + } +} + +impl + BwsQueue +{ + const _ASSERT_NUM_BLOCKS_POW2: () = assert!(NUM_BLOCKS.is_power_of_two()); + const _ASSERT_NUM_GREATER_1: () = assert!(NUM_BLOCKS > 1); + + pub(crate) fn new() -> Pin> { + // We need to "use" the assertions here, otherwise the compile-time assertions are ignored. + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NUM_BLOCKS_POW2; + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NUM_GREATER_1; + + // First create and pin the queue on the heap + let q = Arc::pin(BwsQueue { + blocks: CachePadded::new(array_init(|idx| Block::new(idx))), + #[cfg(feature = "stats")] + stats: CachePadded::new(BwsStats::new()), + _pin: PhantomPinned, + }); + // Now initialize the fast-path pointers + let blocks: &[Block; NUM_BLOCKS] = &q.blocks; + for block_window in blocks.windows(2) { + // Note: This cannot panic since we asserted at compile-time that BwsQueue has at least + // 2 blocks + let curr_block = block_window.get(0).expect("INVALID_NUM_BLOCKS"); + let next_block = block_window.get(1).expect("INVALID_NUM_BLOCKS"); + // SAFETY: Since our array of blocks is already behind an `Arc` and `Pin`ned we can't + // initialize the pointers with safe code, but we do know that at this point in time + // no concurrent mutable access is possible, since there are no other references. + unsafe { + curr_block.conf.next.with_mut(|next_ptr| { + (*next_ptr) = next_block; + }); + } + } + + let first_block = blocks.first().expect("INVALID_NUM_BLOCKS"); + let last_block = blocks.last().expect("INVALID_NUM_BLOCKS"); + + // SAFETY: There are no other active references to the curr and next block and no + // concurrent access is possible here. 
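+ // Close the ring: the last block's `next` pointer wraps around to the first block.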
+ unsafe { + last_block.conf.next.with_mut(|next_ptr| { + (*next_ptr) = first_block; + }); + } + // Now all fields in the Queue are initialized correctly + q + } + + /// The estimated number of elements currently enqueued. + /// + /// Items which are currently being stolen do not count towards the length, + /// so this method is not suited to determine if the queue is full. + #[cfg(feature = "stats")] + pub(crate) fn estimated_len(&self) -> usize { + self.stats.curr_enqueued() + } + + #[cfg(feature = "stats")] + pub(crate) fn is_empty(&self) -> bool { + self.estimated_len() == 0 + } +} diff --git a/bwosqueue/src/lib.rs b/bwosqueue/src/lib.rs new file mode 100644 index 00000000000..fd803ce686c --- /dev/null +++ b/bwosqueue/src/lib.rs @@ -0,0 +1,790 @@ +//! The BWoS queue is a fast block-based work stealing queue for parallel processing. +//! +//! The BWoS queue is based on the [BBQ] (Block-based Bounded Queue) and is specially designed for the +//! workstealing scenario. Based on the real-world observation that the "stealing" operation is +//! rare and most of the operations are local enqueues and dequeues this queue implementation +//! offers a single [Owner] which can enqueue and dequeue without any heavy synchronization mechanisms +//! on the fast path. Concurrent stealing is possible and does not slow done the Owner too much. +//! This allows stealing policies which steal single items or in small batches. +//! +//! # Queue Semantics +//! +//! - The block-based design reduces the synchronization requirements on the fast-path +//! inside a block and moves the heavy synchronization operations necessary to support +//! multiple stealers to the slow-path when transitioning to the next block. +//! - The producer (enqueue) may not advance to the next block if the consumer or a stealer +//! is still operating on that block. This allows the producer to remove producer-consumer/stealer +//! synchronization from its fast-path operations, but reduces the queue capacity by +//! at most one block. +//! - Stealers may not steal from the same block as the consumer. This allows the consumer +//! to remove consumer-stealer synchronization from its fast-path operations, but means +//! one block is not available for stealing. +//! - Consumers may "take-over" the next block preventing stealers from stealing in that +//! block after the take-over. Stealers will still proceed with already in-progress steal +//! operations in this block. +//! - This queue implementation puts the producer and consumer into a shared Owner struct, +//! +//! # Examples +//! +//! todo +//! +//! [BBQ]: https://www.usenix.org/conference/atc22/presentation/wang-jiawei +//! +//! # Todo: +//! - Instead of const generics we could use a boxed slice for a dynamically sized array. +//! The performance impact be benchmarked though, since this will result in multiple operations +//! not being able to be calculated at compile-time anymore. + +#![deny(unsafe_op_in_unsafe_fn)] +#![warn(unreachable_pub)] + +use core::{ + marker::{Send, Sync}, + pin::Pin, +}; +use crossbeam_utils::CachePadded; +use std::fmt::Formatter; +use std::mem::MaybeUninit; + +mod bwos_queue; +mod loom; +mod metadata; + +use crate::loom::cell::UnsafeCell; +use crate::loom::sync::atomic::{ + AtomicUsize, + Ordering::{Acquire, Relaxed, Release}, +}; +use crate::loom::sync::Arc; +use bwos_queue::{Block, BwsQueue}; +use metadata::{Index, IndexAndVersion}; + +/// The Owner interface to the BWoS queue +/// +/// The owner is both the single producer and single consumer. 
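+///
+/// A minimal usage sketch (illustrative only; it assumes the `bwosqueue::new::<E, NUM_BLOCKS,
+/// ENTRIES_PER_BLOCK>` constructor used by the benchmarks):
+///
+/// ```ignore
+/// // 8 blocks with 32 entries each, storing u64 items.
+/// let (mut owner, stealer) = bwosqueue::new::<u64, 8, 32>();
+/// owner.enqueue(5).expect("queue has free capacity");
+/// assert_eq!(owner.dequeue(), Some(5));
+/// // Stealers take items concurrently, either one at a time or per block.
+/// assert_eq!(stealer.steal(), None); // nothing left to steal here
+/// ```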
+#[repr(align(128))] +pub struct Owner { + /// Producer cache (single producer)- points to block in self.queue. + pcache: CachePadded<*const Block>, + /// Consumer cache (single consumer) - points to block in self.queue. + ccache: CachePadded<*const Block>, + /// Stealer position cache - Allows the owner to quickly check if there are any stealers + spos: CachePadded>, + /// `Arc` to the actual queue to ensure the queue lives at least as long as the Owner. + #[allow(dead_code)] + queue: Pin>>, +} + +/// A Stealer interface to the BWoS queue +/// +/// There may be multiple stealers. Stealers share the stealer position which is used to quickly look up +/// the next block for attempted stealing. +#[repr(align(128))] +pub struct Stealer { + /// The actual stealer position is `self.spos % NUM_BLOCKS`. The position is incremented beyond + /// `NUM_BLOCKS` to detect ABA problems. + spos: CachePadded>, + queue: Pin>>, +} + +/// An iterator over elements of one Block. +/// +/// The iterator borrows all elements up to `committed` to allows batched +/// operations on the elements. When the iterator is dropped the entries +/// are marked as consumed in one atomic operation. +pub struct BlockIter<'a, E, const ENTRIES_PER_BLOCK: usize> { + buffer: &'a [UnsafeCell>; ENTRIES_PER_BLOCK], + /// Index if the next to be consumed entry in the buffer. + i: usize, + /// Number of committed entries in the buffer. + committed: usize, +} + +/// An iterator over elements of one Block of a stealer +/// +/// Marks the stolen entries as stolen once the iterator has been consumed. +pub struct StealerBlockIter<'a, E, const ENTRIES_PER_BLOCK: usize> { + /// Stealer Block + stealer_block: &'a Block, + /// Remember how many entries where reserved for the Drop implementation + num_reserved: usize, + /// reserved index of the block. We own the entries from `i..block_reserved` + block_reserved: usize, + /// curr index in the block + i: usize, +} + +unsafe impl Send + for Owner +{ +} + +// todo: is this really needed? +unsafe impl Sync + for Owner +{ +} + +unsafe impl Send + for Stealer +{ +} + +unsafe impl Sync + for Stealer +{ +} + +impl + Owner +{ + /// Try to enqueue `t` into the FIFO queue. + /// + /// If the queue is full, `Err(t)` is returned to the caller. + #[inline(always)] + pub fn enqueue(&mut self, t: E) -> Result<(), E> { + loop { + // SAFETY: `pcache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. + let blk = unsafe { &**self.pcache }; + + // Load the index of the next free queue entry for the producer. `committed` is only written to by the + // single producer, so `Relaxed` reading is fine. + let committed = blk.committed.load(Relaxed); + let committed_idx = committed.raw_index(); + + // Fastpath (the block is not full): Due to the slowpath checks we know that the entire remaining block + // is available to the producer and do not need to check the consumed index in the fastpath. + if let Some(entry_cell) = blk.entries.get(committed_idx) { + // SAFETY: We checked the entry is available for writing and the index can be + // post-incremented unconditionally since `index == NE` is valid and means the block + // is full. + let committed_new = unsafe { + entry_cell.with_mut(|uninit_entry| uninit_entry.write(MaybeUninit::new(t))); + committed.index_add_unchecked(1) + }; + // Synchronizes with `Acquire` ordering on the stealer side. 
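+                // (The matching `Acquire` loads are the stealers' reads of `committed` in
+                // `steal()` and `steal_block()`, which guarantees the entry written above is
+                // visible to any stealer that observes the new index.)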
+ blk.committed.store(committed_new, Release); + #[cfg(feature = "stats")] + self.queue.stats.increment_enqueued(1); + return Ok(()); + } + + /* slow path, move to the next block */ + let nblk = unsafe { &*blk.next() }; + let next = committed.next_version(nblk.is_head()); + + /* check if next block is ready */ + if !self.is_next_block_writable(nblk, next.version()) { + return Err(t); + }; + + /* reset cursor and advance block */ + nblk.committed.store(next, Relaxed); + nblk.stolen.store(next, Relaxed); + // Ensures the writes to `committed` and `stolen` are visible when `reserved` is loaded. + nblk.reserved.store(next, Release); + *self.pcache = nblk; + } + } + + pub unsafe fn enqueue_batch_unchecked( + &mut self, + mut iter: Box + '_>, + ) -> usize { + let mut count = 0; + loop { + // SAFETY: `pcache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. + let blk = unsafe { &**self.pcache }; + + // Load the index of the next free queue entry for the producer. `committed` is only written to by the + // single producer, so `Relaxed` reading is fine. + let committed = blk.committed.load(Relaxed); + let mut new_committed = committed; + + while new_committed.raw_index() < ENTRIES_PER_BLOCK { + // Fastpath (the block is not full): Due to the slowpath checks we know that the entire remaining block + // is available to the producer and do not need to check the consumed index in the + // fastpath. + let entry_opt = iter.next(); + if let Some(entry) = entry_opt { + blk.entries[new_committed.raw_index()].with_mut(|uninit_entry| unsafe { + uninit_entry.write(MaybeUninit::new(entry)) + }); + new_committed = unsafe { new_committed.index_add_unchecked(1) }; + count += 1; + } else { + blk.committed.store(new_committed, Release); + #[cfg(feature = "stats")] + self.queue.stats.increment_enqueued(count); + return count; + } + } + /* slow path, move to the next block */ + let nblk = unsafe { &*blk.next() }; + let next = new_committed.next_version(nblk.is_head()); + + // The caller promises they already confirmed the next block is ready, so we only + // debug assert. + debug_assert!( + self.is_next_block_writable(nblk, next.version()), + "Precondition of unchecked enqueue function violated." + ); + + /* reset cursor and advance block */ + nblk.committed.store(next, Relaxed); + nblk.stolen.store(next, Relaxed); + // The changes to `committed` and `stolen` must be visible when reserved is changed. + nblk.reserved.store(next, Release); + *self.pcache = nblk; + } + } + /// true if the next block is ready for the producer to start writing. + fn is_next_block_writable( + &self, + next_blk: &Block, + next_block_version: usize, + ) -> bool { + let expected_version = next_block_version.wrapping_sub(1); + let consumed = next_blk.consumed.load(Relaxed); + let is_consumed = consumed.index().is_full() && expected_version == consumed.version(); + + // The next block must be already _fully_ consumed, since we do not want to checked the `consumed` index + // in the enqueue fastpath! + if !is_consumed { + return false; + } + // The producer must wait until the next block has no active stealers. + let stolen = next_blk.stolen.load(Acquire); + if !stolen.index().is_full() || stolen.version() != expected_version { + return false; + } + true + } +} + +impl + Owner +{ + /// Try to dequeue the oldest element in the queue. 
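+    ///
+    /// Returns `None` if there is nothing to dequeue; this can also happen while the queue
+    /// still holds entries that are currently reserved by stealers (see
+    /// `tests/blocked_stealer.rs`).
+    ///
+    /// Illustrative sketch of the FIFO behaviour (block geometry is arbitrary):
+    ///
+    /// ```ignore
+    /// let (mut owner, _stealer) = bwosqueue::new::<u32, 4, 4>();
+    /// owner.enqueue(1).unwrap();
+    /// owner.enqueue(2).unwrap();
+    /// assert_eq!(owner.dequeue(), Some(1));
+    /// assert_eq!(owner.dequeue(), Some(2));
+    /// assert_eq!(owner.dequeue(), None);
+    /// ```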
+ #[inline(always)] + pub fn dequeue(&mut self) -> Option { + let (blk, consumed) = self.get_consumer_block()?; + + // We trust that the correct index is passed to us here. + let entry_cell = &blk.entries[consumed.raw_index()]; + // SAFETY: We know there is an entry to dequeue, so we know the entry is a valid initialized `E`. + let item = unsafe { entry_cell.with(|entry| entry.read().assume_init()) }; + // SAFETY: We already checked that `consumed_idx < ENTRIES_PER_BLOCK`. + let new_consumed = unsafe { consumed.index_add_unchecked(1) }; + blk.consumed.store(new_consumed, Relaxed); + #[cfg(feature = "stats")] + self.queue.stats.increment_dequeued(1); + return Some(item); + } + + /// Try to dequeue all remaining committed entries in the current block. + pub fn dequeue_block(&mut self) -> Option> { + let (blk, consumed) = self.get_consumer_block()?; + + let committed = blk.committed.load(Relaxed); + + // We are claiming the tasks **before** reading them out of the buffer. + // This is safe because only the **current** thread is able to push new + // tasks. + // + // There isn't really any need for memory ordering... Relaxed would + // work. This is because all tasks are pushed into the queue from the + // current thread (or memory has been acquired if the local queue handle + // moved). + blk.consumed.store(committed, Relaxed); + + return Some(BlockIter { + buffer: &blk.entries, + i: consumed.raw_index(), + committed: committed.raw_index(), + }); + } + + // returns true on success, false when advancing not possible. + fn try_advance_consumer_block( + &mut self, + next_block: &Block, + curr_consumed: IndexAndVersion, + ) -> bool { + let next_cons_vsn = curr_consumed + .version() + .wrapping_add(next_block.is_head() as usize); + + // The reserved field is updated last in `enqueue()`. It is only updated by the producer + // (`Owner`), so `Relaxed` is sufficient. If the actual reserved version is not equal to the + // expected next consumer version, then the producer has not advanced to the next block yet + // and we must wait. + let next_reserved_vsn = next_block.reserved.load(Relaxed).version(); + if next_reserved_vsn != next_cons_vsn { + debug_assert!(next_reserved_vsn == next_cons_vsn.wrapping_sub(1)); + return false; + } + + /* stop stealers */ + let reserved_new = IndexAndVersion::new(next_cons_vsn, Index::full()); + // todo: Why can this be Relaxed? + let reserved_old = next_block.reserved.swap(reserved_new, Relaxed); + debug_assert_eq!(reserved_old.version(), next_cons_vsn); + let reserved_old_idx = reserved_old.raw_index(); + + // Number of entries that can't be stolen anymore because we stopped stealing. + let num_consumer_owned = ENTRIES_PER_BLOCK.saturating_sub(reserved_old_idx); + // Increase `stolen`, by the number of entries that can't be stolen anymore and are now up to the + // consumer to deqeuue. This ensures that, once the stealers have finished stealing the already reserved + // entries, `nblk.stolen == ENTRIES_PER_BLOCK` holds, i.e. this block is marked as having no active + // stealers, which will allow the producer to the enter this block again (in the next round). + next_block.stolen.fetch_add(num_consumer_owned, Relaxed); + + /* advance the block and try again */ + // The consumer must skip already reserved entries. + next_block.consumed.store(reserved_old, Relaxed); + *self.ccache = next_block; + true + } + + /// Advance consumer to the next block, unless the producer has not reached the block yet. 
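+    ///
+    /// Unlike `try_advance_consumer_block`, this check has no side effects: it only inspects
+    /// the next block's `reserved` version and neither stops stealers nor moves `ccache`.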
+ fn can_advance_consumer_block( + &self, + next_block: &Block, + curr_consumed: IndexAndVersion, + ) -> bool { + let next_cons_vsn = curr_consumed + .version() + .wrapping_add(next_block.is_head() as usize); + // The reserved field is updated last in `enqueue()`. It is only updated by the producer + // (`Owner`), so `Relaxed` is sufficient. If the actual reserved version is not equal to the + // expected next consumer version, then the producer has not advanced to the next block yet + // and we must wait. + let next_reserved_vsn = next_block.reserved.load(Relaxed).version(); + if next_reserved_vsn != next_cons_vsn { + debug_assert!(next_reserved_vsn == next_cons_vsn.wrapping_sub(1)); + return false; + } + true + } + + // /// Advance consumer to the next block, unless the producer has not reached the block yet. + // fn try_advance_consumer_block( + // &mut self, + // next_block: &Block, + // curr_consumed: IndexAndVersion, + // ) -> Result<(), ()> { + // if self.can_advance_consumer_block(next_block, curr_consumed) { + // *self.ccache = next_block; + // Ok(()) + // } else { + // Err(()) + // } + // } + + /// Todo: Ideally we would not have this function. + pub fn has_stealers(&self) -> bool { + let curr_spos = self.spos.load(Relaxed); + // spos increments beyond NUM_BLOCKS to prevent ABA problems. + let start_block_idx = curr_spos % NUM_BLOCKS; + for i in 0..NUM_BLOCKS { + let block_idx = (start_block_idx + i) % NUM_BLOCKS; + let blk: &Block = &self.queue.blocks[block_idx]; + let stolen = blk.stolen.load(Relaxed); + let reserved = blk.reserved.load(Relaxed); + if reserved != stolen { + return true; + } else if !reserved.index().is_full() { + return false; + } + } + false + } + + /// Check if there is a block available for stealing in the queue. + /// + /// Note that stealing may still fail for a number of reasons even if this function returned true + /// Todo: the overhead could be reduced, if we allow this function to return false in some + /// cases when the queue size is low. + #[cfg(feature = "stats")] + pub fn has_stealable_block(&self) -> bool { + let n = self.queue.stats.curr_enqueued(); + // SAFETY: self.ccache always points to a valid Block. + let committed_idx = unsafe { (**self.ccache).committed.load(Relaxed).raw_index() }; + // SAFETY: self.ccache always points to a valid Block. + let consumed_idx = unsafe { (**self.ccache).consumed.load(Relaxed).raw_index() }; + // true if there are more items enqueued in total than enqueued in the current block. + n > (committed_idx - consumed_idx) + } + + /// Returns `true` if enqueuing one block of entries would succeed. + pub fn can_enqueue_block(&self) -> bool { + // Note: the current implementation of this function is overly conservative but fast. + let current_block = unsafe { &*(**self.pcache).next() }; + let committed = current_block.committed.load(Relaxed); + if committed.index().is_empty() { + true + } else { + self.is_next_block_writable(current_block, committed.version()) + } + } + + /// `true` if there is at least one entry that can be dequeued. + /// + /// It is possible that a dequeue can still fail, since the item was stolen after we checked + /// and before the consumer advanced to the block in question. + pub fn can_consume(&self) -> bool { + // SAFETY: `ccache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. 
+ let current_blk_cache = unsafe { &**self.ccache }; + let mut blk = current_blk_cache; + for _ in 0..NUM_BLOCKS + 1 { + // check if the block is fully consumed already + let consumed = blk.consumed.load(Relaxed); + let consumed_idx = consumed.raw_index(); + + // Fastpath (Block is not fully consumed yet) + if consumed_idx < ENTRIES_PER_BLOCK { + // we know the block is not full, but we must first check if there is an entry to + // dequeue. + let committed_idx = blk.committed.load(Relaxed).raw_index(); + if consumed_idx == committed_idx { + return false; + } + + /* There is an entry to dequeue */ + return true; + } + + /* Slow-path */ + + /* Consumer head may never pass the Producer head and Consumer/Stealer tail */ + let nblk = unsafe { &*blk.next() }; + if self.can_advance_consumer_block(nblk, consumed) { + blk = nblk; + } else { + return false; + } + /* We advanced to the next block - loop around and try again */ + } + // Since there is no concurrent enqueuing and the buffer is bounded, we should reach + // one of the exit conditions in at most NUM_BLOCKS iterations. + unreachable!() + } + + fn get_consumer_block( + &mut self, + ) -> Option<( + &Block, + IndexAndVersion, + )> { + // SAFETY: `ccache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. + let current_blk_cache = unsafe { &**self.ccache }; + let mut blk = current_blk_cache; + // The +1 is necessary to advance again to our original starting block, this time with a + // new version. This can happen in the edge-case that all items in the queue where stolen. + for _ in 0..NUM_BLOCKS + 1 { + // check if the block is fully consumed already + let consumed = blk.consumed.load(Relaxed); + let consumed_idx = consumed.raw_index(); + + // Fastpath (Block is not fully consumed yet) + if consumed_idx < ENTRIES_PER_BLOCK { + // we know the block is not full, but we must first check if there is an entry to + // dequeue. + let committed_idx = blk.committed.load(Relaxed).raw_index(); + if consumed_idx == committed_idx { + return None; + } + + /* There is an entry to dequeue */ + return Some((blk, consumed)); + } + + /* Slow-path */ + + /* Consumer head may never pass the Producer head and Consumer/Stealer tail */ + let nblk = unsafe { &*blk.next() }; + if self.try_advance_consumer_block(nblk, consumed) { + blk = nblk; + } else { + return None; + } + /* We advanced to the next block - loop around and try again */ + } + // Since there is no concurrent enqueuing and the buffer is bounded, we should reach + // one of the exit conditions in at most NUM_BLOCKS+1 iterations. + unreachable!() + } +} + +impl Clone + for Stealer +{ + fn clone(&self) -> Self { + Self { + spos: self.spos.clone(), + queue: self.queue.clone(), + } + } +} + +impl + Stealer +{ + /// Try to steal a single item from the queue + #[inline] + pub fn steal(&self) -> Option { + loop { + let (blk, curr_spos) = self.curr_block(); + + /* check if the block is fully reserved */ + let reserved = blk.reserved.load(Acquire); + let reserved_idx = reserved.raw_index(); + + if reserved_idx < ENTRIES_PER_BLOCK { + /* check if we have an entry to occupy */ + let committed = blk.committed.load(Acquire); + let committed_idx = committed.raw_index(); + if reserved_idx == committed_idx { + return None; + } + // SAFETY: We checked before that `reserved_idx` < ENTRIES_PER_BLOCK, so the index + // can't overflow. 
+ let new_reserved = unsafe { reserved.index_add_unchecked(1) }; + let reserve_res = + blk.reserved + .compare_exchange_weak(reserved, new_reserved, Release, Relaxed); + if reserve_res.is_err() { + return None; + } + + /* we got the entry */ + + #[cfg(feature = "stats")] + self.queue.stats.increment_stolen(1); + + // SAFETY: We know the entry is a valid and initialized `E` and is now exclusively owned by us. + let t = + unsafe { blk.entries[reserved_idx].with(|entry| entry.read().assume_init()) }; + // `t` is now owned by us so we mark the stealing as finished. Synchronizes with the Owner Acquire. + let old_stolen = blk.stolen.fetch_add(1, Release); + debug_assert!(old_stolen.raw_index() < ENTRIES_PER_BLOCK); + return Some(t); + } + + // Slow-path: The current block is already fully reserved. Try to advance to the next block + if !self.can_advance(blk, reserved) { + return None; + } + self.try_advance_spos(curr_spos); + } + } + + /// Get the current stealer `Block` and the corresponding stealer position (`spos`) + /// + /// The returned `spos` can be larger than `NUM_BLOCKS` to detect [ABA](https://en.wikipedia.org/wiki/ABA_problem) + /// situations. + fn curr_block(&self) -> (&Block, usize) { + let curr_spos = self.spos.load(Relaxed); + // spos increments beyond NUM_BLOCKS to prevent ABA problems. + let block_idx = curr_spos % NUM_BLOCKS; + let blk: &Block = &self.queue.blocks[block_idx]; + (blk, curr_spos) + } + + /// Try to steal a block from `self`. + /// + /// Tries to steal a full block from `self`. If the block is not fully + /// committed yet it will steal up to and including the last committed entry + /// of that block. + #[inline] + pub fn steal_block(&self) -> Option> { + loop { + let (blk, curr_spos) = self.curr_block(); + + /* check if the block is fully reserved */ + let reserved = blk.reserved.load(Acquire); + let reserved_idx = reserved.raw_index(); + + if reserved_idx < ENTRIES_PER_BLOCK { + /* check if we have an entry to occupy */ + let committed = blk.committed.load(Acquire); + let committed_idx = committed.raw_index(); + if reserved_idx == committed_idx { + return None; + } + + // Try to steal the block up to the latest committed entry + let reserve_res = blk + .reserved + .compare_exchange_weak(reserved, committed, Release, Relaxed); + + if reserve_res.is_err() { + return None; + } + + let num_reserved = committed_idx - reserved_idx; + // From the statistics perspective we consider the reserved range to already be + // stolen, since it is not available for the consumer or other stealers anymore. + #[cfg(feature = "stats")] + self.queue.stats.increment_stolen(num_reserved); + return Some(StealerBlockIter { + stealer_block: blk, + block_reserved: committed_idx, + i: reserved_idx, + num_reserved, + }); + } + + // Slow-path: The current block is already fully reserved. Try to advance to next block + if !self.can_advance(blk, reserved) { + return None; + } + self.try_advance_spos(curr_spos); + } + } + + /// True if the stealer can advance to the next block + fn can_advance( + &self, + curr_block: &Block, + curr_reserved: IndexAndVersion, + ) -> bool { + /* r_head never pass the w_head and r_tail */ + let nblk = unsafe { &*curr_block.next() }; + let next_expect_vsn = curr_reserved.version() + nblk.is_head() as usize; + let next_actual_vsn = nblk.reserved.load(Relaxed).version(); + next_expect_vsn == next_actual_vsn + } + + /// Try and advance `spos` to the next block. + /// + /// We are not interested in the failure case, since the next stealer can just try again. 
+ fn try_advance_spos(&self, curr_spos: usize) { + // Ignore result. Failure means a different stealer succeeded in updating + // the stealer block index. In case of a sporadic failure the next stealer will try again. + let _ = + self.spos + .compare_exchange_weak(curr_spos, curr_spos.wrapping_add(1), Relaxed, Relaxed); + } + + /// The estimated number of entries currently enqueued. + #[cfg(feature = "stats")] + pub fn estimated_queue_entries(&self) -> usize { + self.queue.estimated_len() + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> Iterator for BlockIter<'a, E, ENTRIES_PER_BLOCK> { + type Item = E; + + #[inline] + fn next(&mut self) -> Option { + let i = self.i; + self.i += 1; + if i < self.committed { + self.buffer.get(i).map(|entry_cell| { + entry_cell.with(|entry| { + // SAFETY: we claimed the entries + unsafe { entry.read().assume_init() } + }) + }) + } else { + None + } + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> Iterator + for StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> +{ + type Item = E; + + #[inline] + fn next(&mut self) -> Option { + if self.i < self.block_reserved { + let entry = self.stealer_block.entries[self.i].with(|entry| { + // SAFETY: we claimed the entries + unsafe { entry.read().assume_init() } + }); + self.i += 1; + Some(entry) + } else { + None + } + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> Drop for StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> { + fn drop(&mut self) { + // Ensure `Drop` is called on any items that where not consumed, by consuming the iterator, + // which implicitly dequeues all items + while self.next().is_some() {} + self.stealer_block + .stolen + .fetch_add(self.num_reserved, Release); + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> { + pub fn len(&self) -> usize { + self.block_reserved - self.i + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> core::fmt::Debug + for StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> +{ + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!( + "StealerBlockIter over {} entries", + self.block_reserved - self.i + )) + } +} + +/// Create a new BWoS queue and return the [Owner] and a [Stealer] instance +/// +/// `NUM_BLOCKS` must be a power two and at least 2. `ENTRIES_PER_BLOCK` can be freely chosen (non-zero). +/// The total length of the queue is `NUM_BLOCKS * ENTRIES_PER_BLOCK` and must not be more than `usize::MAX`. +/// +/// ## Performance considerations +/// +/// The Owner throughput will improve with a larger `ENTRIES_PER_BLOCK` value. +/// Thieves however will prefer a higher `NUM_BLOCKS` count since it makes it easier to +/// steal a whole block. 
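+///
+/// For example (a sketch; assumes the generic order `new::<E, NUM_BLOCKS, ENTRIES_PER_BLOCK>`,
+/// with concrete sizes chosen arbitrarily):
+///
+/// ```ignore
+/// // Owner-friendly geometry: few, large blocks.
+/// let (owner_a, stealer_a) = bwosqueue::new::<u64, 8, 1024>();
+/// // Stealer-friendly geometry: many, small blocks.
+/// let (owner_b, stealer_b) = bwosqueue::new::<u64, 64, 16>();
+/// ```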
+pub fn new() -> ( + Owner, + Stealer, +) { + assert!(NUM_BLOCKS.checked_mul(ENTRIES_PER_BLOCK).is_some()); + assert!(NUM_BLOCKS.is_power_of_two()); + assert!(NUM_BLOCKS >= 1); + assert!(ENTRIES_PER_BLOCK >= 1); + + let q: Pin>> = BwsQueue::new(); + let first_block = &q.blocks[0]; + + let stealer_position = Arc::new(AtomicUsize::new(0)); + + ( + Owner { + pcache: CachePadded::new(first_block), + ccache: CachePadded::new(first_block), + spos: CachePadded::new(stealer_position.clone()), + queue: q.clone(), + }, + Stealer { + spos: CachePadded::new(stealer_position), + queue: q, + }, + ) +} diff --git a/bwosqueue/src/loom/mocked.rs b/bwosqueue/src/loom/mocked.rs new file mode 100644 index 00000000000..367d59b43a4 --- /dev/null +++ b/bwosqueue/src/loom/mocked.rs @@ -0,0 +1,40 @@ +pub(crate) use loom::*; + +pub(crate) mod sync { + + pub(crate) use loom::sync::MutexGuard; + + #[derive(Debug)] + pub(crate) struct Mutex(loom::sync::Mutex); + + #[allow(dead_code)] + impl Mutex { + #[inline] + pub(crate) fn new(t: T) -> Mutex { + Mutex(loom::sync::Mutex::new(t)) + } + + #[inline] + pub(crate) fn lock(&self) -> MutexGuard<'_, T> { + self.0.lock().unwrap() + } + + #[inline] + pub(crate) fn try_lock(&self) -> Option> { + self.0.try_lock().ok() + } + } + pub(crate) use loom::sync::*; +} + +pub(crate) mod rand { + pub(crate) fn seed() -> u64 { + 1 + } +} + +pub(crate) mod sys { + pub(crate) fn num_cpus() -> usize { + 2 + } +} diff --git a/bwosqueue/src/loom/mod.rs b/bwosqueue/src/loom/mod.rs new file mode 100644 index 00000000000..7925aa630b5 --- /dev/null +++ b/bwosqueue/src/loom/mod.rs @@ -0,0 +1,16 @@ +//! This module abstracts over `loom` and `std::sync` depending on whether we +//! are running tests or not. +//! This module is directly copied from tokio. Everything in this module is subject to the same license as tokio. + +#![allow(unused)] +#![allow(unsafe_op_in_unsafe_fn)] + +#[cfg(not(loom))] +mod std; +#[cfg(not(loom))] +pub(crate) use self::std::*; + +#[cfg(loom)] +mod mocked; +#[cfg(loom)] +pub(crate) use self::mocked::*; diff --git a/bwosqueue/src/loom/std/atomic_ptr.rs b/bwosqueue/src/loom/std/atomic_ptr.rs new file mode 100644 index 00000000000..236645f037b --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_ptr.rs @@ -0,0 +1,34 @@ +use std::fmt; +use std::ops::{Deref, DerefMut}; + +/// `AtomicPtr` providing an additional `load_unsync` function. +pub(crate) struct AtomicPtr { + inner: std::sync::atomic::AtomicPtr, +} + +impl AtomicPtr { + pub(crate) fn new(ptr: *mut T) -> AtomicPtr { + let inner = std::sync::atomic::AtomicPtr::new(ptr); + AtomicPtr { inner } + } +} + +impl Deref for AtomicPtr { + type Target = std::sync::atomic::AtomicPtr; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for AtomicPtr { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +impl fmt::Debug for AtomicPtr { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_u16.rs b/bwosqueue/src/loom/std/atomic_u16.rs new file mode 100644 index 00000000000..c1c531208c2 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u16.rs @@ -0,0 +1,44 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU16` providing an additional `load_unsync` function. 
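+///
+/// Illustrative sketch (the `Deref` impl below forwards to the standard atomic API):
+///
+/// ```ignore
+/// let a = AtomicU16::new(3);
+/// a.store(4, std::sync::atomic::Ordering::Relaxed);
+/// assert_eq!(a.load(std::sync::atomic::Ordering::Relaxed), 4);
+/// ```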
+pub(crate) struct AtomicU16 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU16 {} +unsafe impl Sync for AtomicU16 {} + +impl AtomicU16 { + pub(crate) const fn new(val: u16) -> AtomicU16 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU16::new(val)); + AtomicU16 { inner } + } + + /// Performs an unsynchronized load. + /// + /// # Safety + /// + /// All mutations must have happened before the unsynchronized load. + /// Additionally, there must be no concurrent mutations. + pub(crate) unsafe fn unsync_load(&self) -> u16 { + *(*self.inner.get()).get_mut() + } +} + +impl Deref for AtomicU16 { + type Target = std::sync::atomic::AtomicU16; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU16 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_u32.rs b/bwosqueue/src/loom/std/atomic_u32.rs new file mode 100644 index 00000000000..61f95fb30ce --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u32.rs @@ -0,0 +1,34 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU32` providing an additional `load_unsync` function. +pub(crate) struct AtomicU32 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU32 {} +unsafe impl Sync for AtomicU32 {} + +impl AtomicU32 { + pub(crate) const fn new(val: u32) -> AtomicU32 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU32::new(val)); + AtomicU32 { inner } + } +} + +impl Deref for AtomicU32 { + type Target = std::sync::atomic::AtomicU32; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU32 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_u64.rs b/bwosqueue/src/loom/std/atomic_u64.rs new file mode 100644 index 00000000000..d7a656f2e24 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u64.rs @@ -0,0 +1,86 @@ +//! Implementation of an atomic u64 cell. On 64 bit platforms, this is a +//! re-export of `AtomicU64`. On 32 bit platforms, this is implemented using a +//! `Mutex`. + +// `AtomicU64` can only be used on targets with `target_has_atomic` is 64 or greater. +// Once `cfg_target_has_atomic` feature is stable, we can replace it with +// `#[cfg(target_has_atomic = "64")]`. +// Refs: https://github.com/rust-lang/rust/tree/master/src/librustc_target +//cfg_has_atomic_u64! { +pub(crate) use std::sync::atomic::AtomicU64; +//} +// +// cfg_not_has_atomic_u64! 
{ +// use crate::loom::sync::Mutex; +// use std::sync::atomic::Ordering; +// +// #[derive(Debug)] +// pub(crate) struct AtomicU64 { +// inner: Mutex, +// } +// +// impl AtomicU64 { +// pub(crate) fn new(val: u64) -> Self { +// Self { +// inner: Mutex::new(val), +// } +// } +// +// pub(crate) fn load(&self, _: Ordering) -> u64 { +// *self.inner.lock() +// } +// +// pub(crate) fn store(&self, val: u64, _: Ordering) { +// *self.inner.lock() = val; +// } +// +// pub(crate) fn fetch_add(&self, val: u64, _: Ordering) -> u64 { +// let mut lock = self.inner.lock(); +// let prev = *lock; +// *lock = prev + val; +// prev +// } +// +// pub(crate) fn fetch_or(&self, val: u64, _: Ordering) -> u64 { +// let mut lock = self.inner.lock(); +// let prev = *lock; +// *lock = prev | val; +// prev +// } +// +// pub(crate) fn compare_exchange( +// &self, +// current: u64, +// new: u64, +// _success: Ordering, +// _failure: Ordering, +// ) -> Result { +// let mut lock = self.inner.lock(); +// +// if *lock == current { +// *lock = new; +// Ok(current) +// } else { +// Err(*lock) +// } +// } +// +// pub(crate) fn compare_exchange_weak( +// &self, +// current: u64, +// new: u64, +// success: Ordering, +// failure: Ordering, +// ) -> Result { +// self.compare_exchange(current, new, success, failure) +// } +// } +// +// impl Default for AtomicU64 { +// fn default() -> AtomicU64 { +// Self { +// inner: Mutex::new(0), +// } +// } +// } +// } diff --git a/bwosqueue/src/loom/std/atomic_u8.rs b/bwosqueue/src/loom/std/atomic_u8.rs new file mode 100644 index 00000000000..408aea338c6 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u8.rs @@ -0,0 +1,34 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU8` providing an additional `load_unsync` function. +pub(crate) struct AtomicU8 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU8 {} +unsafe impl Sync for AtomicU8 {} + +impl AtomicU8 { + pub(crate) const fn new(val: u8) -> AtomicU8 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU8::new(val)); + AtomicU8 { inner } + } +} + +impl Deref for AtomicU8 { + type Target = std::sync::atomic::AtomicU8; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU8 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_usize.rs b/bwosqueue/src/loom/std/atomic_usize.rs new file mode 100644 index 00000000000..0d5f36e4310 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_usize.rs @@ -0,0 +1,56 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops; + +/// `AtomicUsize` providing an additional `load_unsync` function. +pub(crate) struct AtomicUsize { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicUsize {} +unsafe impl Sync for AtomicUsize {} + +impl AtomicUsize { + pub(crate) const fn new(val: usize) -> AtomicUsize { + let inner = UnsafeCell::new(std::sync::atomic::AtomicUsize::new(val)); + AtomicUsize { inner } + } + + /// Performs an unsynchronized load. + /// + /// # Safety + /// + /// All mutations must have happened before the unsynchronized load. + /// Additionally, there must be no concurrent mutations. 
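+    ///
+    /// Illustrative sketch (hypothetical caller holding exclusive `&mut` access):
+    ///
+    /// ```ignore
+    /// let mut counter = AtomicUsize::new(0);
+    /// counter.with_mut(|v| *v = 7);
+    /// // Sound: `&mut counter` rules out concurrent mutation.
+    /// assert_eq!(unsafe { counter.unsync_load() }, 7);
+    /// ```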
+ pub(crate) unsafe fn unsync_load(&self) -> usize { + *(*self.inner.get()).get_mut() + } + + pub(crate) fn with_mut(&mut self, f: impl FnOnce(&mut usize) -> R) -> R { + // safety: we have mutable access + f(unsafe { (*self.inner.get()).get_mut() }) + } +} + +impl ops::Deref for AtomicUsize { + type Target = std::sync::atomic::AtomicUsize; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl ops::DerefMut for AtomicUsize { + fn deref_mut(&mut self) -> &mut Self::Target { + // safety: we hold `&mut self` + unsafe { &mut *self.inner.get() } + } +} + +impl fmt::Debug for AtomicUsize { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + (**self).fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/mod.rs b/bwosqueue/src/loom/std/mod.rs new file mode 100644 index 00000000000..0c70bee74eb --- /dev/null +++ b/bwosqueue/src/loom/std/mod.rs @@ -0,0 +1,108 @@ +#![cfg_attr(any(not(feature = "full"), loom), allow(unused_imports, dead_code))] + +mod atomic_ptr; +mod atomic_u16; +mod atomic_u32; +mod atomic_u64; +mod atomic_u8; +mod atomic_usize; +mod mutex; +#[cfg(feature = "parking_lot")] +mod parking_lot; +mod unsafe_cell; + +pub(crate) mod cell { + pub(crate) use super::unsafe_cell::UnsafeCell; +} + +#[cfg(any( + feature = "net", + feature = "process", + feature = "signal", + feature = "sync", +))] +pub(crate) mod future { + pub(crate) use crate::sync::AtomicWaker; +} + +pub(crate) mod hint { + pub(crate) use std::hint::spin_loop; +} + +pub(crate) mod rand { + use std::collections::hash_map::RandomState; + use std::hash::{BuildHasher, Hash, Hasher}; + use std::sync::atomic::AtomicU32; + use std::sync::atomic::Ordering::Relaxed; + + static COUNTER: AtomicU32 = AtomicU32::new(1); + + pub(crate) fn seed() -> u64 { + let rand_state = RandomState::new(); + + let mut hasher = rand_state.build_hasher(); + + // Hash some unique-ish data to generate some new state + COUNTER.fetch_add(1, Relaxed).hash(&mut hasher); + + // Get the seed + hasher.finish() + } +} + +pub(crate) mod sync { + pub(crate) use std::sync::{Arc, Weak}; + + // Below, make sure all the feature-influenced types are exported for + // internal use. Note however that some are not _currently_ named by + // consuming code. 
+ + #[cfg(feature = "parking_lot")] + #[allow(unused_imports)] + pub(crate) use crate::loom::std::parking_lot::{ + Condvar, Mutex, MutexGuard, RwLock, RwLockReadGuard, WaitTimeoutResult, + }; + + #[cfg(not(feature = "parking_lot"))] + #[allow(unused_imports)] + pub(crate) use std::sync::{Condvar, MutexGuard, RwLock, RwLockReadGuard, WaitTimeoutResult}; + + #[cfg(not(feature = "parking_lot"))] + pub(crate) use crate::loom::std::mutex::Mutex; + + pub(crate) mod atomic { + pub(crate) use crate::loom::std::atomic_ptr::AtomicPtr; + pub(crate) use crate::loom::std::atomic_u16::AtomicU16; + pub(crate) use crate::loom::std::atomic_u32::AtomicU32; + pub(crate) use crate::loom::std::atomic_u64::AtomicU64; + pub(crate) use crate::loom::std::atomic_u8::AtomicU8; + pub(crate) use crate::loom::std::atomic_usize::AtomicUsize; + + pub(crate) use std::sync::atomic::{fence, AtomicBool, Ordering}; + } +} + +pub(crate) mod sys { + #[cfg(feature = "rt-multi-thread")] + pub(crate) fn num_cpus() -> usize { + usize::max(1, num_cpus::get()) + } + + #[cfg(not(feature = "rt-multi-thread"))] + pub(crate) fn num_cpus() -> usize { + 1 + } +} + +pub(crate) mod thread { + #[inline] + pub(crate) fn yield_now() { + std::hint::spin_loop(); + } + + #[allow(unused_imports)] + pub(crate) use std::thread::{ + current, panicking, park, park_timeout, sleep, spawn, Builder, JoinHandle, LocalKey, + Result, Thread, ThreadId, + }; +} diff --git a/bwosqueue/src/loom/std/mutex.rs b/bwosqueue/src/loom/std/mutex.rs new file mode 100644 index 00000000000..3f686e0a78e --- /dev/null +++ b/bwosqueue/src/loom/std/mutex.rs @@ -0,0 +1,31 @@ +use std::sync::{self, MutexGuard, TryLockError}; + +/// Adapter for `std::Mutex` that removes the poisoning aspects +/// from its api. +#[derive(Debug)] +pub(crate) struct Mutex(sync::Mutex); + +#[allow(dead_code)] +impl Mutex { + #[inline] + pub(crate) fn new(t: T) -> Mutex { + Mutex(sync::Mutex::new(t)) + } + + #[inline] + pub(crate) fn lock(&self) -> MutexGuard<'_, T> { + match self.0.lock() { + Ok(guard) => guard, + Err(p_err) => p_err.into_inner(), + } + } + + #[inline] + pub(crate) fn try_lock(&self) -> Option> { + match self.0.try_lock() { + Ok(guard) => Some(guard), + Err(TryLockError::Poisoned(p_err)) => Some(p_err.into_inner()), + Err(TryLockError::WouldBlock) => None, + } + } +} diff --git a/bwosqueue/src/loom/std/parking_lot.rs b/bwosqueue/src/loom/std/parking_lot.rs new file mode 100644 index 00000000000..e3af258d116 --- /dev/null +++ b/bwosqueue/src/loom/std/parking_lot.rs @@ -0,0 +1,184 @@ +//! A minimal adaption of the `parking_lot` synchronization primitives to the +//! equivalent `std::sync` types. +//! +//! This can be extended to additional types/methods as required. + +use std::fmt; +use std::marker::PhantomData; +use std::ops::{Deref, DerefMut}; +use std::sync::LockResult; +use std::time::Duration; + +// All types in this file are marked with PhantomData to ensure that +// parking_lot's send_guard feature does not leak through and affect when Tokio +// types are Send. +// +// See for more info. 
+ +// Types that do not need wrapping +pub(crate) use parking_lot::WaitTimeoutResult; + +#[derive(Debug)] +pub(crate) struct Mutex(PhantomData>, parking_lot::Mutex); + +#[derive(Debug)] +pub(crate) struct RwLock(PhantomData>, parking_lot::RwLock); + +#[derive(Debug)] +pub(crate) struct Condvar(PhantomData, parking_lot::Condvar); + +#[derive(Debug)] +pub(crate) struct MutexGuard<'a, T: ?Sized>( + PhantomData>, + parking_lot::MutexGuard<'a, T>, +); + +#[derive(Debug)] +pub(crate) struct RwLockReadGuard<'a, T: ?Sized>( + PhantomData>, + parking_lot::RwLockReadGuard<'a, T>, +); + +#[derive(Debug)] +pub(crate) struct RwLockWriteGuard<'a, T: ?Sized>( + PhantomData>, + parking_lot::RwLockWriteGuard<'a, T>, +); + +impl Mutex { + #[inline] + pub(crate) fn new(t: T) -> Mutex { + Mutex(PhantomData, parking_lot::Mutex::new(t)) + } + + #[inline] + #[cfg(all(feature = "parking_lot", not(all(loom, test))))] + #[cfg_attr(docsrs, doc(cfg(all(feature = "parking_lot",))))] + pub(crate) const fn const_new(t: T) -> Mutex { + Mutex(PhantomData, parking_lot::const_mutex(t)) + } + + #[inline] + pub(crate) fn lock(&self) -> MutexGuard<'_, T> { + MutexGuard(PhantomData, self.1.lock()) + } + + #[inline] + pub(crate) fn try_lock(&self) -> Option> { + self.1 + .try_lock() + .map(|guard| MutexGuard(PhantomData, guard)) + } + + #[inline] + pub(crate) fn get_mut(&mut self) -> &mut T { + self.1.get_mut() + } + + // Note: Additional methods `is_poisoned` and `into_inner`, can be + // provided here as needed. +} + +impl<'a, T: ?Sized> Deref for MutexGuard<'a, T> { + type Target = T; + fn deref(&self) -> &T { + self.1.deref() + } +} + +impl<'a, T: ?Sized> DerefMut for MutexGuard<'a, T> { + fn deref_mut(&mut self) -> &mut T { + self.1.deref_mut() + } +} + +impl RwLock { + pub(crate) fn new(t: T) -> RwLock { + RwLock(PhantomData, parking_lot::RwLock::new(t)) + } + + pub(crate) fn read(&self) -> LockResult> { + Ok(RwLockReadGuard(PhantomData, self.1.read())) + } + + pub(crate) fn write(&self) -> LockResult> { + Ok(RwLockWriteGuard(PhantomData, self.1.write())) + } +} + +impl<'a, T: ?Sized> Deref for RwLockReadGuard<'a, T> { + type Target = T; + fn deref(&self) -> &T { + self.1.deref() + } +} + +impl<'a, T: ?Sized> Deref for RwLockWriteGuard<'a, T> { + type Target = T; + fn deref(&self) -> &T { + self.1.deref() + } +} + +impl<'a, T: ?Sized> DerefMut for RwLockWriteGuard<'a, T> { + fn deref_mut(&mut self) -> &mut T { + self.1.deref_mut() + } +} + +impl Condvar { + #[inline] + pub(crate) fn new() -> Condvar { + Condvar(PhantomData, parking_lot::Condvar::new()) + } + + #[inline] + pub(crate) fn notify_one(&self) { + self.1.notify_one(); + } + + #[inline] + pub(crate) fn notify_all(&self) { + self.1.notify_all(); + } + + #[inline] + pub(crate) fn wait<'a, T>( + &self, + mut guard: MutexGuard<'a, T>, + ) -> LockResult> { + self.1.wait(&mut guard.1); + Ok(guard) + } + + #[inline] + pub(crate) fn wait_timeout<'a, T>( + &self, + mut guard: MutexGuard<'a, T>, + timeout: Duration, + ) -> LockResult<(MutexGuard<'a, T>, WaitTimeoutResult)> { + let wtr = self.1.wait_for(&mut guard.1, timeout); + Ok((guard, wtr)) + } + + // Note: Additional methods `wait_timeout_ms`, `wait_timeout_until`, + // `wait_until` can be provided here as needed. 
+} + +impl<'a, T: ?Sized + fmt::Display> fmt::Display for MutexGuard<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.1, f) + } +} + +impl<'a, T: ?Sized + fmt::Display> fmt::Display for RwLockReadGuard<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.1, f) + } +} + +impl<'a, T: ?Sized + fmt::Display> fmt::Display for RwLockWriteGuard<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.1, f) + } +} diff --git a/bwosqueue/src/loom/std/unsafe_cell.rs b/bwosqueue/src/loom/std/unsafe_cell.rs new file mode 100644 index 00000000000..66c1d7943e0 --- /dev/null +++ b/bwosqueue/src/loom/std/unsafe_cell.rs @@ -0,0 +1,16 @@ +#[derive(Debug)] +pub(crate) struct UnsafeCell(std::cell::UnsafeCell); + +impl UnsafeCell { + pub(crate) const fn new(data: T) -> UnsafeCell { + UnsafeCell(std::cell::UnsafeCell::new(data)) + } + + pub(crate) fn with(&self, f: impl FnOnce(*const T) -> R) -> R { + f(self.0.get()) + } + + pub(crate) fn with_mut(&self, f: impl FnOnce(*mut T) -> R) -> R { + f(self.0.get()) + } +} diff --git a/bwosqueue/src/metadata.rs b/bwosqueue/src/metadata.rs new file mode 100644 index 00000000000..59315bc819c --- /dev/null +++ b/bwosqueue/src/metadata.rs @@ -0,0 +1,233 @@ +//! Contains metadata for the block configuration + +use crate::loom::sync::atomic::{AtomicUsize, Ordering}; +use core::fmt::{Debug, Formatter}; + +/// A container for the current block index and block version +/// +/// `NE` is the number of elements in a block (index `0..NE`). `index == NE` marks a full block. +/// +/// Bits `0..=NE_LOG_CEIL`, where `NE_LOG_CEIL` is `(NE+1).next_power_of_two()).log2()` +/// are reserved for the index. +/// Bits `(NE_LOG_CEIL + 1)..` are used for the block version. The version field is +/// used to detect [ABA](https://en.wikipedia.org/wiki/ABA_problem) situations when accessing queue entries. +#[repr(transparent)] +#[derive(PartialEq, Eq, Copy, Clone)] +pub(crate) struct IndexAndVersion(usize); + +/// The index of the current element in the block +/// +/// 0 represents an empty block while NE represents a full block. +#[repr(transparent)] +pub(crate) struct Index(usize); + +impl Index { + /// Creates an Index for an empty block + #[inline(always)] + pub(crate) fn empty() -> Self { + Self(0) + } + + /// Creates an Index for a full block + #[inline(always)] + pub(crate) fn full() -> Self { + Self(NUM_ELEMENTS_PER_BLOCK) + } + + /// True if the block is full + #[inline(always)] + pub(crate) fn is_full(&self) -> bool { + self.0 == NUM_ELEMENTS_PER_BLOCK + } + + /// True if the block is empty + #[inline(always)] + pub(crate) fn is_empty(&self) -> bool { + self.0 == 0 + } +} + +// todo: use atomic usize after fixing overflow problem to support 32bit +#[repr(transparent)] +pub(crate) struct AtomicIndexAndVersion(AtomicUsize); + +impl IndexAndVersion<{ NE }> { + // 0 elements per block make no sense + const _ASSERT_NE_GREATER_ZERO: () = assert!(NE > 0); + const MIN_VERSION_BITS: u32 = 1; + // Subtract 1 to get the maximum number representable by that amount of bits and subtract another one to allow for + // representing the full block state (`idx == NE`). 
+ const MAX_NE: usize = 2_usize.pow(usize::BITS - Self::MIN_VERSION_BITS) - 2; + const _ASSERT_NE_MAX: () = assert!(NE <= Self::MAX_NE); + + #[inline(always)] + fn raw(&self) -> usize { + self.0 + } + + #[inline(always)] + fn max_version() -> usize { + let num_version_bits = usize::BITS - Self::ne_log() as u32; + 2_usize.pow(num_version_bits).wrapping_sub(1) + } + + /// Number of bits used for the Number of elements in a block + /// + /// Guaranteed to be at least 1. + #[inline] + fn ne_log() -> usize { + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NE_GREATER_ZERO; + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NE_MAX; + // (const) integer logarithm is not stable yet, so we need to use floating point and + // rely on the compiler to optimize this away at compile time. + ((NE + 1).next_power_of_two() as f32).log2() as usize + } + + #[inline(always)] + pub(crate) fn new(version: usize, index: Index) -> Self { + debug_assert!(version <= Self::max_version()); + + Self(version.wrapping_shl(Self::ne_log() as u32) | index.0) + } + + #[inline(always)] + fn from_raw(raw: usize) -> Self { + Self(raw) + } + + #[inline(always)] + pub(crate) fn version(&self) -> usize { + self.0.wrapping_shr(Self::ne_log() as u32) + } + + /// Increment the version by one if this is the first block and reset index + #[inline] + pub(crate) fn next_version(&self, is_first_block: bool) -> Self { + let cur_version_shifted = self.0 & Self::version_mask(); + let first_bit_pos_version = Self::ne_log() as u32; + let new_version_shifted = cur_version_shifted + .wrapping_add((is_first_block as usize).wrapping_shl(first_bit_pos_version)); + // index is now zeroed. + Self(new_version_shifted) + } + + /// A bitmask for the bits used for the block index + #[inline(always)] + fn index_mask() -> usize { + // ne_log will be at least 1, so the subtraction will never wrap around + 1_usize.wrapping_shl(Self::ne_log() as u32) - 1 + } + + #[inline(always)] + fn version_mask() -> usize { + !Self::index_mask() + } + + #[inline(always)] + pub(crate) fn index(&self) -> Index { + // We are sure that the index we stored is valid + Index(self.raw_index()) + } + + #[inline(always)] + pub(crate) fn raw_index(&self) -> usize { + self.0 & Self::index_mask() + } + + #[inline(always)] + pub(crate) fn set_full(&self) -> Self { + Self((self.0 & Self::version_mask()) | NE) + } + + /// Increment the Index by `rhs`. + /// + /// # Safety + /// + /// The caller be sure that the result of self + rhs is <= NE. + #[inline(always)] + pub(crate) unsafe fn index_add_unchecked(&self, rhs: usize) -> Self { + debug_assert!(self.raw_index() + rhs <= NE); + Self(self.0.wrapping_add(rhs)) + } +} + +impl Debug for IndexAndVersion<{ NE }> { + fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { + f.debug_struct("IndexAndVersion") + .field("Index", &self.raw_index()) + .field("Version", &self.version()) + .finish() + } +} + +impl AtomicIndexAndVersion<{ NE }> { + #[inline(always)] + pub(crate) fn load(&self, order: Ordering) -> IndexAndVersion { + let v = self.0.load(order); + IndexAndVersion::from_raw(v) + } + + /// Creates a new instance for an `Owner` field (producer or consumer + pub(crate) fn new_owner(is_queue_head: bool) -> Self { + let empty_val: IndexAndVersion = if is_queue_head { + // The first block (head) starts at version one and with an empty index + // to indicate readiness to produce/consume once values where produced. 
+ IndexAndVersion::new(1, Index::empty()) + } else { + // The remaining blocks start one version behind and are marked as fully + // produced/consumed. + IndexAndVersion::new(0, Index::full()) + }; + Self(AtomicUsize::new(empty_val.raw())) + } + + /// Creates a new instance for a `Stealer` field. The main difference to + /// [new_owner](Self::new_owner) is that the stealer is always initialized as full, + /// i.e. not ready for stealing. This is because the queue head is reserved for the + /// consumer and the stealer may not steal from the same block the consumer is on. + pub(crate) fn new_stealer(is_queue_head: bool) -> Self { + let full_val: IndexAndVersion = + IndexAndVersion::new(is_queue_head as usize, Index::full()); + Self(AtomicUsize::new(full_val.raw())) + } + + #[inline(always)] + pub(crate) fn fetch_add(&self, val: usize, order: Ordering) -> IndexAndVersion { + let old = self.0.fetch_add(val, order); + IndexAndVersion::from_raw(old) + } + + #[inline(always)] + pub(crate) fn compare_exchange_weak( + &self, + current: IndexAndVersion, + new: IndexAndVersion, + success: Ordering, + failure: Ordering, + ) -> Result, IndexAndVersion> { + self.0 + .compare_exchange_weak(current.raw(), new.raw(), success, failure) + .map_err(IndexAndVersion::from_raw) + .map(IndexAndVersion::from_raw) + } + + #[inline(always)] + pub(crate) fn store(&self, val: IndexAndVersion, order: Ordering) { + self.0.store(val.raw(), order) + } + + #[inline(always)] + pub(crate) fn swap(&self, val: IndexAndVersion, order: Ordering) -> IndexAndVersion { + let old = self.0.swap(val.raw(), order); + IndexAndVersion::from_raw(old) + } +} + +impl Debug for AtomicIndexAndVersion<{ NE }> { + fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { + let val = self.load(Ordering::SeqCst); + f.write_fmt(format_args!("{:?}", val)) + } +} diff --git a/bwosqueue/tests/blocked_stealer.rs b/bwosqueue/tests/blocked_stealer.rs new file mode 100644 index 00000000000..a806b5973ad --- /dev/null +++ b/bwosqueue/tests/blocked_stealer.rs @@ -0,0 +1,66 @@ +#![cfg(not(loom))] +// A test to check behaviour is sane when the queue is full and empty at the same time +// when waiting on a stealer. +#[test] +fn blocked_stealer() { + const NUM_BLOCKS: usize = 4; + const ENTRIES_PER_BLOCK: usize = 4; + let (mut owner, stealer) = bwosqueue::new::(); + let mut total_enqueues = 0; + for i in 0..ENTRIES_PER_BLOCK + 2 { + owner.enqueue(i).unwrap(); + total_enqueues += 1; + } + let mut stolen_iter = stealer.steal_block().unwrap(); + // We have now reserved the items in the block but not dequeued them yet. 
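+    // (With NUM_BLOCKS = 4 and ENTRIES_PER_BLOCK = 4 the stealer has reserved the two
+    // committed entries of the second block, i.e. the values ENTRIES_PER_BLOCK and
+    // ENTRIES_PER_BLOCK + 1; the first block stays with the consumer, which is why those
+    // two values later come out of `stolen_iter` below.)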
+ + let mut total_dequeued = 0; + // consume first block + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, total_dequeued); + total_dequeued += 1; + } else { + break; + } + } + + // push until full + while owner.enqueue(total_enqueues).is_ok() { + total_enqueues += 1; + } + + loop { + if let Some(val) = owner.dequeue() { + // 2 entries where already reserved by stealer + assert_eq!(val, total_dequeued + 2); + total_dequeued += 1; + } else { + break; + } + } + + // We wrapped around once and are now stuck at the end of the first block waiting on the stealer + assert_eq!(total_enqueues, (NUM_BLOCKS + 1) * ENTRIES_PER_BLOCK); + assert_eq!(total_dequeued, (NUM_BLOCKS + 1) * ENTRIES_PER_BLOCK - 2); + + assert_eq!( + ENTRIES_PER_BLOCK, + stolen_iter.next().expect("No stolen item") + ); + // Stealer is not finished yet, so consumer and producer should still be stuck + assert_eq!(owner.enqueue(42), Err(42)); + assert_eq!(owner.dequeue(), None); + // Let the stealer finish + assert_eq!( + ENTRIES_PER_BLOCK + 1, + stolen_iter.next().expect("No stolen item") + ); + assert_eq!(stolen_iter.next(), None); + // Manually drop to unstuck + drop(stolen_iter); + // Producer and Consumer are now both unstuck, but the queue is still empty. + assert_eq!(owner.dequeue(), None); + assert!(owner.enqueue(1).is_ok()); + assert_eq!(total_enqueues, total_dequeued + 2); +} diff --git a/bwosqueue/tests/loom.rs b/bwosqueue/tests/loom.rs new file mode 100644 index 00000000000..829e289c6e6 --- /dev/null +++ b/bwosqueue/tests/loom.rs @@ -0,0 +1,367 @@ +use std::sync::atomic::Ordering::{Acquire, Relaxed, Release, SeqCst}; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; +use tracing::{event, span, Level}; + +#[cfg(loom)] +use loom::{ + self, model, + thread::{self, JoinHandle}, +}; +#[cfg(not(loom))] +use std::thread::{self, JoinHandle}; + +use bwosqueue::Owner; + +#[cfg(not(loom))] +fn model(f: F) +where + F: FnOnce(), +{ + f(); +} + +type QueueOwner = bwosqueue::Owner; +type QueueStealer = bwosqueue::Stealer; + +struct Stat { + sum: u64, + buf: u64, +} + +impl Stat { + fn new() -> Self { + Self { sum: 0, buf: 1 } + } + fn put(&mut self, owner: &mut QueueOwner) { + if owner.enqueue(self.buf).is_ok() { + event!(Level::INFO, "put succeeded"); + self.sum += self.buf; + self.buf <<= 1; + } else { + event!(Level::INFO, "put failed"); + } + } + + fn get(&mut self, owner: &mut QueueOwner) -> bool { + if let Some(data) = owner.dequeue() { + event!(Level::INFO, "get succeeded"); + self.sum += data; + true + } else { + event!(Level::INFO, "get failed"); + false + } + } + + fn steal(&mut self, stealer: &QueueStealer) { + event!(Level::INFO, "attempting to steal"); + if let Some(data) = stealer.steal() { + event!(Level::INFO, "steal succeeded"); + self.sum += data; + } else { + event!(Level::INFO, "steal failed"); + } + } +} + +fn thread0( + mut q_owner: QueueOwner, + mut enq_stat: Stat, + mut deq_stat: Stat, +) -> (QueueOwner, Stat, Stat) { + let owner = &mut q_owner; + + let span = span!(Level::INFO, "Owner Put A"); + let guard = span.enter(); + for i in 0..3 { + event!(Level::INFO, put_iter = i); + enq_stat.put(owner); + } + drop(guard); + + let span = span!(Level::INFO, "Owner Get B"); + let guard = span.enter(); + for i in 0..2 { + event!(Level::INFO, get_iter = i); + deq_stat.get(owner); + } + drop(guard); + + let span = span!(Level::INFO, "Owner Put C"); + let guard = span.enter(); + for i in 0..4 { + event!(Level::INFO, put_iter = i); + enq_stat.put(owner); + } + drop(guard); + + let span = 
span!(Level::INFO, "Owner Get D"); + let guard = span.enter(); + for i in 0..3 { + event!(Level::INFO, get_iter = i); + deq_stat.get(owner); + } + drop(guard); + + let span = span!(Level::INFO, "Owner Put E"); + let guard = span.enter(); + for i in 0..3 { + event!(Level::INFO, put_iter = i); + enq_stat.put(owner); + } + drop(guard); + + for _ in 0..4 { + deq_stat.get(owner); + } + + (q_owner, enq_stat, deq_stat) +} + +fn thread1(stealer: QueueStealer, mut s1: Stat) -> Stat { + let span = span!(Level::INFO, "Stealer 1"); + let _guard = span.enter(); + s1.steal(&stealer); + event!(Level::INFO, "Steal A done"); + s1 +} + +fn thread2(stealer: QueueStealer, mut s2: Stat) -> Stat { + let span = span!(Level::INFO, "Stealer 2"); + let _guard = span.enter(); + s2.steal(&stealer); + event!(Level::INFO, "Steal B done"); + s2.steal(&stealer); + event!(Level::INFO, "Steal C done"); + s2 +} + +fn test_inner(stealers: usize) { + assert!(stealers <= 2, "We only have 2 stealers implemented"); + let explored_executions = std::sync::Arc::new(std::sync::atomic::AtomicU64::new(0)); + let l_explored_executions = explored_executions.clone(); + println!(); + model(move || { + let current_iteration = l_explored_executions.fetch_add(1, Ordering::Relaxed); + let (owner, s1): (QueueOwner, QueueStealer) = bwosqueue::new(); + let enq_stat = Stat::new(); + let deq_stat = Stat::new(); + let s1_stat = Stat::new(); + let s2_stat = Stat::new(); + + let owner_handle = thread::spawn(move || thread0(owner, enq_stat, deq_stat)); + + let mut stealer_handles = Vec::with_capacity(stealers); + if stealers > 0 { + if stealers > 1 { + let s2 = s1.clone(); + stealer_handles.push(thread::spawn(move || thread2(s2, s2_stat))); + } + stealer_handles.push(thread::spawn(move || thread1(s1, s1_stat))); + } + + let (mut owner, enq_stat, mut deq_stat) = + owner_handle.join().expect("Owner thread panicked"); + let total_stolen: u64 = stealer_handles + .into_iter() + .map(|handle| handle.join().expect("Stealer thread panicked").sum) + .sum(); + while deq_stat.get(&mut owner) {} + assert_eq!(enq_stat.sum, deq_stat.sum + total_stolen); + + if current_iteration > 0 && current_iteration % 50_000 == 0 { + println!("Explored {current_iteration} iterations"); + } + }); + println!( + "Loom model explored {} interleavings.", + explored_executions.load(SeqCst) + ); +} + +#[test] +fn no_stealer() { + test_inner(0); +} + +#[test] +fn one_stealer() { + test_inner(1); +} + +// This test will take a very long time with loom, so ignore it unless specifically requested +#[test] +#[cfg_attr(loom, ignore)] +fn two_stealers() { + test_inner(2); +} + +#[test] +fn steal_block_loom() { + model(|| { + const NB: usize = 4; + const NE: usize = 4; + let (mut owner, stealer) = bwosqueue::new::(); + // explicitly not a loom type, since this only for the test and we do not care about reorderings + let total_dequeues: std::sync::Arc = std::sync::Arc::new(AtomicU32::new(0)); + + let mut total_enq = 0; + while owner.enqueue(5).is_ok() { + total_enq += 1; + } + + let mut handles: [Option>>; NB - 1] = + array_init::array_init(|_| None); + for i in 0..NB - 1 { + let local_stealer = stealer.clone(); + let local_total_dequeues = total_dequeues.clone(); + let handle = thread::spawn(move || { + let (mut dst_owner, _) = bwosqueue::new::(); + // Any ordering of stealers and consumer is possible, so maybe the consumer consumed everything + // already and there is nothing left to steal. + // Stealing could fail sporadically due to steal_block nature, but we can't do much about that. 
+ if let Some(stolen_iter) = local_stealer.steal_block() { + let stolen_len = + unsafe { dst_owner.enqueue_batch_unchecked(Box::new(stolen_iter)) }; + assert!( + stolen_len > 0, + "Successfull steal implies at least one stolen item" + ); + loop { + if let Some(val) = dst_owner.dequeue() { + assert_eq!(val, 5); + local_total_dequeues.fetch_add(1, Relaxed); + } else { + break; + }; + } + } + + dst_owner + }); + handles[i] = Some(handle); + } + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, 5); + total_dequeues.fetch_add(1, Relaxed); + } else { + break; + }; + } + + for handle in handles { + let mut dst_queue = handle + .expect("Handle not initialized") + .join() + .expect("Join failed"); + #[cfg(feature = "stats")] + assert!(!dst_queue.can_consume()); + assert!(dst_queue.dequeue().is_none()); + assert!(dst_queue.dequeue_block().is_none()); + } + + std::sync::atomic::fence(SeqCst); + assert_eq!(total_dequeues.load(SeqCst), total_enq); + + #[cfg(feature = "stats")] + assert!(!owner.can_consume()); + }); +} + +#[test] +fn queue_loom() { + model(|| { + const NB: usize = 4; + const NE: usize = 8; + const ITERATIONS: u32 = 80; + let (mut owner, stealer) = bwosqueue::new::(); + // explicitly not `loom` types, since this only for the test and we do not care about reorderings + let total_dequeues = std::sync::Arc::new(AtomicU32::new(0)); + let finished = std::sync::Arc::new(AtomicBool::new(false)); + + let owner_total_deq = total_dequeues.clone(); + let owner_finished = finished.clone(); + let owner_thread_handle = thread::spawn(move || { + let mut total_enq: u32 = 0; + while total_enq < ITERATIONS { + while total_enq < ITERATIONS && owner.enqueue(5).is_ok() { + total_enq += 1; + } + + loop { + if let Some(res) = owner.dequeue() { + assert_eq!(res, 5); + owner_total_deq.fetch_add(1, Relaxed); + } else { + break; + } + } + } + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, 5); + owner_total_deq.fetch_add(1, Relaxed); + } else { + break; + }; + } + + #[cfg(feature = "stats")] + assert!(!owner.can_consume()); + owner_finished.store(true, Release); + total_enq + }); + + let mut handles: [Option>; 2] = array_init::array_init(|_| None); + for i in 0..2 { + let local_stealer = stealer.clone(); + let local_total_dequeues = total_dequeues.clone(); + let local_finished = finished.clone(); + + let handle = thread::spawn(move || { + let (mut dst_owner, _) = bwosqueue::new::(); + + // Any ordering of stealers and consumer is possible, so maybe the consumer consumed everything + // already and there is nothing left to steal. + // Stealing could fail sporadically due to steal_block nature, but we can't do much about that. 
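+                // Keep stealing until the owner signals completion: the `Acquire` load of
+                // `finished` pairs with the owner's `Release` store, so once it reads `true`
+                // the owner has already enqueued and drained everything still available to it.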
+ while !local_finished.load(Acquire) { + if let Some(stolen_iter) = local_stealer.steal_block() { + let num_stolen = + unsafe { dst_owner.enqueue_batch_unchecked(Box::new(stolen_iter)) }; + assert!( + num_stolen > 0, + "Successful steal implies at least one stolen item" + ); + + loop { + if let Some(val) = dst_owner.dequeue() { + assert_eq!(val, 5); + local_total_dequeues.fetch_add(1, Relaxed); + } else { + break; + }; + } + } + } + dst_owner + }); + handles[i] = Some(handle); + } + + for handle in handles { + let mut dst_queue = handle + .expect("Handle not initialized") + .join() + .expect("Join failed"); + #[cfg(feature = "stats")] + assert!(!dst_queue.can_consume()); + assert!(dst_queue.dequeue().is_none()); + assert!(dst_queue.dequeue_block().is_none()); + } + + let total_enqueued = owner_thread_handle.join().expect("Owner thread failed"); + std::sync::atomic::fence(SeqCst); + assert_eq!(total_dequeues.load(SeqCst), total_enqueued); + }); +} diff --git a/bwosqueue/tests/simple.rs b/bwosqueue/tests/simple.rs new file mode 100644 index 00000000000..d37840b9019 --- /dev/null +++ b/bwosqueue/tests/simple.rs @@ -0,0 +1,71 @@ +//! Simple, single threaded test cases + +#[cfg(not(loom))] +#[test] +fn simple_enqueue_dequeue() { + const NB: usize = 8; + const NE: usize = 1024; + let (mut owner, _) = bwosqueue::new::(); + + let mut i = 0; + while owner.enqueue(i).is_ok() { + i += 1; + } + + i = 0; + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, i); + i += 1; + } else { + break; + }; + } + + // use owner outside of iter to control drop + #[cfg(feature = "stats")] + assert!(!owner.can_consume()) +} + +#[cfg(not(loom))] +#[test] +fn steal_block() { + const NB: usize = 8; + const NE: usize = 1024; + let (mut owner, stealer) = bwosqueue::new::(); + let (mut dst_owner, _) = bwosqueue::new::(); + + let mut i = 0; + while owner.enqueue(i).is_ok() { + i += 1; + } + // steal all blocks except the consumer block + for _ in 0..NB - 1 { + let items = stealer.steal_block().unwrap(); + unsafe { dst_owner.enqueue_batch_unchecked(Box::new(items)) }; + } + + i = 0; + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, i); + i += 1; + } else { + break; + }; + } + + #[cfg(feature = "stats")] + assert!(!owner.can_consume()); + + loop { + if let Some(val) = dst_owner.dequeue() { + assert_eq!(val, i); + i += 1; + } else { + break; + }; + } + #[cfg(feature = "stats")] + assert!(!dst_owner.can_consume()); +} diff --git a/bwosqueue/tests/steal_block.rs b/bwosqueue/tests/steal_block.rs new file mode 100644 index 00000000000..90cc858ffb2 --- /dev/null +++ b/bwosqueue/tests/steal_block.rs @@ -0,0 +1,228 @@ +extern crate core; + +use bwosqueue::{Owner, Stealer}; +use core::time; +use core_affinity::CoreId; +use rand; +use rand::Rng; +use std::arch::asm; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering::{Relaxed, SeqCst}; +use std::sync::Arc; +use std::thread; + +#[derive(Copy, Clone)] +struct TestParams { + num_stealers: usize, + duration: usize, + idle_loop: usize, + push_percentage: usize, + stealer_core: Option, + steal_blocks: bool, +} + +impl Default for TestParams { + fn default() -> Self { + Self { + num_stealers: 0, + duration: 1, + idle_loop: 0, + push_percentage: 50, + stealer_core: None, + steal_blocks: false, + } + } +} +fn owner_thread( + mut owner: Owner, + push_percentage: usize, + stop_signal: Arc, +) -> (usize, usize) { + let mut counter: usize = 0; + let mut rng = rand::thread_rng(); + + let mut enqueued_count = 0; + let mut dequeued_count = 0; + + loop { + 
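+        // Randomly alternate between draining the queue and filling it so that stealers
+        // observe both nearly-empty and nearly-full states; the stop flag is only polled
+        // every 1000 iterations to keep the hot loop cheap.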
if rng.gen_range(1..=100) > push_percentage { + loop { + if let Some(data) = owner.dequeue() { + assert_eq!(data, 12345); + dequeued_count += 1; + } else { + break; + } + } + } else { + while owner.enqueue(12345).is_ok() { + enqueued_count += 1; + } + } + counter = counter.wrapping_add(1); + if counter % 1000 == 0 { + if stop_signal.load(Relaxed) { + break; + } + } + } + // dequeue until empty + loop { + match owner.dequeue() { + Some(data) => { + assert_eq!(data, 12345); + dequeued_count += 1; + } + None => { + break; + } + } + } + + (enqueued_count, dequeued_count) +} + +fn stealer_thread( + stealer: Stealer, + stealer_core: Option, + stop_signal: Arc, + stealer_work_noops: usize, +) -> usize { + if let Some(core_id) = stealer_core { + core_affinity::set_for_current(core_id); + } + let mut num_stolen = 0; + let mut counter: usize = 0; + loop { + if let Some(data) = stealer.steal() { + assert!(data > 0); + num_stolen += 1; + for _ in 0..stealer_work_noops { + unsafe { asm!("nop") } + } + } + counter = counter.wrapping_add(1); + if counter % 1000 == 0 && stop_signal.load(Relaxed) { + break; + } + } + num_stolen +} + +fn steal_block_thread( + stealer: Stealer, + stealer_core: Option, + stop_signal: Arc, + stealer_work_noops: usize, +) -> usize { + if let Some(core_id) = stealer_core { + core_affinity::set_for_current(core_id); + } + let mut num_stolen = 0; + let mut counter: usize = 0; + loop { + if let Some(mut items) = stealer.steal_block() { + assert!(items.len() > 0); + let stolen = items.len(); + num_stolen += items.len(); + for _ in 0..stealer_work_noops { + unsafe { asm!("nop") } + } + // start at one to account for `data` which was not enqueued into the local queue. + let mut local_dequeues = 0; + loop { + if let Some(data) = items.next() { + assert!(data > 0); + local_dequeues += 1; + } else { + break; + } + } + assert_eq!(local_dequeues, stolen); + } + counter = counter.wrapping_add(1); + if counter % 1000 == 0 && stop_signal.load(Relaxed) { + break; + } + } + num_stolen +} + +fn test_queue(params: TestParams) { + let stop = Arc::new(AtomicBool::new(false)); + let (owner, stealer) = bwosqueue::new::(); + + let producer_stop = stop.clone(); + let owner_handle = + thread::spawn(move || owner_thread(owner, params.push_percentage, producer_stop)); + let mut stealer_handles = Vec::with_capacity(params.num_stealers); + for k in 0..params.num_stealers { + let stealer = stealer.clone(); + let stealer_stop = stop.clone(); + + let stealer_core = if let Some(core_id) = params.stealer_core { + Some(CoreId { id: core_id }) + } else { + Some(CoreId { id: k + 1 }) + }; + let stealer_handle = thread::spawn(move || { + if params.steal_blocks { + steal_block_thread(stealer, stealer_core, stealer_stop, params.idle_loop) + } else { + stealer_thread(stealer, stealer_core, stealer_stop, params.idle_loop) + } + }); + stealer_handles.push(stealer_handle); + } + + thread::sleep(time::Duration::from_secs(params.duration as u64)); + println!("Test finished"); + stop.store(true, SeqCst); + let total_stolen: usize = stealer_handles + .into_iter() + .map(|handle| handle.join().expect("Joining stealer failed")) + .sum(); + println!("Waiting for owner to finish"); + let (total_enqueued, total_dequeued) = owner_handle.join().expect("Failed to join owner"); + assert_eq!(total_enqueued, total_dequeued + total_stolen); +} + +#[test] +fn no_stealers_short() { + let p = TestParams { + num_stealers: 0, + duration: 10, + idle_loop: 0, + push_percentage: 50, + stealer_core: None, + steal_blocks: false, + }; + 
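+    // With no stealers the owner must account for every item itself, so the final
+    // assertion in `test_queue` reduces to `total_enqueued == total_dequeued`.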
test_queue(p); +} + +#[test] +fn with_stealers_short() { + let p = TestParams { + num_stealers: 2, + duration: 10, + idle_loop: 0, + push_percentage: 50, + stealer_core: None, + steal_blocks: false, + }; + test_queue(p); +} + +#[test] +#[ignore] +fn with_stealers_long() { + let p = TestParams { + num_stealers: 5, + duration: 100, + idle_loop: 500000, + push_percentage: 70, + stealer_core: None, + steal_blocks: true, + }; + test_queue(p); +} diff --git a/tokio/Cargo.toml b/tokio/Cargo.toml index 7dffee1ab7f..5e82094b195 100644 --- a/tokio/Cargo.toml +++ b/tokio/Cargo.toml @@ -24,7 +24,8 @@ keywords = ["io", "async", "non-blocking", "futures"] [features] # Include nothing by default default = [] - +# todo: behind unstable flag +bwos = ["bwosqueue"] # enable everything full = [ "fs", @@ -99,6 +100,8 @@ autocfg = "1.1" [dependencies] tokio-macros = { version = "~2.1.0", path = "../tokio-macros", optional = true } +bwosqueue = { path = "../bwosqueue", features = ["stats"], optional = true } + pin-project-lite = "0.2.0" # Everything else is optional... diff --git a/tokio/src/runtime/builder.rs b/tokio/src/runtime/builder.rs index dda21a3ae27..523964832ad 100644 --- a/tokio/src/runtime/builder.rs +++ b/tokio/src/runtime/builder.rs @@ -188,11 +188,20 @@ cfg_unstable! { pub(crate) type ThreadNameFn = std::sync::Arc String + Send + Sync + 'static>; +#[derive(Clone, Copy)] +pub(crate) enum MultiThreadFlavor { + Default, + // The size of Bwos (and potentially also the default queue) could be configured in + // the future. + #[cfg(feature = "bwos")] + Bwos, +} + #[derive(Clone, Copy)] pub(crate) enum Kind { CurrentThread, #[cfg(all(feature = "rt-multi-thread", not(tokio_wasi)))] - MultiThread, + MultiThread(MultiThreadFlavor), } impl Builder { @@ -222,7 +231,17 @@ impl Builder { #[cfg_attr(docsrs, doc(cfg(feature = "rt-multi-thread")))] pub fn new_multi_thread() -> Builder { // The number `61` is fairly arbitrary. I believe this value was copied from golang. - Builder::new(Kind::MultiThread, 61, 61) + Builder::new(Kind::MultiThread(MultiThreadFlavor::Default), 61, 61) + } + + /// Returns a new builder with the BWoS multi thread scheduler selected. + /// + /// Configuration methods can be chained on the return value. + #[cfg(all(feature = "rt-multi-thread", feature = "bwos"))] + #[cfg_attr(docsrs, doc(cfg(feature = "bwos")))] + pub fn new_multi_thread_bwos() -> Builder { + // The number `61` is copied from `new_multi_thread()`. + Builder::new(Kind::MultiThread(MultiThreadFlavor::Bwos), 61, 61) } } @@ -649,7 +668,7 @@ impl Builder { match &self.kind { Kind::CurrentThread => self.build_current_thread_runtime(), #[cfg(all(feature = "rt-multi-thread", not(tokio_wasi)))] - Kind::MultiThread => self.build_threaded_runtime(), + Kind::MultiThread(flavor) => self.build_threaded_runtime(*flavor), } } @@ -658,7 +677,7 @@ impl Builder { enable_pause_time: match self.kind { Kind::CurrentThread => true, #[cfg(all(feature = "rt-multi-thread", not(tokio_wasi)))] - Kind::MultiThread => false, + Kind::MultiThread(_) => false, }, enable_io: self.enable_io, enable_time: self.enable_time, @@ -1163,7 +1182,7 @@ cfg_test_util! { cfg_rt_multi_thread! { impl Builder { - fn build_threaded_runtime(&mut self) -> io::Result { + fn build_threaded_runtime(&mut self, flavor: MultiThreadFlavor) -> io::Result { use crate::loom::sys::num_cpus; use crate::runtime::{Config, runtime::Scheduler}; use crate::runtime::scheduler::{self, MultiThread}; @@ -1183,6 +1202,7 @@ cfg_rt_multi_thread! 
{ let (scheduler, handle, launch) = MultiThread::new( core_threads, + flavor, driver, driver_handle, blocking_spawner, diff --git a/tokio/src/runtime/scheduler/multi_thread/mod.rs b/tokio/src/runtime/scheduler/multi_thread/mod.rs index 47cd1f3d7ae..35cdb56ece6 100644 --- a/tokio/src/runtime/scheduler/multi_thread/mod.rs +++ b/tokio/src/runtime/scheduler/multi_thread/mod.rs @@ -11,6 +11,8 @@ pub(crate) use park::{Parker, Unparker}; pub(crate) mod queue; +//pub(crate) mod queue; + mod worker; pub(crate) use worker::Launch; @@ -24,6 +26,7 @@ use crate::runtime::{ }; use crate::util::RngSeedGenerator; +use crate::runtime::builder::MultiThreadFlavor; use std::fmt; use std::future::Future; @@ -35,6 +38,7 @@ pub(crate) struct MultiThread; impl MultiThread { pub(crate) fn new( size: usize, + flavor: MultiThreadFlavor, driver: Driver, driver_handle: driver::Handle, blocking_spawner: blocking::Spawner, @@ -44,6 +48,7 @@ impl MultiThread { let parker = Parker::new(driver); let (handle, launch) = worker::create( size, + flavor, parker, driver_handle, blocking_spawner, diff --git a/tokio/src/runtime/scheduler/multi_thread/queue.rs b/tokio/src/runtime/scheduler/multi_thread/queue.rs index faf56db2e91..71f9f7e9d13 100644 --- a/tokio/src/runtime/scheduler/multi_thread/queue.rs +++ b/tokio/src/runtime/scheduler/multi_thread/queue.rs @@ -1,533 +1,82 @@ -//! Run-queue structures to support a work-stealing scheduler - -use crate::loom::cell::UnsafeCell; -use crate::loom::sync::Arc; -use crate::runtime::task::{self, Inject}; -use crate::runtime::MetricsBatch; - -use std::mem::{self, MaybeUninit}; -use std::ptr; -use std::sync::atomic::Ordering::{AcqRel, Acquire, Relaxed, Release}; - -// Use wider integers when possible to increase ABA resilience. -// -// See issue #5041: . -cfg_has_atomic_u64! { - type UnsignedShort = u32; - type UnsignedLong = u64; - type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU32; - type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU64; -} -cfg_not_has_atomic_u64! { - type UnsignedShort = u16; - type UnsignedLong = u32; - type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU16; - type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU32; -} - -/// Producer handle. May only be used from a single thread. -pub(crate) struct Local { - inner: Arc>, -} - -/// Consumer handle. May be used from many threads. -pub(crate) struct Steal(Arc>); - -pub(crate) struct Inner { - /// Concurrently updated by many threads. - /// - /// Contains two `UnsignedShort` values. The LSB byte is the "real" head of - /// the queue. The `UnsignedShort` in the MSB is set by a stealer in process - /// of stealing values. It represents the first value being stolen in the - /// batch. The `UnsignedShort` indices are intentionally wider than strictly - /// required for buffer indexing in order to provide ABA mitigation and make - /// it possible to distinguish between full and empty buffers. - /// - /// When both `UnsignedShort` values are the same, there is no active - /// stealer. - /// - /// Tracking an in-progress stealer prevents a wrapping scenario. - head: AtomicUnsignedLong, - - /// Only updated by producer thread but read by many threads. - tail: AtomicUnsignedShort, - - /// Elements - buffer: Box<[UnsafeCell>>; LOCAL_QUEUE_CAPACITY]>, -} - -unsafe impl Send for Inner {} -unsafe impl Sync for Inner {} - -#[cfg(not(loom))] -const LOCAL_QUEUE_CAPACITY: usize = 256; - -// Shrink the size of the local queue when using loom. 
This shouldn't impact -// logic, but allows loom to test more edge cases in a reasonable a mount of -// time. -#[cfg(loom)] -const LOCAL_QUEUE_CAPACITY: usize = 4; - -const MASK: usize = LOCAL_QUEUE_CAPACITY - 1; - -// Constructing the fixed size array directly is very awkward. The only way to -// do it is to repeat `UnsafeCell::new(MaybeUninit::uninit())` 256 times, as -// the contents are not Copy. The trick with defining a const doesn't work for -// generic types. -fn make_fixed_size(buffer: Box<[T]>) -> Box<[T; LOCAL_QUEUE_CAPACITY]> { - assert_eq!(buffer.len(), LOCAL_QUEUE_CAPACITY); - - // safety: We check that the length is correct. - unsafe { Box::from_raw(Box::into_raw(buffer).cast()) } -} - -/// Create a new local run-queue -pub(crate) fn local() -> (Steal, Local) { - let mut buffer = Vec::with_capacity(LOCAL_QUEUE_CAPACITY); - - for _ in 0..LOCAL_QUEUE_CAPACITY { - buffer.push(UnsafeCell::new(MaybeUninit::uninit())); +#[cfg(feature = "bwos")] +mod bwosq; + +mod tokioq; + +use crate::runtime::builder::MultiThreadFlavor; +use crate::runtime::task::Inject; +use crate::runtime::{task, MetricsBatch}; + +pub(crate) fn local( + flavor: MultiThreadFlavor, +) -> ( + Box + Send + Sync>, + Box + Send + Sync>, +) { + match flavor { + MultiThreadFlavor::Default => tokioq::local(), + #[cfg(feature = "bwos")] + MultiThreadFlavor::Bwos => bwosq::local(), } - - let inner = Arc::new(Inner { - head: AtomicUnsignedLong::new(0), - tail: AtomicUnsignedShort::new(0), - buffer: make_fixed_size(buffer.into_boxed_slice()), - }); - - let local = Local { - inner: inner.clone(), - }; - - let remote = Steal(inner); - - (remote, local) } -impl Local { +pub(crate) trait Owner: Send + Sync { /// Returns true if the queue has entries that can be stolen. - pub(crate) fn is_stealable(&self) -> bool { - !self.inner.is_empty() - } + fn is_stealable(&self) -> bool; - /// Returns false if there are any entries in the queue - /// - /// Separate to is_stealable so that refactors of is_stealable to "protect" - /// some tasks from stealing won't affect this - pub(crate) fn has_tasks(&self) -> bool { - !self.inner.is_empty() - } + /// Returns true if there are entries in the queue. + fn has_tasks(&self) -> bool; /// Pushes a task to the back of the local queue, skipping the LIFO slot. - pub(crate) fn push_back( + fn push_back( &mut self, - mut task: task::Notified, + task: task::Notified, inject: &Inject, metrics: &mut MetricsBatch, - ) { - let tail = loop { - let head = self.inner.head.load(Acquire); - let (steal, real) = unpack(head); - - // safety: this is the **only** thread that updates this cell. - let tail = unsafe { self.inner.tail.unsync_load() }; - - if tail.wrapping_sub(steal) < LOCAL_QUEUE_CAPACITY as UnsignedShort { - // There is capacity for the task - break tail; - } else if steal != real { - // Concurrently stealing, this will free up capacity, so only - // push the task onto the inject queue - inject.push(task); - return; - } else { - // Push the current task and half of the queue into the - // inject queue. - match self.push_overflow(task, real, tail, inject, metrics) { - Ok(_) => return, - // Lost the race, try again - Err(v) => { - task = v; - } - } - } - }; - - // Map the position to a slot index. - let idx = tail as usize & MASK; - - self.inner.buffer[idx].with_mut(|ptr| { - // Write the task to the slot - // - // Safety: There is only one producer and the above `if` - // condition ensures we don't touch a cell if there is a - // value, thus no consumer. 
- unsafe { - ptr::write((*ptr).as_mut_ptr(), task); - } - }); + ); - // Make the task available. Synchronizes with a load in - // `steal_into2`. - self.inner.tail.store(tail.wrapping_add(1), Release); - } - - /// Moves a batch of tasks into the inject queue. + /// Push a batch of tasks to the back of the local queue /// - /// This will temporarily make some of the tasks unavailable to stealers. - /// Once `push_overflow` is done, a notification is sent out, so if other - /// workers "missed" some of the tasks during a steal, they will get - /// another opportunity. - #[inline(never)] - fn push_overflow( + /// # Safety: + /// + /// The caller must ensure that the queue has enough capacity to accept + /// all tasks, e.g. by calling `can_enqueue` beforehand. + unsafe fn push_back_batch_unchecked( &mut self, - task: task::Notified, - head: UnsignedShort, - tail: UnsignedShort, - inject: &Inject, + tasks: Box> + '_>, metrics: &mut MetricsBatch, - ) -> Result<(), task::Notified> { - /// How many elements are we taking from the local queue. - /// - /// This is one less than the number of tasks pushed to the inject - /// queue as we are also inserting the `task` argument. - const NUM_TASKS_TAKEN: UnsignedShort = (LOCAL_QUEUE_CAPACITY / 2) as UnsignedShort; - - assert_eq!( - tail.wrapping_sub(head) as usize, - LOCAL_QUEUE_CAPACITY, - "queue is not full; tail = {}; head = {}", - tail, - head - ); - - let prev = pack(head, head); - - // Claim a bunch of tasks - // - // We are claiming the tasks **before** reading them out of the buffer. - // This is safe because only the **current** thread is able to push new - // tasks. - // - // There isn't really any need for memory ordering... Relaxed would - // work. This is because all tasks are pushed into the queue from the - // current thread (or memory has been acquired if the local queue handle - // moved). - if self - .inner - .head - .compare_exchange( - prev, - pack( - head.wrapping_add(NUM_TASKS_TAKEN), - head.wrapping_add(NUM_TASKS_TAKEN), - ), - Release, - Relaxed, - ) - .is_err() - { - // We failed to claim the tasks, losing the race. Return out of - // this function and try the full `push` routine again. The queue - // may not be full anymore. - return Err(task); - } - - /// An iterator that takes elements out of the run queue. - struct BatchTaskIter<'a, T: 'static> { - buffer: &'a [UnsafeCell>>; LOCAL_QUEUE_CAPACITY], - head: UnsignedLong, - i: UnsignedLong, - } - impl<'a, T: 'static> Iterator for BatchTaskIter<'a, T> { - type Item = task::Notified; - - #[inline] - fn next(&mut self) -> Option> { - if self.i == UnsignedLong::from(NUM_TASKS_TAKEN) { - None - } else { - let i_idx = self.i.wrapping_add(self.head) as usize & MASK; - let slot = &self.buffer[i_idx]; - - // safety: Our CAS from before has assumed exclusive ownership - // of the task pointers in this range. - let task = slot.with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); - - self.i += 1; - Some(task) - } - } - } - - // safety: The CAS above ensures that no consumer will look at these - // values again, and we are the only producer. - let batch_iter = BatchTaskIter { - buffer: &self.inner.buffer, - head: head as UnsignedLong, - i: 0, - }; - inject.push_batch(batch_iter.chain(std::iter::once(task))); - - // Add 1 to factor in the task currently being scheduled. - metrics.incr_overflow_count(); - - Ok(()) - } - - /// Pops a task from the local queue. 
- pub(crate) fn pop(&mut self) -> Option> { - let mut head = self.inner.head.load(Acquire); - - let idx = loop { - let (steal, real) = unpack(head); - - // safety: this is the **only** thread that updates this cell. - let tail = unsafe { self.inner.tail.unsync_load() }; - - if real == tail { - // queue is empty - return None; - } - - let next_real = real.wrapping_add(1); + ); - // If `steal == real` there are no concurrent stealers. Both `steal` - // and `real` are updated. - let next = if steal == real { - pack(next_real, next_real) - } else { - assert_ne!(steal, next_real); - pack(steal, next_real) - }; - - // Attempt to claim a task. - let res = self - .inner - .head - .compare_exchange(head, next, AcqRel, Acquire); + /// Ok ( if enqueuing of `num` items will succeed. + /// + /// Returns an optional hint how many items can be enqueued. + fn can_enqueue(&self, num: u16) -> Result<(), Option>; - match res { - Ok(_) => break real as usize & MASK, - Err(actual) => head = actual, - } - }; + /// Pop one task from the front of the queue. + fn pop(&mut self) -> Option>; - Some(self.inner.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) - } + // /// approximate length of the queue + // fn len(&self) -> usize { + // todo!() + // } } -impl Steal { - pub(crate) fn is_empty(&self) -> bool { - self.0.is_empty() - } +pub(crate) trait Stealer: Send + Sync { + // Todo: `is_empty()` is hard to implement for BWoS, since + // the stealer doesn't really have access to this kind of information, + // at least not in an inexpensive way, that doesn't interfere with the + // owner. Check if the upper layers really need this function! + fn is_empty(&self) -> bool; /// Steals half the tasks from self and place them into `dst`. - pub(crate) fn steal_into( + fn steal_into( &self, - dst: &mut Local, + dst: &mut dyn Owner, dst_metrics: &mut MetricsBatch, - ) -> Option> { - // Safety: the caller is the only thread that mutates `dst.tail` and - // holds a mutable reference. - let dst_tail = unsafe { dst.inner.tail.unsync_load() }; - - // To the caller, `dst` may **look** empty but still have values - // contained in the buffer. If another thread is concurrently stealing - // from `dst` there may not be enough capacity to steal. - let (steal, _) = unpack(dst.inner.head.load(Acquire)); - - if dst_tail.wrapping_sub(steal) > LOCAL_QUEUE_CAPACITY as UnsignedShort / 2 { - // we *could* try to steal less here, but for simplicity, we're just - // going to abort. - return None; - } - - // Steal the tasks into `dst`'s buffer. This does not yet expose the - // tasks in `dst`. - let mut n = self.steal_into2(dst, dst_tail); - - if n == 0 { - // No tasks were stolen - return None; - } + ) -> Option>; - dst_metrics.incr_steal_count(n as u16); - dst_metrics.incr_steal_operations(); - - // We are returning a task here - n -= 1; - - let ret_pos = dst_tail.wrapping_add(n); - let ret_idx = ret_pos as usize & MASK; - - // safety: the value was written as part of `steal_into2` and not - // exposed to stealers, so no other thread can access it. - let ret = dst.inner.buffer[ret_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); - - if n == 0 { - // The `dst` queue is empty, but a single task was stolen - return Some(ret); - } - - // Make the stolen items available to consumers - dst.inner.tail.store(dst_tail.wrapping_add(n), Release); - - Some(ret) + cfg_metrics! { + /// Number of tasks in the queue. + fn len(&self) -> usize; } - - // Steal tasks from `self`, placing them into `dst`. 
Returns the number of - // tasks that were stolen. - fn steal_into2(&self, dst: &mut Local, dst_tail: UnsignedShort) -> UnsignedShort { - let mut prev_packed = self.0.head.load(Acquire); - let mut next_packed; - - let n = loop { - let (src_head_steal, src_head_real) = unpack(prev_packed); - let src_tail = self.0.tail.load(Acquire); - - // If these two do not match, another thread is concurrently - // stealing from the queue. - if src_head_steal != src_head_real { - return 0; - } - - // Number of available tasks to steal - let n = src_tail.wrapping_sub(src_head_real); - let n = n - n / 2; - - if n == 0 { - // No tasks available to steal - return 0; - } - - // Update the real head index to acquire the tasks. - let steal_to = src_head_real.wrapping_add(n); - assert_ne!(src_head_steal, steal_to); - next_packed = pack(src_head_steal, steal_to); - - // Claim all those tasks. This is done by incrementing the "real" - // head but not the steal. By doing this, no other thread is able to - // steal from this queue until the current thread completes. - let res = self - .0 - .head - .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); - - match res { - Ok(_) => break n, - Err(actual) => prev_packed = actual, - } - }; - - assert!( - n <= LOCAL_QUEUE_CAPACITY as UnsignedShort / 2, - "actual = {}", - n - ); - - let (first, _) = unpack(next_packed); - - // Take all the tasks - for i in 0..n { - // Compute the positions - let src_pos = first.wrapping_add(i); - let dst_pos = dst_tail.wrapping_add(i); - - // Map to slots - let src_idx = src_pos as usize & MASK; - let dst_idx = dst_pos as usize & MASK; - - // Read the task - // - // safety: We acquired the task with the atomic exchange above. - let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); - - // Write the task to the new slot - // - // safety: `dst` queue is empty and we are the only producer to - // this queue. - dst.inner.buffer[dst_idx] - .with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); - } - - let mut prev_packed = next_packed; - - // Update `src_head_steal` to match `src_head_real` signalling that the - // stealing routine is complete. - loop { - let head = unpack(prev_packed).1; - next_packed = pack(head, head); - - let res = self - .0 - .head - .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); - - match res { - Ok(_) => return n, - Err(actual) => { - let (actual_steal, actual_real) = unpack(actual); - - assert_ne!(actual_steal, actual_real); - - prev_packed = actual; - } - } - } - } -} - -cfg_metrics! { - impl Steal { - pub(crate) fn len(&self) -> usize { - self.0.len() as _ - } - } -} - -impl Clone for Steal { - fn clone(&self) -> Steal { - Steal(self.0.clone()) - } -} - -impl Drop for Local { - fn drop(&mut self) { - if !std::thread::panicking() { - assert!(self.pop().is_none(), "queue not empty"); - } - } -} - -impl Inner { - fn len(&self) -> UnsignedShort { - let (_, head) = unpack(self.head.load(Acquire)); - let tail = self.tail.load(Acquire); - - tail.wrapping_sub(head) - } - - fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -/// Split the head value into the real head and the index a stealer is working -/// on. 
-fn unpack(n: UnsignedLong) -> (UnsignedShort, UnsignedShort) { - let real = n & UnsignedShort::MAX as UnsignedLong; - let steal = n >> (mem::size_of::() * 8); - - (steal as UnsignedShort, real as UnsignedShort) -} - -/// Join the two head values -fn pack(steal: UnsignedShort, real: UnsignedShort) -> UnsignedLong { - (real as UnsignedLong) | ((steal as UnsignedLong) << (mem::size_of::() * 8)) -} - -#[test] -fn test_local_queue_capacity() { - assert!(LOCAL_QUEUE_CAPACITY - 1 <= u8::MAX as usize); } diff --git a/tokio/src/runtime/scheduler/multi_thread/queue/bwosq.rs b/tokio/src/runtime/scheduler/multi_thread/queue/bwosq.rs new file mode 100644 index 00000000000..868fd647b13 --- /dev/null +++ b/tokio/src/runtime/scheduler/multi_thread/queue/bwosq.rs @@ -0,0 +1,148 @@ +use std::convert::TryInto; + +use crate::runtime::scheduler::multi_thread::queue::Owner as OwnerTrait; +use crate::runtime::task::{self, Inject, Notified}; +use crate::runtime::MetricsBatch; +use bwosqueue::{Owner, Stealer}; + +// todo: Discuss using const generics or runtime values. Benchmark performance difference. +const NUM_BLOCKS: usize = 8; +const ELEMENTS_PER_BLOCK: usize = 32; + +/// Producer handle. May only be used from a single thread. +pub(crate) struct Local { + inner: Owner, NUM_BLOCKS, ELEMENTS_PER_BLOCK>, +} + +/// Consumer handle. May be used from many threads. +pub(crate) struct Steal(Stealer, NUM_BLOCKS, ELEMENTS_PER_BLOCK>); + +/// Create a new local run-queue +pub(crate) fn local() -> ( + Box + Send + Sync>, + Box + Send + Sync>, +) { + let (owner, stealer) = bwosqueue::new::, NUM_BLOCKS, ELEMENTS_PER_BLOCK>(); + + let local = Local { inner: owner }; + + let remote = Steal(stealer); + + (Box::new(remote), Box::new(local)) +} + +impl super::Owner for Local { + /// Returns true if the queue has entries that can be stolen. + fn is_stealable(&self) -> bool { + self.inner.has_stealable_block() + } + + /// Returns true if there are entries in the queue. + fn has_tasks(&self) -> bool { + self.inner.can_consume() + } + + /// Pushes a task to the back of the local queue, skipping the LIFO slot. + fn push_back( + &mut self, + task: task::Notified, + inject: &Inject, + metrics: &mut MetricsBatch, + ) { + if let Err(t) = self.inner.enqueue(task) { + inject.push(t); + // note: the current implementation is slow + // if self.inner.has_stealers() { + // inject.push(t); + // } else { + // // push overflow of old queue + // if let Some(block_iter) = self.inner.dequeue_block() { + // // could use `and_then` to chain block dequeues a couple of times if + // // successfull, if we want to steal more than one block + // inject.push_batch(block_iter.chain(std::iter::once(t))) + // } else { + // // Give up and use inject queue. + // inject.push(t) + // } + // } + // Add 1 to factor in the task currently being scheduled. + metrics.incr_overflow_count(); + }; + } + + unsafe fn push_back_batch_unchecked( + &mut self, + tasks: Box> + '_>, + _metrics: &mut MetricsBatch, + ) { + let _num_enqueued = self.inner.enqueue_batch_unchecked(tasks); + } + + fn can_enqueue(&self, num: u16) -> Result<(), Option> { + // todo: trait signature probably needs to be modified. 
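+        // BWoS can only answer this cheaply at block granularity: `can_enqueue_block()`
+        // reports whether a whole block of `ELEMENTS_PER_BLOCK` slots is writable, so
+        // `num` is effectively ignored and no capacity hint is returned on failure.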
+ if self.inner.can_enqueue_block() { + Ok(()) + } else { + Err(None) + } + } + + fn pop(&mut self) -> Option> { + self.inner.dequeue() + } +} + +impl Drop for Local { + fn drop(&mut self) { + if !std::thread::panicking() { + assert!(self.pop().is_none(), "queue not empty"); + } + } +} + +impl super::Stealer for Steal { + fn is_empty(&self) -> bool { + self.0.estimated_queue_entries() == 0 + } + + /// Steals one block from self and place them into `dst`. + fn steal_into( + &self, + dst: &mut dyn OwnerTrait, + dst_metrics: &mut MetricsBatch, + ) -> Option> { + // We know `dst` is empty, so we expect this to enqueue to succeed in most cases. + // In the rare case that the `dst` queue is at the same time also full, because the + // producer is blocked waiting on a stealer we only attempt to steal a single task + // todo: can_enqueue... + if dst.can_enqueue(ELEMENTS_PER_BLOCK as u16).is_err() { + return self.0.steal(); + } + + if let Some(mut stolen_tasks) = self.0.steal_block() { + let num_stolen = stolen_tasks.len(); + let first = stolen_tasks.next(); + debug_assert!(first.is_some()); + unsafe { dst.push_back_batch_unchecked(Box::new(stolen_tasks), dst_metrics) } + dst_metrics.incr_steal_count(num_stolen.try_into().unwrap()); + dst_metrics.incr_steal_operations(); + first + } else { + None + } + } +} + +impl Clone for Steal { + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +cfg_metrics! { + impl Steal { + pub(crate) fn len(&self) -> usize { + self.0.estimated_queue_entries() + } + } +} diff --git a/tokio/src/runtime/scheduler/multi_thread/queue/tokioq.rs b/tokio/src/runtime/scheduler/multi_thread/queue/tokioq.rs new file mode 100644 index 00000000000..644b39c5668 --- /dev/null +++ b/tokio/src/runtime/scheduler/multi_thread/queue/tokioq.rs @@ -0,0 +1,706 @@ +//! Run-queue structures to support a work-stealing scheduler + +use crate::loom::cell::UnsafeCell; +use crate::loom::sync::Arc; +use crate::runtime::task::{self, Inject, Notified}; +use crate::runtime::MetricsBatch; + +use crate::runtime::scheduler::multi_thread::queue::{Owner, Stealer}; +use std::mem::{self, MaybeUninit}; +use std::ptr; +use std::sync::atomic::Ordering::{AcqRel, Acquire, Relaxed, Release}; + +// Use wider integers when possible to increase ABA resilience. +// +// See issue #5041: . +cfg_has_atomic_u64! { + type UnsignedShort = u32; + type UnsignedLong = u64; + type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU32; + type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU64; +} +cfg_not_has_atomic_u64! { + type UnsignedShort = u16; + type UnsignedLong = u32; + type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU16; + type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU32; +} + +/// Producer handle. May only be used from a single thread. +pub(crate) struct Local { + inner: Arc>, +} + +/// Consumer handle. May be used from many threads. +pub(crate) struct Steal(Arc>); + +pub(crate) struct Inner { + /// Concurrently updated by many threads. + /// + /// Contains two `UnsignedShort` values. The LSB byte is the "real" head of + /// the queue. The `UnsignedShort` in the MSB is set by a stealer in process + /// of stealing values. It represents the first value being stolen in the + /// batch. The `UnsignedShort` indices are intentionally wider than strictly + /// required for buffer indexing in order to provide ABA mitigation and make + /// it possible to distinguish between full and empty buffers. 
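+    /// As a rough sketch of the encoding (for the 64-bit variant): `pack(steal, real)`
+    /// below stores `real` in the low 32 bits and `steal` in the high 32 bits, so
+    /// `pack(3, 5) == (3 << 32) | 5`, and `unpack(pack(3, 5)) == (3, 5)`.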
+ /// + /// When both `UnsignedShort` values are the same, there is no active + /// stealer. + /// + /// Tracking an in-progress stealer prevents a wrapping scenario. + head: AtomicUnsignedLong, + + /// Only updated by producer thread but read by many threads. + tail: AtomicUnsignedShort, + + /// Elements + buffer: Box<[UnsafeCell>>; LOCAL_QUEUE_CAPACITY]>, +} + +unsafe impl Send for Inner {} +unsafe impl Sync for Inner {} + +#[cfg(not(loom))] +const LOCAL_QUEUE_CAPACITY: usize = 256; + +// Shrink the size of the local queue when using loom. This shouldn't impact +// logic, but allows loom to test more edge cases in a reasonable a mount of +// time. +#[cfg(loom)] +const LOCAL_QUEUE_CAPACITY: usize = 4; + +const MASK: usize = LOCAL_QUEUE_CAPACITY - 1; + +// Constructing the fixed size array directly is very awkward. The only way to +// do it is to repeat `UnsafeCell::new(MaybeUninit::uninit())` 256 times, as +// the contents are not Copy. The trick with defining a const doesn't work for +// generic types. +fn make_fixed_size(buffer: Box<[T]>) -> Box<[T; LOCAL_QUEUE_CAPACITY]> { + assert_eq!(buffer.len(), LOCAL_QUEUE_CAPACITY); + + // safety: We check that the length is correct. + unsafe { Box::from_raw(Box::into_raw(buffer).cast()) } +} + +#[clippy::has_significant_drop] +struct StealerIterator<'a, T: 'static> { + stealer: &'a Steal, + // `tail` index of the stealer in the queue. Should not change + stolen_tail: UnsignedShort, + /// current position in the iterator + cur_pos: UnsignedShort, + /// Head of the stealer (one element past the last reserved item) + head: UnsignedShort, +} + +impl<'a, T> StealerIterator<'a, T> { + fn is_empty(&self) -> bool { + // tail will always be behind head, but head could have wrapped around already, + // so calculate `new_tail` before comparing with head. + let new_tail = self.stolen_tail.wrapping_add(self.cur_pos); + new_tail >= self.head + } +} + +impl<'a, T> Iterator for StealerIterator<'a, T> { + type Item = task::Notified; + + fn next(&mut self) -> Option { + // tail will always be behind head, but head could have wrapped around already, + // so calculate `new_tail` before comparing with head. + let new_tail = self.stolen_tail.wrapping_add(self.cur_pos); + if new_tail < self.head { + let idx = (new_tail as usize) & MASK; + let task = self.stealer.0.buffer[idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + self.cur_pos += 1; + Some(task) + } else { + None + } + } +} + +impl<'a, T> Drop for StealerIterator<'a, T> { + fn drop(&mut self) { + debug_assert!(self.is_empty()); + // This is the value of head if no further enqueues happened concurrently. + let mut prev_packed = pack(self.stolen_tail, self.head); + + let mut new_real = self.head; + + // Update `head_steal` to match `head_real` signalling that the + // stealing routine is complete. + loop { + let next_packed = pack(new_real, new_real); + + let res = + self.stealer + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => return (), + Err(actual) => { + let (actual_steal, actual_real) = unpack(actual); + + assert_ne!(actual_steal, actual_real); + // We don't concurrently steal, so the actual steal shouldn't have changed. 
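+                    // A failed CAS here means the owner popped entries while this steal was
+                    // pending and advanced `real`; adopt the observed `real` and retry until
+                    // both halves can be set to the same value.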
+ debug_assert_eq!(self.stolen_tail, actual_steal); + prev_packed = actual; + new_real = actual_real; + } + } + } + } +} + +/// Create a new local run-queue +pub(crate) fn local() -> ( + Box + Send + Sync>, + Box + Send + Sync>, +) { + let mut buffer = Vec::with_capacity(LOCAL_QUEUE_CAPACITY); + + for _ in 0..LOCAL_QUEUE_CAPACITY { + buffer.push(UnsafeCell::new(MaybeUninit::uninit())); + } + + let inner = Arc::new(Inner { + head: AtomicUnsignedLong::new(0), + tail: AtomicUnsignedShort::new(0), + buffer: make_fixed_size(buffer.into_boxed_slice()), + }); + + let local = Local { + inner: inner.clone(), + }; + + let remote = Steal(inner); + + ( + Box::new(remote) as Box + Send + Sync>, + Box::new(local) as Box + Send + Sync>, + ) +} + +impl Local { + /// Moves a batch of tasks into the inject queue. + /// + /// This will temporarily make some of the tasks unavailable to stealers. + /// Once `push_overflow` is done, a notification is sent out, so if other + /// workers "missed" some of the tasks during a steal, they will get + /// another opportunity. + #[inline(never)] + fn push_overflow( + &mut self, + task: task::Notified, + head: UnsignedShort, + tail: UnsignedShort, + inject: &Inject, + metrics: &mut MetricsBatch, + ) -> Result<(), task::Notified> { + /// How many elements are we taking from the local queue. + /// + /// This is one less than the number of tasks pushed to the inject + /// queue as we are also inserting the `task` argument. + const NUM_TASKS_TAKEN: UnsignedShort = (LOCAL_QUEUE_CAPACITY / 2) as UnsignedShort; + + assert_eq!( + tail.wrapping_sub(head) as usize, + LOCAL_QUEUE_CAPACITY, + "queue is not full; tail = {}; head = {}", + tail, + head + ); + + let prev = pack(head, head); + + // Claim a bunch of tasks + // + // We are claiming the tasks **before** reading them out of the buffer. + // This is safe because only the **current** thread is able to push new + // tasks. + // + // There isn't really any need for memory ordering... Relaxed would + // work. This is because all tasks are pushed into the queue from the + // current thread (or memory has been acquired if the local queue handle + // moved). + if self + .inner + .head + .compare_exchange( + prev, + pack( + head.wrapping_add(NUM_TASKS_TAKEN), + head.wrapping_add(NUM_TASKS_TAKEN), + ), + Release, + Relaxed, + ) + .is_err() + { + // We failed to claim the tasks, losing the race. Return out of + // this function and try the full `push` routine again. The queue + // may not be full anymore. + return Err(task); + } + + /// An iterator that takes elements out of the run queue. + struct BatchTaskIter<'a, T: 'static> { + buffer: &'a [UnsafeCell>>; LOCAL_QUEUE_CAPACITY], + head: UnsignedLong, + i: UnsignedLong, + } + impl<'a, T: 'static> Iterator for BatchTaskIter<'a, T> { + type Item = task::Notified; + + #[inline] + fn next(&mut self) -> Option> { + if self.i == UnsignedLong::from(NUM_TASKS_TAKEN) { + None + } else { + let i_idx = self.i.wrapping_add(self.head) as usize & MASK; + let slot = &self.buffer[i_idx]; + + // safety: Our CAS from before has assumed exclusive ownership + // of the task pointers in this range. + let task = slot.with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + self.i += 1; + Some(task) + } + } + } + + // safety: The CAS above ensures that no consumer will look at these + // values again, and we are the only producer. 
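+        // `BatchTaskIter` reads the claimed slots lazily; chaining the overflowing `task`
+        // onto it lets `inject.push_batch` move everything over in a single call.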
+ let batch_iter = BatchTaskIter { + buffer: &self.inner.buffer, + head: head as UnsignedLong, + i: 0, + }; + inject.push_batch(batch_iter.chain(std::iter::once(task))); + + // Add 1 to factor in the task currently being scheduled. + metrics.incr_overflow_count(); + + Ok(()) + } +} + +impl super::Owner for Local { + /// Returns true if the queue has entries that can be stolen. + fn is_stealable(&self) -> bool { + !self.inner.is_empty() + } + + /// Returns false if there are any entries in the queue + /// + /// Separate to is_stealable so that refactors of is_stealable to "protect" + /// some tasks from stealing won't affect this + fn has_tasks(&self) -> bool { + !self.inner.is_empty() + } + + /// Pushes a task to the back of the local queue, skipping the LIFO slot. + fn push_back( + &mut self, + mut task: task::Notified, + inject: &Inject, + metrics: &mut MetricsBatch, + ) { + let tail = loop { + let head = self.inner.head.load(Acquire); + let (steal, real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if tail.wrapping_sub(steal) < LOCAL_QUEUE_CAPACITY as UnsignedShort { + // There is capacity for the task + break tail; + } else if steal != real { + // Concurrently stealing, this will free up capacity, so only + // push the task onto the inject queue + inject.push(task); + return; + } else { + // Push the current task and half of the queue into the + // inject queue. + match self.push_overflow(task, real, tail, inject, metrics) { + Ok(_) => return, + // Lost the race, try again + Err(v) => { + task = v; + } + } + } + }; + + // Map the position to a slot index. + let idx = tail as usize & MASK; + + self.inner.buffer[idx].with_mut(|ptr| { + // Write the task to the slot + // + // Safety: There is only one producer and the above `if` + // condition ensures we don't touch a cell if there is a + // value, thus no consumer. + unsafe { + ptr::write((*ptr).as_mut_ptr(), task); + } + }); + + // Make the task available. Synchronizes with a load in + // `steal_into2`. + self.inner.tail.store(tail.wrapping_add(1), Release); + } + + // fn len(&self) -> usize { + // // Safety: We own the queue and thus are the only ones that could potentially mutate + // // `inner.tail`. + // let dst_tail = unsafe { self.inner.tail.unsync_load() }; + // + // // To the caller, `dst` may **look** empty but still have values + // // contained in the buffer. If another thread is concurrently stealing + // // from `dst` there may not be enough capacity to steal. + // let (steal, real_head) = unpack(self.inner.head.load(Acquire)); + // } + + #[deny(unsafe_op_in_unsafe_fn)] + unsafe fn push_back_batch_unchecked( + &mut self, + tasks: Box> + '_>, + metrics: &mut MetricsBatch, + ) { + // Safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + let mut count = 0; + for task in tasks { + let idx = tail.wrapping_add(count) as usize & MASK; + // Write the task to the new slot + // + // Safety: We are the queue Owner and the caller assures the queue has sufficient capacity. 
+ self.inner.buffer[idx].with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); + count += 1; + } + + metrics.incr_steal_count(count as u16); + metrics.incr_steal_operations(); + + // Make the stolen items available to consumers + self.inner.tail.store(tail.wrapping_add(count), Release); + } + + fn can_enqueue(&self, num: u16) -> Result<(), Option> { + // Safety: We own the queue and thus are the only ones that could potentially mutate + // `inner.tail`. + let tail = unsafe { self.inner.tail.unsync_load() }; + + // To the caller, `dst` may **look** empty but still have values + // contained in the buffer. If another thread is concurrently stealing + // from `dst` there may not be enough capacity to steal. + let (steal, _real_head) = unpack(self.inner.head.load(Acquire)); + + // `steal` is behind `real_head` when there is an in-progress steal, otherwise it is + // equal to `real_head`. `tail` - `steal` is the amount of queue slots currently used. + // `tail` is always larger then `steal`, since the counter is monotonically increasing, + // at least until it wraps around at `UnsignedShort::MAX`. wrapping_sub always gives the + // correct difference. + let capacity = LOCAL_QUEUE_CAPACITY as UnsignedShort - (tail.wrapping_sub(steal)); + if capacity > num as UnsignedShort { + Ok(()) + } else { + Err(Some(capacity as u16)) + } + } + + /// Pops a task from the local queue. + fn pop(&mut self) -> Option> { + let mut head = self.inner.head.load(Acquire); + + let idx = loop { + let (steal, real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if real == tail { + // queue is empty + return None; + } + + let next_real = real.wrapping_add(1); + + // If `steal == real` there are no concurrent stealers. Both `steal` + // and `real` are updated. + let next = if steal == real { + pack(next_real, next_real) + } else { + assert_ne!(steal, next_real); + pack(steal, next_real) + }; + + // Attempt to claim a task. + let res = self + .inner + .head + .compare_exchange(head, next, AcqRel, Acquire); + + match res { + Ok(_) => break real as usize & MASK, + Err(actual) => head = actual, + } + }; + + Some(self.inner.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) + } +} + +impl super::Stealer for Steal { + fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Steals half the tasks from self and place them into `dst`. + fn steal_into( + &self, + dst: &mut dyn Owner, + dst_metrics: &mut MetricsBatch, + ) -> Option> { + if dst.can_enqueue(LOCAL_QUEUE_CAPACITY as u16 / 2).is_err() { + // we *could* try to steal less here, but for simplicity, we're just + // going to abort. + return None; + } + + let mut stolen_tasks = self.steal_half()?; + + // We take the first task from the iterator to directly return it. + let first = stolen_tasks.next()?; + + if stolen_tasks.is_empty() { + dst_metrics.incr_steal_count(1); + dst_metrics.incr_steal_operations(); + return Some(first); + } else { + // Safety: We checked that `dst` has sufficient capacity, and we are the owner + // thread, so the capacity can only have increased in the meantime. + unsafe { dst.push_back_batch_unchecked(Box::new(stolen_tasks), dst_metrics) } + } + + Some(first) + } + + cfg_metrics! 
{ + fn len(&self) -> usize { + self.0.len() as _ + } + } +} + +impl Steal { + fn steal_half(&self) -> Option> { + let mut prev_packed = self.0.head.load(Acquire); + let mut next_packed; + + let (steal_head, real_head) = loop { + let (src_head_steal, src_head_real) = unpack(prev_packed); + let src_tail = self.0.tail.load(Acquire); + // If these two do not match, another thread is concurrently + // stealing from the queue. + if src_head_steal != src_head_real { + return None; + } + // Number of available tasks to steal + let n = src_tail.wrapping_sub(src_head_real); + let n = n - n / 2; + + if n == 0 { + // No tasks available to steal + return None; + } + // Update the real head index to acquire the tasks. + let steal_to = src_head_real.wrapping_add(n); + assert_ne!(src_head_steal, steal_to); + next_packed = pack(src_head_steal, steal_to); + + // Claim all those tasks. This is done by incrementing the "real" + // head but not the steal. By doing this, no other thread is able to + // steal from this queue until the current thread completes. + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => { + break (src_head_steal, steal_to); + } + Err(actual) => prev_packed = actual, + } + }; + + let n = real_head.wrapping_sub(steal_head); + assert!( + n <= LOCAL_QUEUE_CAPACITY as UnsignedShort / 2, + "actual = {}", + n + ); + + Some(StealerIterator { + stealer: &self, + stolen_tail: steal_head, + cur_pos: 0, + head: real_head, + }) + } + + // // Steal tasks from `self`, placing them into `dst`. Returns the number of + // // tasks that were stolen. + // fn steal_into2(&self, dst: &mut Local, dst_tail: UnsignedShort) -> UnsignedShort { + // let mut prev_packed = self.0.head.load(Acquire); + // let mut next_packed; + // + // let n = loop { + // let (src_head_steal, src_head_real) = unpack(prev_packed); + // let src_tail = self.0.tail.load(Acquire); + // + // // If these two do not match, another thread is concurrently + // // stealing from the queue. + // if src_head_steal != src_head_real { + // return 0; + // } + // + // // Number of available tasks to steal + // let n = src_tail.wrapping_sub(src_head_real); + // let n = n - n / 2; + // + // if n == 0 { + // // No tasks available to steal + // return 0; + // } + // + // // Update the real head index to acquire the tasks. + // let steal_to = src_head_real.wrapping_add(n); + // assert_ne!(src_head_steal, steal_to); + // next_packed = pack(src_head_steal, steal_to); + // + // // Claim all those tasks. This is done by incrementing the "real" + // // head but not the steal. By doing this, no other thread is able to + // // steal from this queue until the current thread completes. + // let res = self + // .0 + // .head + // .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + // + // match res { + // Ok(_) => break n, + // Err(actual) => prev_packed = actual, + // } + // }; + // + // assert!( + // n <= LOCAL_QUEUE_CAPACITY as UnsignedShort / 2, + // "actual = {}", + // n + // ); + // + // let (first, _) = unpack(next_packed); + // + // // Take all the tasks + // for i in 0..n { + // // Compute the positions + // let src_pos = first.wrapping_add(i); + // let dst_pos = dst_tail.wrapping_add(i); + // + // // Map to slots + // let src_idx = src_pos as usize & MASK; + // let dst_idx = dst_pos as usize & MASK; + // + // // Read the task + // // + // // safety: We acquired the task with the atomic exchange above. 
+ // let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + // + // // Write the task to the new slot + // // + // // safety: `dst` queue is empty and we are the only producer to + // // this queue. + // dst.inner.buffer[dst_idx] + // .with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); + // } + // + // let mut prev_packed = next_packed; + // + // // Update `src_head_steal` to match `src_head_real` signalling that the + // // stealing routine is complete. + // loop { + // let head = unpack(prev_packed).1; + // next_packed = pack(head, head); + // + // let res = self + // .0 + // .head + // .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + // + // match res { + // Ok(_) => return n, + // Err(actual) => { + // let (actual_steal, actual_real) = unpack(actual); + // + // assert_ne!(actual_steal, actual_real); + // + // prev_packed = actual; + // } + // } + // } + // } +} + +impl Clone for Steal { + fn clone(&self) -> Steal { + Steal(self.0.clone()) + } +} + +impl Inner { + fn len(&self) -> UnsignedShort { + let (_, head) = unpack(self.head.load(Acquire)); + let tail = self.tail.load(Acquire); + + tail.wrapping_sub(head) + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// Split the head value into the real head and the index a stealer is working +/// on. +fn unpack(n: UnsignedLong) -> (UnsignedShort, UnsignedShort) { + let real = n & UnsignedShort::MAX as UnsignedLong; + let steal = n >> (mem::size_of::() * 8); + + (steal as UnsignedShort, real as UnsignedShort) +} + +/// Join the two head values +fn pack(steal: UnsignedShort, real: UnsignedShort) -> UnsignedLong { + (real as UnsignedLong) | ((steal as UnsignedLong) << (mem::size_of::() * 8)) +} + +impl Drop for Local { + fn drop(&mut self) { + if !std::thread::panicking() { + assert!(self.pop().is_none(), "queue not empty"); + } + } +} + +#[test] +fn test_local_queue_capacity() { + assert!(LOCAL_QUEUE_CAPACITY - 1 <= u8::MAX as usize); +} diff --git a/tokio/src/runtime/scheduler/multi_thread/worker.rs b/tokio/src/runtime/scheduler/multi_thread/worker.rs index c59f19e59d5..936eff697f4 100644 --- a/tokio/src/runtime/scheduler/multi_thread/worker.rs +++ b/tokio/src/runtime/scheduler/multi_thread/worker.rs @@ -67,6 +67,7 @@ use crate::runtime::{ use crate::util::atomic_cell::AtomicCell; use crate::util::rand::{FastRand, RngSeedGenerator}; +use crate::runtime::builder::MultiThreadFlavor; use std::cell::RefCell; use std::time::Duration; @@ -95,7 +96,7 @@ struct Core { lifo_slot: Option, /// The worker-local run queue. - run_queue: queue::Local>, + run_queue: Box> + Send + Sync>, /// True if the worker is currently searching for more work. Searching /// involves attempting to steal from other workers. @@ -153,7 +154,7 @@ pub(super) struct Shared { /// Used to communicate with a worker from other threads. struct Remote { /// Steals tasks from this worker. 
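+    /// Boxed as a trait object so a worker can be backed by either the default queue or,
+    /// behind the `bwos` feature, the BWoS queue, selected via `MultiThreadFlavor`.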
- steal: queue::Steal>, + steal: Box> + Send + Sync>, /// Unparks the associated worker thread unpark: Unparker, @@ -187,6 +188,7 @@ scoped_thread_local!(static CURRENT: Context); pub(super) fn create( size: usize, + flavor: MultiThreadFlavor, park: Parker, driver_handle: driver::Handle, blocking_spawner: blocking::Spawner, @@ -199,7 +201,7 @@ pub(super) fn create( // Create the local queues for _ in 0..size { - let (steal, run_queue) = queue::local(); + let (steal, run_queue) = queue::local(flavor); let park = park.clone(); let unpark = park.unpark(); @@ -636,7 +638,7 @@ impl Core { let target = &worker.handle.shared.remotes[i]; if let Some(task) = target .steal - .steal_into(&mut self.run_queue, &mut self.metrics) + .steal_into(&mut *self.run_queue, &mut self.metrics) { return Some(task); } diff --git a/tokio/src/runtime/tests/queue.rs b/tokio/src/runtime/tests/queue.rs index ac80fa7332f..ec059d2ebb3 100644 --- a/tokio/src/runtime/tests/queue.rs +++ b/tokio/src/runtime/tests/queue.rs @@ -2,6 +2,7 @@ use crate::runtime::scheduler::multi_thread::queue; use crate::runtime::task::{self, Inject, Schedule, Task}; use crate::runtime::MetricsBatch; +use crate::runtime::builder::MultiThreadFlavor; use std::thread; use std::time::Duration; @@ -28,7 +29,7 @@ fn metrics_batch() -> MetricsBatch { #[test] fn fits_256() { - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let mut metrics = metrics_batch(); @@ -48,7 +49,7 @@ fn fits_256() { #[test] fn overflow() { - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let mut metrics = metrics_batch(); @@ -78,8 +79,8 @@ fn overflow() { fn steal_batch() { let mut metrics = metrics_batch(); - let (steal1, mut local1) = queue::local(); - let (_, mut local2) = queue::local(); + let (steal1, mut local1) = queue::local(MultiThreadFlavor::Default); + let (_, mut local2) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); for _ in 0..4 { @@ -87,7 +88,7 @@ fn steal_batch() { local1.push_back(task, &inject, &mut metrics); } - assert!(steal1.steal_into(&mut local2, &mut metrics).is_some()); + assert!(steal1.steal_into(&mut *local2, &mut metrics).is_some()); cfg_metrics! 
{ assert_metrics!(metrics, steal_count == 2); @@ -114,6 +115,8 @@ const fn normal_or_miri(normal: usize, miri: usize) -> usize { } } +// todo: stolen increments by one here, so counting seems incorrect even before our queue +// joins the picture #[test] fn stress1() { const NUM_ITER: usize = 5; @@ -125,16 +128,16 @@ fn stress1() { let mut metrics = metrics_batch(); for _ in 0..NUM_ITER { - let (steal, mut local) = queue::local(); + let (steal, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let th = thread::spawn(move || { let mut metrics = metrics_batch(); - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let mut n = 0; for _ in 0..NUM_STEAL { - if steal.steal_into(&mut local, &mut metrics).is_some() { + if steal.steal_into(&mut *local, &mut metrics).is_some() { n += 1; } @@ -188,16 +191,16 @@ fn stress2() { let mut metrics = metrics_batch(); for _ in 0..NUM_ITER { - let (steal, mut local) = queue::local(); + let (steal, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let th = thread::spawn(move || { let mut stats = metrics_batch(); - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let mut n = 0; for _ in 0..NUM_STEAL { - if steal.steal_into(&mut local, &mut stats).is_some() { + if steal.steal_into(&mut *local, &mut stats).is_some() { n += 1; }