diff --git a/bwosqueue/Cargo.toml b/bwosqueue/Cargo.toml new file mode 100644 index 00000000000..317d375011a --- /dev/null +++ b/bwosqueue/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "bwosqueue" +version = "1.0.0" +edition = "2018" + +[dependencies] +# msrv currently 1.31 -> okay! +crossbeam-utils = { version = "0.8", default-features = false} +# MSRV # 1.51 +array-init = "2.1" + + +[dev-dependencies] +criterion = { version = "0.4.0", features = ["html_reports"] } +core_affinity = "0.7.6" +rand = "0.8.5" +tracing = "0.1.37" + + +[target.'cfg(loom)'.dependencies] +loom = "0.5" + +[features] +default = ["std"] +std = [] +unstable = [] +stats = [] + + +[[bench]] +name = "bench" +harness = false + +[profile.bench] +debug-assertions = false +lto = true +opt-level = 3 diff --git a/bwosqueue/benches/bench.rs b/bwosqueue/benches/bench.rs new file mode 100644 index 00000000000..cbc9b87aa14 --- /dev/null +++ b/bwosqueue/benches/bench.rs @@ -0,0 +1,616 @@ +//! Microbenchmarks to benchmark the BWoS queue and compare to the original queue in tokio. +//! +//! Please note that the tokio queue stores `task::Notified` items, which boils down to a +//! `NonNull` pointer, so we benchmark with a u64 as the queue item. + +use core::sync::atomic::Ordering::{Acquire, Relaxed, Release, SeqCst}; +use std::arch::asm; +use std::sync::atomic::fence; +use std::time::{Duration, Instant}; +use std::{ + sync::{ + atomic::{AtomicBool, AtomicUsize}, + Arc, + }, + thread::{self}, +}; + +#[path = "support/original_tokio_queue.rs"] +mod original_tokio_queue; + +#[path = "support/original_bwos.rs"] +mod original_bwos; + +use bwosqueue::{Owner, Stealer}; +use criterion::{ + black_box, criterion_group, criterion_main, measurement::WallTime, BenchmarkGroup, BenchmarkId, + Criterion, Throughput, +}; + +fn bench_stealing(c: &mut Criterion) { + let mut stealer_group = c.benchmark_group("Stealing"); + let bwos_single = QueueType::BwosStealSingleItems; + let bwos_block = QueueType::BwosStealBlocks; + let tokio_q_single = QueueType::TokioStealSingleItems; + let tokio_q_batch = QueueType::TokioStealHalf; + + bench_steal::<8, 32>(&mut stealer_group, tokio_q_single, 0); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_single, 1); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_single, 2); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_batch, 1); + bench_steal::<8, 32>(&mut stealer_group, tokio_q_batch, 2); + + bench_steal::<8, 32>(&mut stealer_group, bwos_single, 0); + bench_steal::<8, 32>(&mut stealer_group, bwos_single, 1); + bench_steal::<8, 32>(&mut stealer_group, bwos_block, 0); + bench_steal::<8, 32>(&mut stealer_group, bwos_block, 1); +} + +fn simple_enqueue_dequeue(c: &mut Criterion) { + let mut group = c.benchmark_group("Simple Enqueue Dequeue"); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 32 }>(&mut group); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 128 }>(&mut group); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 512 }>(&mut group); + simple_enqueue_dequeue_original_queue_inner::<{ 8 * 1024 }>(&mut group); + simple_enqueue_dequeue_inner::<8, 32>(&mut group); + simple_enqueue_dequeue_inner::<8, 128>(&mut group); + simple_enqueue_dequeue_inner::<8, 512>(&mut group); + simple_enqueue_dequeue_inner::<8, 1024>(&mut group); + simple_enqueue_dequeue_inner::<32, 8>(&mut group); + simple_enqueue_dequeue_inner::<128, 2>(&mut group); + simple_enqueue_dequeue_inner::<256, 1>(&mut group); + simple_enqueue_dequeue_original_bwos_queue(&mut group); +} + +#[inline(never)] +fn bwos_enq_deq(owner: &mut 
bwosqueue::Owner) { + while owner.enqueue(black_box(5)).is_ok() {} + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(black_box(val), 5_u64); + } else { + break; + }; + } +} + +fn simple_enqueue_dequeue_inner( + group: &mut BenchmarkGroup, +) { + let (mut owner, _) = bwosqueue::new::(); + let num_elements = NE; + + group.throughput(Throughput::Elements((NB * NE * 2) as u64)); + group.bench_with_input( + BenchmarkId::new( + format!("BWoS {NE} Elems per Block"), + format!("{} Total size", NB * NE), + ), + &num_elements, + |b, _num_elements| { + b.iter(|| { + bwos_enq_deq(&mut owner); + }); + #[cfg(feature = "stats")] + assert!(!owner.can_consume()) + }, + ); +} + +fn simple_enqueue_dequeue_original_queue_inner( + group: &mut BenchmarkGroup, +) { + let (_, mut owner) = original_tokio_queue::local::(); + group.throughput(Throughput::Elements((SIZE * 2) as u64)); + // todo: we could do a binary search here by doing dry runs to determine how much + // idle time we need to reach a certain stealing percentage + group.bench_with_input( + BenchmarkId::new("Original tokio queue", format!("{SIZE} Total size")), + &SIZE, + |b, _num_elements| { + b.iter(|| { + while owner.push_back(black_box(5)).is_ok() {} + loop { + if let Some(val) = owner.pop() { + assert_eq!(black_box(val), 5_u64); + } else { + break; + }; + } + }); + assert!(!owner.has_tasks()) + }, + ); + + // one full enqueue + one full dequeue +} + +#[inline(never)] +fn simple_enqueue_dequeue_original_bwos_queue(group: &mut BenchmarkGroup) { + #[inline(never)] + fn enq_deq(prod: &mut original_bwos::Producer, cons: &mut original_bwos::Consumer) { + while prod.enqueue(black_box(5)) {} + loop { + if let Some(val) = cons.dequeue() { + assert_eq!(black_box(val), 5_u64); + } else { + break; + }; + } + } + let (mut prod, mut cons, _stealer) = original_bwos::new(); + + const SIZE: u64 = (1024 * 8) as u64; + group.throughput(Throughput::Elements(SIZE * 2)); + group.bench_with_input( + BenchmarkId::new("Unsafe Rust: enq-deq", format!("{SIZE} Total size")), + &SIZE, + |b, _num_elements| { + b.iter(|| { + enq_deq(&mut prod, &mut cons); + }); + // use owner outside of iter to control drop + assert!(cons.dequeue().is_none()) + }, + ); + + // one full enqueue + one full dequeue +} + +enum StealKind { + BwosSingleSteal(Stealer), + BwosBlockSteal(Stealer), + // Just use 8K for the tokio queue, since the size doesn't really matter for this queue and we can't + // use generic const expressions here. + TokioSingleSteal(original_tokio_queue::Steal), + TokioBatchSteal(original_tokio_queue::Steal), + + // This version also currently has 8K hardcoded, and it's not really worth it to modify it, since it is + // only used as a baseline for benchmarks. + BwosUnsafeSingleSteal(original_bwos::Stealer), +} + +struct StealTest { + owner: QueueOwner, + stealer: StealKind, + params: StealTestParams, +} + +#[derive(Clone)] +struct StealTestParams { + num_stealers: usize, + num_ready_stealers: Arc, + start: Arc, + stop: Arc, + stealer_idle_ops: usize, +} + +fn bwos_steal_block_thread( + stealer: Stealer, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut idle_iterations: u64 = 0; + loop { + if let Some(stolen_iter) = stealer.steal_block() { + for element in stolen_iter { + assert_eq!(element, 5); + } + // Stealing should be a "rare" operation, so configure the stealing frequency by waiting for + // a while after a successfully stolen item. 
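+ // The `nop` busy-wait below throttles the stealer without touching the queue or
+ // yielding to the OS scheduler, so the idle time stays roughly constant per steal.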
+ for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + // Only check the atomic variable once in a while to reduce the overhead. + idle_iterations = idle_iterations.wrapping_add(1); + if (idle_iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn bwos_steal_single_item_thread( + stealer: Stealer, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if let Some(val) = stealer.steal() { + assert_eq!(val, 5); + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn bwos_unsafe_steal_single_item_thread( + mut stealer: original_bwos::Stealer, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if let Some(val) = stealer.steal() { + assert_eq!(val, 5); + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn tokio_q_steal_block_thread( + stealer: original_tokio_queue::Steal, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if stealer.bench_tokio_q_steal(1024) != 0 { + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +fn tokio_q_steal_single_item_thread( + stealer: original_tokio_queue::Steal, + params: StealTestParams, +) { + params.num_ready_stealers.fetch_add(1, Release); + let mut iterations: u64 = 0; + loop { + if let Some(val) = stealer.bench_tokio_steal_single() { + assert_eq!(val, 5); + for _ in 0..params.stealer_idle_ops { + unsafe { asm!("nop") }; + } + } else { + iterations = iterations.wrapping_add(1); + if (iterations % 1024) == 0 { + if params.stop.load(Relaxed) { + params.num_ready_stealers.fetch_sub(1, SeqCst); + return (); + } + } + } + } +} + +/// Sets up stealers to only steals items, without enqueuing them into a different queue. +/// This allows us to measure only the overhead of the stealing operation, without +/// any side effects from an enqueue into a different queue. +fn setup_stealers(steal_test: &StealTest) { + let params = &steal_test.params; + // ensure any remaining stealer threads from previous run have shutdown. 
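+ // Each stealer thread decrements `num_ready_stealers` when it observes `stop`, so
+ // spinning until the counter reaches zero guarantees no stale stealer is still active.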
+ while params.num_ready_stealers.load(SeqCst) != 0 {} + params.stop.store(false, SeqCst); + params.start.store(0, SeqCst); + + assert_eq!(params.num_ready_stealers.load(SeqCst), 0); + for _ in 0..params.num_stealers { + let l_params = params.clone(); + + match &steal_test.stealer { + StealKind::BwosSingleSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| bwos_steal_single_item_thread(l_stealer, l_params)); + } + StealKind::BwosBlockSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| bwos_steal_block_thread(l_stealer, l_params)); + } + StealKind::TokioSingleSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| tokio_q_steal_single_item_thread(l_stealer, l_params)); + } + StealKind::TokioBatchSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| tokio_q_steal_block_thread(l_stealer, l_params)); + } + StealKind::BwosUnsafeSingleSteal(stealer) => { + let l_stealer = stealer.clone(); + thread::spawn(|| bwos_unsafe_steal_single_item_thread(l_stealer, l_params)); + } + } + } + while params.num_ready_stealers.load(Acquire) != params.num_stealers {} +} + +// Owner thread implementation which enqueues for a configurable amount of items +// as fast as possible, dequeuing until empty once the queue is full. +fn bwos_owner_thread( + owner: &mut Owner, + num_enqueues: u64, + total_enqueues: &mut u64, + total_dequeues: &mut u64, +) -> Duration { + let mut enq_count: u64 = 0; + let mut deq_count: u64 = 0; + let start = Instant::now(); + + while enq_count < num_enqueues { + while owner.enqueue(black_box(5)).is_ok() { + enq_count += 1; + if enq_count >= num_enqueues { + break; + } + } + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(black_box(val), 5); + deq_count += 1; + } else { + break; + }; + } + } + // This adds some additional overhead even with 0 stealers compared to the simple enq/deq benchmark. + while owner.has_stealers() {} + let duration = start.elapsed(); + + debug_assert!( + enq_count >= deq_count, + "enq: {}, deq: {}", + enq_count, + deq_count + ); + *total_enqueues += enq_count; + *total_dequeues += deq_count; + duration +} + +fn original_bwos_owner_thread( + producer: &mut original_bwos::Producer, + consumer: &mut original_bwos::Consumer, + num_enqueues: u64, + total_enqueues: &mut u64, + total_dequeues: &mut u64, +) -> Duration { + let mut enq_count: u64 = 0; + let mut deq_count: u64 = 0; + let start = Instant::now(); + + while enq_count < num_enqueues { + while producer.enqueue(black_box(5)) { + enq_count += 1; + if enq_count >= num_enqueues { + break; + } + } + loop { + if let Some(val) = consumer.dequeue() { + assert_eq!(black_box(val), 5); + deq_count += 1; + } else { + break; + }; + } + } + // No implementation to check for stealers, so just skip this here. 
+ //while owner.has_stealers() {} + let duration = start.elapsed(); + + debug_assert!( + enq_count >= deq_count, + "enq: {}, deq: {}", + enq_count, + deq_count + ); + *total_enqueues += enq_count; + *total_dequeues += deq_count; + duration +} + +fn tokio_q_owner_thread( + owner: &mut original_tokio_queue::Local, + num_enqueues: u64, + total_enqueues: &mut u64, + total_dequeues: &mut u64, +) -> Duration { + let mut enq_count: u64 = 0; + let mut deq_count: u64 = 0; + let start = Instant::now(); + + while enq_count < num_enqueues { + while owner.push_back(black_box(5)).is_ok() { + enq_count += 1; + if enq_count >= num_enqueues { + break; + } + } + loop { + if let Some(val) = owner.pop() { + assert_eq!(black_box(val), 5_u64); + deq_count += 1; + } else { + break; + }; + } + } + while owner.has_stealers() {} + + let duration = start.elapsed(); + + debug_assert!( + enq_count >= deq_count, + "enq: {}, deq: {}", + enq_count, + deq_count + ); + *total_enqueues += enq_count; + *total_dequeues += deq_count; + duration +} + +#[derive(Copy, Clone, Debug)] +enum QueueType { + BwosStealSingleItems, + BwosStealBlocks, + TokioStealSingleItems, + // Default tokio configuration + TokioStealHalf, + BwosUnsafe, +} + +enum QueueOwner { + Bwos(bwosqueue::Owner), + Tokio(original_tokio_queue::Local), + BwosUnsafe((original_bwos::Producer, original_bwos::Consumer)), +} + +fn bench_steal( + group: &mut BenchmarkGroup, + queue_type: QueueType, + num_stealers: usize, +) { + let setup_params = StealTestParams { + num_stealers, + num_ready_stealers: Arc::new(AtomicUsize::new(0)), + stealer_idle_ops: 5000, + start: Arc::new(AtomicUsize::new(0)), + stop: Arc::new(AtomicBool::new(false)), + }; + let mut test_configuration = match queue_type { + QueueType::BwosStealBlocks => { + let (owner, stealer) = bwosqueue::new::(); + StealTest { + owner: QueueOwner::Bwos(owner), + stealer: StealKind::BwosBlockSteal(stealer), + params: setup_params, + } + } + QueueType::BwosStealSingleItems => { + let (owner, stealer) = bwosqueue::new::(); + StealTest { + owner: QueueOwner::Bwos(owner), + stealer: StealKind::BwosSingleSteal(stealer), + params: setup_params, + } + } + QueueType::TokioStealSingleItems => { + let (stealer, owner) = original_tokio_queue::local(); + StealTest { + owner: QueueOwner::Tokio(owner), + stealer: StealKind::TokioSingleSteal(stealer), + params: setup_params, + } + } + QueueType::TokioStealHalf => { + let (stealer, owner) = original_tokio_queue::local(); + StealTest { + owner: QueueOwner::Tokio(owner), + stealer: StealKind::TokioBatchSteal(stealer), + params: setup_params, + } + } + QueueType::BwosUnsafe => { + let (producer, consumer, stealer) = original_bwos::new(); + StealTest { + owner: QueueOwner::BwosUnsafe((producer, consumer)), + stealer: StealKind::BwosUnsafeSingleSteal(stealer), + params: setup_params, + } + } + }; + + let enqueue_iterations = 1; + // One enqueue + One dequeue is the base throughput. This is scaled up by the number of iterations + // criterion determines and additional enqueue_iterations, since otherwise the time span is too short + group.throughput(Throughput::Elements(enqueue_iterations * 2)); + let mut total_enqueues: u64 = 0; + let mut total_dequeues: u64 = 0; + group.bench_with_input( + // todo: precalculate expected stealing percentage and make that the parameter! 
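+ // For now the benchmark parameter only encodes the stealer count; the measured steal
+ // percentage is printed via `eprintln!` after the run instead.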
+ BenchmarkId::new( + format!("{queue_type:?}"), + format!("{num_stealers} stealers"), + ), + &(), + |b, _| { + setup_stealers(&test_configuration); + test_configuration.params.start.store(1, Release); + fence(SeqCst); + match &mut test_configuration.owner { + QueueOwner::Bwos(owner) => { + b.iter_custom(|num_iters| { + bwos_owner_thread( + owner, + num_iters * enqueue_iterations, + &mut total_enqueues, + &mut total_dequeues, + ) + }); + } + QueueOwner::BwosUnsafe((producer, consumer)) => { + b.iter_custom(|num_iters| { + original_bwos_owner_thread( + producer, + consumer, + num_iters * enqueue_iterations, + &mut total_enqueues, + &mut total_dequeues, + ) + }); + } + QueueOwner::Tokio(owner) => { + b.iter_custom(|num_iters| { + tokio_q_owner_thread( + owner, + num_iters * enqueue_iterations, + &mut total_enqueues, + &mut total_dequeues, + ) + }); + } + } + + test_configuration.params.stop.store(true, Relaxed); + test_configuration.params.start.store(2, Release); + }, + ); + let steal_percentage = if total_enqueues == total_dequeues { + "0%".to_string() + } else { + let p = ((total_enqueues - total_dequeues) as f64 / total_enqueues as f64) * 100.0; + format!("{:.1}%", p) + }; + eprintln!("Steal percentage: {steal_percentage}"); +} + +criterion_group! {name = benches; +config = Criterion::default(); +targets = simple_enqueue_dequeue, bench_stealing} +criterion_main!(benches); diff --git a/bwosqueue/benches/support/loom/atomic_u16.rs b/bwosqueue/benches/support/loom/atomic_u16.rs new file mode 100644 index 00000000000..c1c531208c2 --- /dev/null +++ b/bwosqueue/benches/support/loom/atomic_u16.rs @@ -0,0 +1,44 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU16` providing an additional `load_unsync` function. +pub(crate) struct AtomicU16 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU16 {} +unsafe impl Sync for AtomicU16 {} + +impl AtomicU16 { + pub(crate) const fn new(val: u16) -> AtomicU16 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU16::new(val)); + AtomicU16 { inner } + } + + /// Performs an unsynchronized load. + /// + /// # Safety + /// + /// All mutations must have happened before the unsynchronized load. + /// Additionally, there must be no concurrent mutations. + pub(crate) unsafe fn unsync_load(&self) -> u16 { + *(*self.inner.get()).get_mut() + } +} + +impl Deref for AtomicU16 { + type Target = std::sync::atomic::AtomicU16; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU16 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/benches/support/loom/atomic_u32.rs b/bwosqueue/benches/support/loom/atomic_u32.rs new file mode 100644 index 00000000000..61f95fb30ce --- /dev/null +++ b/bwosqueue/benches/support/loom/atomic_u32.rs @@ -0,0 +1,34 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU32` providing an additional `load_unsync` function. 
+pub(crate) struct AtomicU32 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU32 {} +unsafe impl Sync for AtomicU32 {} + +impl AtomicU32 { + pub(crate) const fn new(val: u32) -> AtomicU32 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU32::new(val)); + AtomicU32 { inner } + } +} + +impl Deref for AtomicU32 { + type Target = std::sync::atomic::AtomicU32; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU32 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/benches/support/loom/mod.rs b/bwosqueue/benches/support/loom/mod.rs new file mode 100644 index 00000000000..36c46375602 --- /dev/null +++ b/bwosqueue/benches/support/loom/mod.rs @@ -0,0 +1,3 @@ +pub mod atomic_u16; +pub mod atomic_u32; +pub mod unsafe_cell; diff --git a/bwosqueue/benches/support/loom/unsafe_cell.rs b/bwosqueue/benches/support/loom/unsafe_cell.rs new file mode 100644 index 00000000000..66c1d7943e0 --- /dev/null +++ b/bwosqueue/benches/support/loom/unsafe_cell.rs @@ -0,0 +1,16 @@ +#[derive(Debug)] +pub(crate) struct UnsafeCell(std::cell::UnsafeCell); + +impl UnsafeCell { + pub(crate) const fn new(data: T) -> UnsafeCell { + UnsafeCell(std::cell::UnsafeCell::new(data)) + } + + pub(crate) fn with(&self, f: impl FnOnce(*const T) -> R) -> R { + f(self.0.get()) + } + + pub(crate) fn with_mut(&self, f: impl FnOnce(*mut T) -> R) -> R { + f(self.0.get()) + } +} diff --git a/bwosqueue/benches/support/original_bwos.rs b/bwosqueue/benches/support/original_bwos.rs new file mode 100644 index 00000000000..921d76b8409 --- /dev/null +++ b/bwosqueue/benches/support/original_bwos.rs @@ -0,0 +1,368 @@ +#![allow(dead_code)] +use array_init::array_init; +use crossbeam_utils::CachePadded; +use std::cell::UnsafeCell; +use std::cmp::max; +use std::marker::{Send, Sync}; +use std::mem::MaybeUninit; +use std::ptr::null_mut; +use std::sync::atomic::AtomicU64; +use std::sync::atomic::Ordering::{Acquire, Relaxed, Release, SeqCst}; +use std::sync::Arc; + +const NB: usize = 8; +const NE: usize = 1024; +const NB_LOG: usize = 3; +const NE_LOG: usize = 11; + +#[inline(always)] +fn wsq_global_idx(v: u64) -> u64 { + return v & ((1 << NB_LOG) - 1); +} + +#[inline(always)] +fn wsq_local_idx(v: u64) -> u64 { + return v & ((1 << NE_LOG) - 1); +} + +#[inline(always)] +fn wsq_local_vsn(v: u64) -> u64 { + return v >> NE_LOG; +} + +#[inline(always)] +fn wsq_local_compose(h: u64, l: u64) -> u64 { + return (h << NE_LOG) | l; +} + +#[inline(always)] +fn advance(v: &AtomicU64, old_v: u64) { + let _ = v.compare_exchange_weak(old_v, old_v + 1, Relaxed, Relaxed); +} + +struct BlockConfig { + beginning: u8, + prev: *mut Block, + next: *mut Block, +} + +struct Block { + /// producer + committed: CachePadded, + /// consumer + consumed: CachePadded, + /// stealer-head + reserved: CachePadded, + /// stealer-tail + stealed: CachePadded, + conf: CachePadded>, + entries: CachePadded; NE]>>, +} + +struct BwsQueue { + pcache: CachePadded<*mut Block>, + spos: CachePadded, + ccache: CachePadded<*mut Block>, + blocks: CachePadded<[UnsafeCell>; NB]>, +} + +unsafe impl Send for BwsQueue {} +unsafe impl Sync for BwsQueue {} + +impl BlockConfig { + fn new(idx: usize) -> BlockConfig { + BlockConfig { + beginning: if idx == 0 { 1 } else { 0 }, + prev: null_mut(), + next: null_mut(), + } + } +} + +impl Block { + fn new(idx: usize) -> Block { 
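+ // Block 0 starts at version 1 with empty producer/consumer cursors (index 0) and
+ // exhausted stealer cursors (index NE). All other blocks start at version 0 with every
+ // cursor at NE, i.e. fully used up, so the producer must first advance their version.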
+ let empty_val: u64 = if idx != 0 { + NE as u64 + } else { + wsq_local_compose(1, 0) + }; + let full_val: u64 = if idx != 0 { + NE as u64 + } else { + wsq_local_compose(1, NE as u64) + }; + Block { + committed: CachePadded::new(AtomicU64::new(empty_val)), + consumed: CachePadded::new(AtomicU64::new(empty_val)), + reserved: CachePadded::new(AtomicU64::new(full_val)), + stealed: CachePadded::new(AtomicU64::new(full_val)), + conf: CachePadded::new(BlockConfig::::new(idx)), + entries: CachePadded::new(MaybeUninit::uninit()), + } + } + + #[inline(always)] + fn is_consumed(&mut self, vsn: u64) -> bool { + let consumed: u64 = self.consumed.load(SeqCst); + return (wsq_local_idx(consumed) == NE as u64 && wsq_local_vsn(consumed) == vsn) + || wsq_local_vsn(consumed) > vsn; + } + + #[inline(always)] + fn is_stealed(&mut self) -> bool { + let stealed: u64 = self.stealed.load(SeqCst); + return wsq_local_idx(stealed) == NE as u64; + } +} + +impl BwsQueue { + fn new() -> BwsQueue { + BwsQueue { + pcache: CachePadded::new(null_mut()), + spos: CachePadded::new(AtomicU64::new(0)), + ccache: CachePadded::new(null_mut()), + blocks: CachePadded::new(array_init(|idx| UnsafeCell::new(Block::new(idx)))), + } + } + + #[inline(always)] + #[allow(dead_code)] + fn is_empty(&self) -> bool { + unsafe { + /* fast check */ + let blk = self.ccache.into_inner(); + let consumed: u64 = (*blk).consumed.load(SeqCst); + let committed: u64 = (*blk).committed.load(SeqCst); + return committed == consumed; + } + } +} + +pub struct Producer { + queue: Arc>, +} + +impl Clone for Producer { + fn clone(&self) -> Self { + Producer { + queue: self.queue.clone(), + } + } +} + +impl Producer { + #[inline(always)] + pub fn enqueue(&mut self, t: E) -> bool { + unsafe { + loop { + /* get the address of the alloc block */ + let blk: *mut Block = self.queue.pcache.into_inner(); + + /* precheck once */ + let committed: u64 = (*blk).committed.load(Relaxed); + let committed_idx: u64 = wsq_local_idx(committed); + + /* if out of bound, we don't add the space, but help to move the block */ + if committed_idx < NE as u64 { + /* copy the data into the entry and commit it */ + std::ptr::write( + (*(*blk).entries.as_mut_ptr())[committed_idx as usize].get(), + t, + ); + (*blk).committed.store(committed + 1, Release); + return true; + } + + /* slow path, all writers help to move to next block */ + let nblk: *mut Block = (*blk).conf.next; + let next_vsn: u64 = wsq_local_vsn(committed) + (*nblk).conf.beginning as u64; + + /* check if next block is ready */ + if !(*nblk).is_consumed(next_vsn - 1) { + return false; + }; + if !(*nblk).is_stealed() { + return false; + }; + + /* reset cursor and advance block */ + let new_cursor: u64 = wsq_local_compose(next_vsn, 0); + (*nblk).committed.store(new_cursor, Relaxed); + (*nblk).stealed.store(new_cursor, Relaxed); + (*nblk).reserved.store(new_cursor, Release); + let q: *mut BwsQueue = Arc::as_ptr(&self.queue) as *mut _; + (*q).pcache = CachePadded::new(nblk); + } + } + } +} + +pub struct Consumer { + queue: Arc>, +} + +impl Clone for Consumer { + fn clone(&self) -> Self { + Consumer { + queue: self.queue.clone(), + } + } +} + +impl Consumer { + #[inline(always)] + pub fn dequeue(&mut self) -> Option { + unsafe { + loop { + /* get the current block */ + let blk: *mut Block = self.queue.ccache.into_inner(); + + /* check if the block is fully consumed */ + let consumed: u64 = (*blk).consumed.load(Relaxed); + let consumed_idx: u64 = wsq_local_idx(consumed); + + if consumed_idx < NE as u64 { + /* check if we have an entry 
to occupy */ + let committed: u64 = (*blk).committed.load(Relaxed); + let committed_idx: u64 = wsq_local_idx(committed); + if consumed_idx == committed_idx { + return None; + } + + /* we got the entry */ + let t = + std::ptr::read((*(*blk).entries.as_mut_ptr())[consumed_idx as usize].get()); + (*blk).consumed.store(consumed + 1, Relaxed); + return Some(t); + } + + /* r_head never pass the w_head and r_tail */ + let nblk: *mut Block = (*blk).conf.next; + let next_cons_vsn: u64 = wsq_local_vsn(consumed) + (*nblk).conf.beginning as u64; + let next_steal_vsn: u64 = wsq_local_vsn((*nblk).reserved.load(Relaxed)); + if next_steal_vsn != next_cons_vsn { + return None; + } + + /* stop stealers */ + let reserved_new: u64 = wsq_local_compose(next_cons_vsn, NE as u64); + let reserved_old: u64 = (*nblk).reserved.swap(reserved_new, Relaxed); + + /* pre-steal reserved */ + let reserved_idx: u64 = wsq_local_idx(reserved_old); + let pre_stealed: u64 = max(0, NE as u64 - reserved_idx); + (*nblk).stealed.fetch_add(pre_stealed, Relaxed); + + /* advance the block and try again */ + let new_cursor: u64 = wsq_local_compose(next_cons_vsn, reserved_idx); + (*nblk).consumed.store(new_cursor, Relaxed); + let q: *mut BwsQueue = Arc::as_ptr(&self.queue) as *mut _; + (*q).ccache = CachePadded::new(nblk); + } + } + } +} + +pub struct Stealer { + queue: Arc>, +} + +impl Clone for Stealer { + fn clone(&self) -> Self { + Stealer { + queue: self.queue.clone(), + } + } +} + +impl Stealer { + #[inline(always)] + pub fn steal(&mut self) -> Option { + unsafe { + loop { + /* get the address of the steal block */ + let spos: u64 = self.queue.spos.load(Relaxed); + let bidx: usize = wsq_global_idx(spos) as usize; + let blk: *mut Block = self.queue.blocks[bidx].get(); + + /* check if the block is fully reserved */ + let reserved: u64 = (*blk).reserved.load(Acquire); + let reserved_idx: u64 = wsq_local_idx(reserved); + + if reserved_idx < NE as u64 { + /* check if we have an entry to occupy */ + let committed: u64 = (*blk).committed.load(Acquire); + let committed_idx: u64 = wsq_local_idx(committed); + if reserved_idx == committed_idx { + return None; + } + + if !(*blk) + .reserved + .compare_exchange_weak(reserved, reserved + 1, Release, Relaxed) + .is_ok() + { + return None; + } + + /* we got the entry */ + let t = + std::ptr::read((*(*blk).entries.as_mut_ptr())[reserved_idx as usize].get()); + (*blk).stealed.fetch_add(1, Release); + return Some(t); + } + + /* r_head never pass the w_head and r_tail */ + let nblk: *mut Block = (*blk).conf.next; + let next_except_vsn: u64 = wsq_local_vsn(reserved) + (*nblk).conf.beginning as u64; + let next_actual_vsn: u64 = wsq_local_vsn((*nblk).reserved.load(Relaxed)); + if next_except_vsn != next_actual_vsn { + return None; + } + + /* reset cursor and advance block */ + advance(&self.queue.spos, spos); + } + } + } +} + +pub fn new() -> (Producer, Consumer, Stealer) { + let qa = Arc::new(BwsQueue::::new()); + + let mut blk_start: *mut Block = null_mut(); + let mut blk_pre: *mut Block = null_mut(); + let mut blk: *mut Block; + + for idx in 0..NB { + blk = qa.blocks[idx].get(); + if blk_start.is_null() { + blk_start = blk; + } else { + unsafe { + (*blk_pre).conf.next = blk; + (*blk).conf.prev = blk_pre; + } + } + blk_pre = blk; + if idx == NB - 1 { + unsafe { + (*blk).conf.next = blk_start; + (*blk_start).conf.prev = blk; + } + } + } + unsafe { + let q: *mut BwsQueue = Arc::as_ptr(&qa) as *mut _; + (*q).pcache = CachePadded::new(blk_start); + (*q).ccache = CachePadded::new(blk_start); + } + + let qb 
= qa.clone(); + let qc = qa.clone(); + ( + Producer { queue: qa }, + Consumer { queue: qb }, + Stealer { queue: qc }, + ) +} diff --git a/bwosqueue/benches/support/original_tokio_queue.rs b/bwosqueue/benches/support/original_tokio_queue.rs new file mode 100644 index 00000000000..404e8858e78 --- /dev/null +++ b/bwosqueue/benches/support/original_tokio_queue.rs @@ -0,0 +1,623 @@ +// This is the original tokio queue, modified slightly to use const generics for a configurable queue size +#![allow(dead_code)] + +//! Run-queue structures to support a work-stealing scheduler + +use loom::atomic_u16::AtomicU16; +use loom::atomic_u32::AtomicU32; +use loom::unsafe_cell::UnsafeCell; +use std::sync::Arc; + +use std::mem::MaybeUninit; +use std::ptr; +use std::sync::atomic::Ordering::{AcqRel, Acquire, Release}; + +mod loom; + +/// Producer handle. May only be used from a single thread. +pub(crate) struct Local { + inner: Arc>, +} + +/// Consumer handle. May be used from many threads. +pub(crate) struct Steal(Arc>); + +struct Inner { + /// Concurrently updated by many threads. + /// + /// Contains two `u16` values. The LSB byte is the "real" head of the queue. + /// The `u16` in the MSB is set by a stealer in process of stealing values. + /// It represents the first value being stolen in the batch. `u16` is used + /// in order to distinguish between `head == tail` and `head == tail - + /// capacity`. + /// + /// When both `u16` values are the same, there is no active stealer. + /// + /// Tracking an in-progress stealer prevents a wrapping scenario. + head: AtomicU32, + + /// Only updated by producer thread but read by many threads. + tail: AtomicU16, + + /// Elements + buffer: Box<[UnsafeCell>; N]>, +} + +unsafe impl Send for Inner {} +unsafe impl Sync for Inner {} + +fn make_mask(queue_size: usize) -> usize { + assert!(queue_size.is_power_of_two()); + queue_size - 1 +} + +// Constructing the fixed size array directly is very awkward. The only way to +// do it is to repeat `UnsafeCell::new(MaybeUninit::uninit())` 256 times, as +// the contents are not Copy. The trick with defining a const doesn't work for +// generic types. +fn make_fixed_size(buffer: Box<[T]>) -> Box<[T; N]> { + assert_eq!(buffer.len(), N); + + // safety: We check that the length is correct. + unsafe { Box::from_raw(Box::into_raw(buffer).cast()) } +} + +/// Create a new local run-queue +pub(crate) fn local() -> (Steal, Local) { + let mut buffer = Vec::with_capacity(N); + + for _ in 0..N { + buffer.push(UnsafeCell::new(MaybeUninit::uninit())); + } + + let inner = Arc::new(Inner { + head: AtomicU32::new(0), + tail: AtomicU16::new(0), + buffer: make_fixed_size(buffer.into_boxed_slice()), + }); + + let local = Local { + inner: inner.clone(), + }; + + let remote = Steal(inner); + + (remote, local) +} + +impl Local { + /// Returns true if the queue has entries that can be stolen. + pub(crate) fn is_stealable(&self) -> bool { + !self.inner.is_empty() + } + + /// BwoS bench: Exposed this check from steal_into2 as public to use in the benchmark. + pub(crate) fn has_stealers(&self) -> bool { + let prev_packed = self.inner.head.load(Acquire); + let (src_head_steal, src_head_real) = unpack(prev_packed); + // If these two do not match, another thread is concurrently + // stealing from the queue. 
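+ // (`head` packs two u16 halves, see `unpack`: the low half is the real head, the
+ // high half is set while a batch steal is in flight.)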
+ src_head_steal != src_head_real + } + + /// Returns false if there are any entries in the queue + /// + /// Separate to is_stealable so that refactors of is_stealable to "protect" + /// some tasks from stealing won't affect this + pub(crate) fn has_tasks(&self) -> bool { + !self.inner.is_empty() + } + + /// Pushes a task to the back of the local queue, skipping the LIFO slot. + pub(crate) fn push_back(&mut self, task: T) -> Result<(), T> { + let tail = loop { + let head = self.inner.head.load(Acquire); + let (steal, _real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if tail.wrapping_sub(steal) < N as u16 { + // There is capacity for the task + break tail; + } else { + // Concurrently stealing, this will free up capacity, so only + // push the task onto the inject queue + //inject.push(task); + return Err(task); + } // JS: remove push_pverflow case for micro benchmark + }; + + // Map the position to a slot index. + let idx = tail as usize & make_mask(N); + + self.inner.buffer[idx].with_mut(|ptr| { + // Write the task to the slot + // + // Safety: There is only one producer and the above `if` + // condition ensures we don't touch a cell if there is a + // value, thus no consumer. + unsafe { + ptr::write((*ptr).as_mut_ptr(), task); + } + }); + + // Make the task available. Synchronizes with a load in + // `steal_into2`. + self.inner.tail.store(tail.wrapping_add(1), Release); + Ok(()) + } + + // /// Moves a batch of tasks into the inject queue. + // /// + // /// This will temporarily make some of the tasks unavailable to stealers. + // /// Once `push_overflow` is done, a notification is sent out, so if other + // /// workers "missed" some of the tasks during a steal, they will get + // /// another opportunity. + // #[inline(never)] + // fn push_overflow( + // &mut self, + // task: T, + // head: u16, + // tail: u16, + // ) -> Result<(), T> { + // /// How many elements are we taking from the local queue. + // /// + // /// This is one less than the number of tasks pushed to the inject + // /// queue as we are also inserting the `task` argument. + // const NUM_TASKS_TAKEN: u16 = (LOCAL_QUEUE_CAPACITY / 2) as u16; + + // assert_eq!( + // tail.wrapping_sub(head) as usize, + // LOCAL_QUEUE_CAPACITY, + // "queue is not full; tail = {}; head = {}", + // tail, + // head + // ); + + // let prev = pack(head, head); + + // // Claim a bunch of tasks + // // + // // We are claiming the tasks **before** reading them out of the buffer. + // // This is safe because only the **current** thread is able to push new + // // tasks. + // // + // // There isn't really any need for memory ordering... Relaxed would + // // work. This is because all tasks are pushed into the queue from the + // // current thread (or memory has been acquired if the local queue handle + // // moved). + // if self + // .inner + // .head + // .compare_exchange( + // prev, + // pack( + // head.wrapping_add(NUM_TASKS_TAKEN), + // head.wrapping_add(NUM_TASKS_TAKEN), + // ), + // Release, + // Relaxed, + // ) + // .is_err() + // { + // // We failed to claim the tasks, losing the race. Return out of + // // this function and try the full `push` routine again. The queue + // // may not be full anymore. + // return Err(task); + // } + + // /// An iterator that takes elements out of the run queue. 
+ // struct BatchTaskIter<'a, T: 'static> { + // buffer: &'a [UnsafeCell>>; LOCAL_QUEUE_CAPACITY], + // head: u32, + // i: u32, + // } + // impl<'a, T: 'static> Iterator for BatchTaskIter<'a, T> { + // type Item = task::Notified; + + // #[inline] + // fn next(&mut self) -> Option> { + // if self.i == u32::from(NUM_TASKS_TAKEN) { + // None + // } else { + // let i_idx = self.i.wrapping_add(self.head) as usize & MASK; + // let slot = &self.buffer[i_idx]; + + // // safety: Our CAS from before has assumed exclusive ownership + // // of the task pointers in this range. + // let task = slot.with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + // self.i += 1; + // Some(task) + // } + // } + // } + + // // safety: The CAS above ensures that no consumer will look at these + // // values again, and we are the only producer. + // let batch_iter = BatchTaskIter { + // buffer: &*self.inner.buffer, + // head: head as u32, + // i: 0, + // }; + // inject.push_batch(batch_iter.chain(std::iter::once(task))); + + // // Add 1 to factor in the task currently being scheduled. + // metrics.incr_overflow_count(); + + // Ok(()) + // } + + /// Pops a task from the local queue. + pub(crate) fn pop(&mut self) -> Option { + let mut head = self.inner.head.load(Acquire); + + let idx = loop { + let (steal, real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if real == tail { + // queue is empty + return None; + } + + let next_real = real.wrapping_add(1); + + // If `steal == real` there are no concurrent stealers. Both `steal` + // and `real` are updated. + let next = if steal == real { + pack(next_real, next_real) + } else { + assert_ne!(steal, next_real); + pack(steal, next_real) + }; + + // Attempt to claim a task. + let res = self + .inner + .head + .compare_exchange(head, next, AcqRel, Acquire); + + match res { + Ok(_) => break real as usize & make_mask(N), + Err(actual) => head = actual, + } + }; + + Some(self.inner.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) + } +} + +impl Steal { + // BWoS bench: Taken from steal_into2 - Modified to support benchmarking + // only the steal operation without the additional enqueue into `dst`. + // Don't steal more than max_steal items, otherwise the stealer will + // steal 100% of all items and give the consumer in the benchmark no + // chance, so we can't measure consumer/stealer interference + pub fn bench_tokio_q_steal(&self, max_steal: u16) -> u16 { + let mut prev_packed = self.0.head.load(Acquire); + let mut next_packed; + + let n = loop { + let (src_head_steal, src_head_real) = unpack(prev_packed); + let src_tail = self.0.tail.load(Acquire); + + // If these two do not match, another thread is concurrently + // stealing from the queue. + if src_head_steal != src_head_real { + return 0; + } + + // Number of available tasks to steal + let n = src_tail.wrapping_sub(src_head_real); + // Bench BWoS steal at most + let n = core::cmp::min(max_steal, n - n / 2); + + if n == 0 { + // No tasks available to steal + return 0; + } + + // Update the real head index to acquire the tasks. + let steal_to = src_head_real.wrapping_add(n); + assert_ne!(src_head_steal, steal_to); + next_packed = pack(src_head_steal, steal_to); + + // Claim all those tasks. This is done by incrementing the "real" + // head but not the steal. By doing this, no other thread is able to + // steal from this queue until the current thread completes. 
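+ // The CAS below only advances the `real` half of the packed head; the unchanged
+ // `steal` half marks the in-flight steal until the final loop re-packs both halves.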
+ let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => break n, + Err(actual) => prev_packed = actual, + } + }; + + assert!(n <= N as u16 / 2, "actual = {}", n); + + let (first, _) = unpack(next_packed); + + // Take all the tasks + for i in 0..n { + // Compute the positions + let src_pos = first.wrapping_add(i); + + // Map to slots + let src_idx = src_pos as usize & make_mask(N); + + // Read the task + // + // safety: We acquired the task with the atomic exchange above. + let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + // Use the queue entry so the compiler does not optimize the read away. + assert_eq!(task, 5); + } + + let mut prev_packed = next_packed; + + // Update `src_head_steal` to match `src_head_real` signalling that the + // stealing routine is complete. + loop { + let head = unpack(prev_packed).1; + next_packed = pack(head, head); + + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => return n, + Err(actual) => { + let (actual_steal, actual_real) = unpack(actual); + + assert_ne!(actual_steal, actual_real); + + prev_packed = actual; + } + } + } + } +} + +impl Steal { + // BWoS bench + // Based on Local::pop, with modified memory ordering. + pub(crate) fn bench_tokio_steal_single(&self) -> Option { + let mut head = self.0.head.load(Acquire); + + let idx = loop { + let (steal, real) = unpack(head); + + let tail = self.0.tail.load(Acquire); + + if real == tail { + // queue is empty + return None; + } + + let next_real = real.wrapping_add(1); + + // If `steal == real` there are no concurrent stealers. Both `steal` + // and `real` are updated. + let next = if steal == real { + pack(next_real, next_real) + } else { + assert_ne!(steal, next_real); + pack(steal, next_real) + }; + + // Attempt to claim a task. + let res = self.0.head.compare_exchange(head, next, AcqRel, Acquire); + + match res { + Ok(_) => break real as usize & make_mask(N), + Err(actual) => head = actual, + } + }; + + Some(self.0.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) + } + + pub(crate) fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Steals half the tasks from self and place them into `dst`. + pub(crate) fn steal_into(&self, dst: &mut Local) -> Option { + // Safety: the caller is the only thread that mutates `dst.tail` and + // holds a mutable reference. + let dst_tail = unsafe { dst.inner.tail.unsync_load() }; + + // To the caller, `dst` may **look** empty but still have values + // contained in the buffer. If another thread is concurrently stealing + // from `dst` there may not be enough capacity to steal. + let (steal, _) = unpack(dst.inner.head.load(Acquire)); + + if dst_tail.wrapping_sub(steal) > N as u16 / 2 { + // we *could* try to steal less here, but for simplicity, we're just + // going to abort. + return None; + } + + // Steal the tasks into `dst`'s buffer. This does not yet expose the + // tasks in `dst`. + let mut n = self.steal_into2(dst, dst_tail); + + if n == 0 { + // No tasks were stolen + return None; + } + + // We are returning a task here + n -= 1; + + let ret_pos = dst_tail.wrapping_add(n); + let ret_idx = ret_pos as usize & make_mask(N); + + // safety: the value was written as part of `steal_into2` and not + // exposed to stealers, so no other thread can access it. 
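+ // The last stolen task is returned directly to the caller rather than being made
+ // visible in `dst`, which is why `n` was decremented above.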
+ let ret = dst.inner.buffer[ret_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + if n == 0 { + // The `dst` queue is empty, but a single task was stolen + return Some(ret); + } + + // Make the stolen items available to consumers + dst.inner.tail.store(dst_tail.wrapping_add(n), Release); + + Some(ret) + } + + // Steal tasks from `self`, placing them into `dst`. Returns the number of + // tasks that were stolen. + fn steal_into2(&self, dst: &mut Local, dst_tail: u16) -> u16 { + let mut prev_packed = self.0.head.load(Acquire); + let mut next_packed; + + let n = loop { + let (src_head_steal, src_head_real) = unpack(prev_packed); + let src_tail = self.0.tail.load(Acquire); + + // If these two do not match, another thread is concurrently + // stealing from the queue. + if src_head_steal != src_head_real { + return 0; + } + + // Number of available tasks to steal + let n = src_tail.wrapping_sub(src_head_real); + let n = n - n / 2; + + if n == 0 { + // No tasks available to steal + return 0; + } + + // Update the real head index to acquire the tasks. + let steal_to = src_head_real.wrapping_add(n); + assert_ne!(src_head_steal, steal_to); + next_packed = pack(src_head_steal, steal_to); + + // Claim all those tasks. This is done by incrementing the "real" + // head but not the steal. By doing this, no other thread is able to + // steal from this queue until the current thread completes. + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => break n, + Err(actual) => prev_packed = actual, + } + }; + + assert!(n <= N as u16 / 2, "actual = {}", n); + + let (first, _) = unpack(next_packed); + + // Take all the tasks + for i in 0..n { + // Compute the positions + let src_pos = first.wrapping_add(i); + let dst_pos = dst_tail.wrapping_add(i); + + // Map to slots + let src_idx = src_pos as usize & make_mask(N); + let dst_idx = dst_pos as usize & make_mask(N); + + // Read the task + // + // safety: We acquired the task with the atomic exchange above. + let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + // Write the task to the new slot + // + // safety: `dst` queue is empty and we are the only producer to + // this queue. + dst.inner.buffer[dst_idx] + .with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); + } + + let mut prev_packed = next_packed; + + // Update `src_head_steal` to match `src_head_real` signalling that the + // stealing routine is complete. + loop { + let head = unpack(prev_packed).1; + next_packed = pack(head, head); + + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => return n, + Err(actual) => { + let (actual_steal, actual_real) = unpack(actual); + + assert_ne!(actual_steal, actual_real); + + prev_packed = actual; + } + } + } + } +} + +impl Clone for Steal { + fn clone(&self) -> Steal { + Steal(self.0.clone()) + } +} + +impl Drop for Local { + fn drop(&mut self) { + if !std::thread::panicking() { + assert!(self.pop().is_none(), "queue not empty"); + } + } +} + +impl Inner { + fn len(&self) -> u16 { + let (_, head) = unpack(self.head.load(Acquire)); + let tail = self.tail.load(Acquire); + + tail.wrapping_sub(head) + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// Split the head value into the real head and the index a stealer is working +/// on. 
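+/// For example, `unpack(0x0005_0003)` yields `(steal, real) = (5, 3)`, and
+/// `pack(5, 3)` recomposes the same `0x0005_0003` value.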
+fn unpack(n: u32) -> (u16, u16) { + let real = n & u16::MAX as u32; + let steal = n >> 16; + + (steal as u16, real as u16) +} + +/// Join the two head values +fn pack(steal: u16, real: u16) -> u32 { + (real as u32) | ((steal as u32) << 16) +} + +#[test] +fn test_local_queue_capacity() { + assert!(LOCAL_QUEUE_CAPACITY - 1 <= u8::MAX as usize); +} diff --git a/bwosqueue/src/bwos_queue.rs b/bwosqueue/src/bwos_queue.rs new file mode 100644 index 00000000000..65b8fc8d7bb --- /dev/null +++ b/bwosqueue/src/bwos_queue.rs @@ -0,0 +1,204 @@ +use super::metadata::AtomicIndexAndVersion; +use crate::loom::{cell::UnsafeCell, sync::Arc}; +use array_init::array_init; +use core::{marker::PhantomPinned, mem::MaybeUninit, pin::Pin, ptr::null}; +use crossbeam_utils::CachePadded; + +#[cfg(feature = "stats")] +mod bwsstats { + use crate::loom::sync::atomic::{AtomicU64, Ordering::Relaxed}; + use crossbeam_utils::CachePadded; + + pub(crate) struct BwsStats { + owner_counter: CachePadded, + total_stolen: CachePadded, + } + + impl BwsStats { + pub(crate) const fn new() -> Self { + Self { + owner_counter: CachePadded::new(AtomicU64::new(0)), + total_stolen: CachePadded::new(AtomicU64::new(0)), + } + } + + #[inline] + pub(crate) fn increment_enqueued(&self, rhs: usize) { + let curr = self.owner_counter.load(Relaxed); + let new = curr.wrapping_add(rhs as u64); + self.owner_counter.store(new, Relaxed); + } + #[inline] + pub(crate) fn increment_dequeued(&self, rhs: usize) { + let curr = self.owner_counter.load(Relaxed); + let new = curr.wrapping_sub(rhs as u64); + self.owner_counter.store(new, Relaxed); + } + + #[inline] + pub(crate) fn increment_stolen(&self, rhs: usize) { + self.total_stolen.fetch_add(rhs as u64, Relaxed); + } + + /// Returns the _estimated_ number of currently enqueued items. + /// + /// Assumes a maximum of usize items in the queue. + /// + /// Todo: assumes that we enqueue no more than u64::MAX items. This may not be acceptable for tokio. + #[inline] + pub(crate) fn curr_enqueued(&self) -> usize { + let owner_cnt = self.owner_counter.load(Relaxed); + let total_stolen = self.total_stolen.load(Relaxed); + + // We assume the `u64` total numbers will never overflow. + let num = owner_cnt.saturating_sub(total_stolen); + // The maximum queue size is usize::MAX, so this conversion is safe (with the assumption that the u64 + // counters don't overflow) + num as usize + } + } +} + +#[cfg(feature = "stats")] +pub(crate) use bwsstats::*; + +pub(crate) struct BwsQueue { + pub(crate) blocks: CachePadded<[Block; NUM_BLOCKS]>, + #[cfg(feature = "stats")] + pub(crate) stats: CachePadded, + _pin: PhantomPinned, +} + +pub(crate) struct Block { + /// The index and version of the next writable entry in the block + /// + /// index == NE signals that the producer has already fully written this block. + /// `committed` is only written to by the single producer ([Owner](super::Owner)). + pub(crate) committed: CachePadded>, + /// The index and version of the next readable entry in the block + /// + /// If consumed == committed, then there are not items that can be read in this block. + /// `consumed` is only written by the single consumer ([Owner](super::Owner)). + pub(crate) consumed: CachePadded>, + /// stealer-head - We ensure that consumer and stealer are never on same block + pub(crate) reserved: CachePadded>, + /// stealer-tail - stealing finished + pub(crate) stolen: CachePadded>, + /// Block specific configuration, including a reference to the next block in the bwosqueue. 
+ conf: CachePadded>, + /// The storage for all entries in this block + pub(crate) entries: CachePadded<[UnsafeCell>; NE]>, +} + +struct BlockConfig { + /// true if this Block is the HEAD of the queue. + beginning: bool, + /// Blocks are linked together as a linked list via the `next` pointer to speed up accessing + /// the next block. The pointer is fixed, but needs to be initialized after the Block has + /// been put behind a shared reference in pinned memory, since we can't directly initialize + /// and pin memory on the heap. + next: UnsafeCell<*const Block>, +} + +impl BlockConfig { + fn new(idx: usize) -> BlockConfig { + BlockConfig { + beginning: idx == 0, + next: UnsafeCell::new(null()), + } + } +} + +impl Block { + fn new(idx: usize) -> Block { + let is_queue_head = idx == 0; + Block { + committed: CachePadded::new(AtomicIndexAndVersion::new_owner(is_queue_head)), + consumed: CachePadded::new(AtomicIndexAndVersion::new_owner(is_queue_head)), + reserved: CachePadded::new(AtomicIndexAndVersion::new_stealer(is_queue_head)), + stolen: CachePadded::new(AtomicIndexAndVersion::new_stealer(is_queue_head)), + conf: CachePadded::new(BlockConfig::new(idx)), + entries: CachePadded::new(array_init(|_| UnsafeCell::new(MaybeUninit::uninit()))), + } + } + + /// Returns the next Block in the BWoS queue + #[inline(always)] + pub(crate) fn next(&self) -> *const Self { + // SAFETY: The next pointer is static and valid after initialization of the queue for + // the whole lifetime of the queue. + unsafe { self.conf.next.with(|next| *next) } + } + + /// true if this block is the head of the BWoS queue + #[inline(always)] + pub(crate) fn is_head(&self) -> bool { + self.conf.beginning + } +} + +impl + BwsQueue +{ + const _ASSERT_NUM_BLOCKS_POW2: () = assert!(NUM_BLOCKS.is_power_of_two()); + const _ASSERT_NUM_GREATER_1: () = assert!(NUM_BLOCKS > 1); + + pub(crate) fn new() -> Pin> { + // We need to "use" the assertions here, otherwise the compile-time assertions are ignored. + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NUM_BLOCKS_POW2; + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NUM_GREATER_1; + + // First create and pin the queue on the heap + let q = Arc::pin(BwsQueue { + blocks: CachePadded::new(array_init(|idx| Block::new(idx))), + #[cfg(feature = "stats")] + stats: CachePadded::new(BwsStats::new()), + _pin: PhantomPinned, + }); + // Now initialize the fast-path pointers + let blocks: &[Block; NUM_BLOCKS] = &q.blocks; + for block_window in blocks.windows(2) { + // Note: This cannot panic since we asserted at compile-time that BwsQueue has at least + // 2 blocks + let curr_block = block_window.get(0).expect("INVALID_NUM_BLOCKS"); + let next_block = block_window.get(1).expect("INVALID_NUM_BLOCKS"); + // SAFETY: Since our array of blocks is already behind an `Arc` and `Pin`ned we can't + // initialize the pointers with safe code, but we do know that at this point in time + // no concurrent mutable access is possible, since there are no other references. + unsafe { + curr_block.conf.next.with_mut(|next_ptr| { + (*next_ptr) = next_block; + }); + } + } + + let first_block = blocks.first().expect("INVALID_NUM_BLOCKS"); + let last_block = blocks.last().expect("INVALID_NUM_BLOCKS"); + + // SAFETY: There are no other active references to the curr and next block and no + // concurrent access is possible here. 
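+ // Close the ring: the last block's `next` pointer wraps around to the first block.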
+ unsafe { + last_block.conf.next.with_mut(|next_ptr| { + (*next_ptr) = first_block; + }); + } + // Now all fields in the Queue are initialized correctly + q + } + + /// The estimated number of elements currently enqueued. + /// + /// Items which are currently being stolen do not count towards the length, + /// so this method is not suited to determine if the queue is full. + #[cfg(feature = "stats")] + pub(crate) fn estimated_len(&self) -> usize { + self.stats.curr_enqueued() + } + + #[cfg(feature = "stats")] + pub(crate) fn is_empty(&self) -> bool { + self.estimated_len() == 0 + } +} diff --git a/bwosqueue/src/lib.rs b/bwosqueue/src/lib.rs new file mode 100644 index 00000000000..fd803ce686c --- /dev/null +++ b/bwosqueue/src/lib.rs @@ -0,0 +1,790 @@ +//! The BWoS queue is a fast block-based work stealing queue for parallel processing. +//! +//! The BWoS queue is based on the [BBQ] (Block-based Bounded Queue) and is specially designed for the +//! workstealing scenario. Based on the real-world observation that the "stealing" operation is +//! rare and most of the operations are local enqueues and dequeues this queue implementation +//! offers a single [Owner] which can enqueue and dequeue without any heavy synchronization mechanisms +//! on the fast path. Concurrent stealing is possible and does not slow done the Owner too much. +//! This allows stealing policies which steal single items or in small batches. +//! +//! # Queue Semantics +//! +//! - The block-based design reduces the synchronization requirements on the fast-path +//! inside a block and moves the heavy synchronization operations necessary to support +//! multiple stealers to the slow-path when transitioning to the next block. +//! - The producer (enqueue) may not advance to the next block if the consumer or a stealer +//! is still operating on that block. This allows the producer to remove producer-consumer/stealer +//! synchronization from its fast-path operations, but reduces the queue capacity by +//! at most one block. +//! - Stealers may not steal from the same block as the consumer. This allows the consumer +//! to remove consumer-stealer synchronization from its fast-path operations, but means +//! one block is not available for stealing. +//! - Consumers may "take-over" the next block preventing stealers from stealing in that +//! block after the take-over. Stealers will still proceed with already in-progress steal +//! operations in this block. +//! - This queue implementation puts the producer and consumer into a shared Owner struct, +//! +//! # Examples +//! +//! todo +//! +//! [BBQ]: https://www.usenix.org/conference/atc22/presentation/wang-jiawei +//! +//! # Todo: +//! - Instead of const generics we could use a boxed slice for a dynamically sized array. +//! The performance impact be benchmarked though, since this will result in multiple operations +//! not being able to be calculated at compile-time anymore. + +#![deny(unsafe_op_in_unsafe_fn)] +#![warn(unreachable_pub)] + +use core::{ + marker::{Send, Sync}, + pin::Pin, +}; +use crossbeam_utils::CachePadded; +use std::fmt::Formatter; +use std::mem::MaybeUninit; + +mod bwos_queue; +mod loom; +mod metadata; + +use crate::loom::cell::UnsafeCell; +use crate::loom::sync::atomic::{ + AtomicUsize, + Ordering::{Acquire, Relaxed, Release}, +}; +use crate::loom::sync::Arc; +use bwos_queue::{Block, BwsQueue}; +use metadata::{Index, IndexAndVersion}; + +/// The Owner interface to the BWoS queue +/// +/// The owner is both the single producer and single consumer. 
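+///
+/// A minimal usage sketch (illustrative only; it assumes the `bwosqueue::new::<E, NUM_BLOCKS,
+/// ENTRIES_PER_BLOCK>` constructor used by the benchmarks):
+///
+/// ```ignore
+/// // 8 blocks with 32 entries each, storing u64 items.
+/// let (mut owner, stealer) = bwosqueue::new::<u64, 8, 32>();
+/// owner.enqueue(5).expect("queue has free capacity");
+/// assert_eq!(owner.dequeue(), Some(5));
+/// // Stealers take items concurrently, either one at a time or per block.
+/// assert_eq!(stealer.steal(), None); // nothing left to steal here
+/// ```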
+#[repr(align(128))] +pub struct Owner { + /// Producer cache (single producer)- points to block in self.queue. + pcache: CachePadded<*const Block>, + /// Consumer cache (single consumer) - points to block in self.queue. + ccache: CachePadded<*const Block>, + /// Stealer position cache - Allows the owner to quickly check if there are any stealers + spos: CachePadded>, + /// `Arc` to the actual queue to ensure the queue lives at least as long as the Owner. + #[allow(dead_code)] + queue: Pin>>, +} + +/// A Stealer interface to the BWoS queue +/// +/// There may be multiple stealers. Stealers share the stealer position which is used to quickly look up +/// the next block for attempted stealing. +#[repr(align(128))] +pub struct Stealer { + /// The actual stealer position is `self.spos % NUM_BLOCKS`. The position is incremented beyond + /// `NUM_BLOCKS` to detect ABA problems. + spos: CachePadded>, + queue: Pin>>, +} + +/// An iterator over elements of one Block. +/// +/// The iterator borrows all elements up to `committed` to allows batched +/// operations on the elements. When the iterator is dropped the entries +/// are marked as consumed in one atomic operation. +pub struct BlockIter<'a, E, const ENTRIES_PER_BLOCK: usize> { + buffer: &'a [UnsafeCell>; ENTRIES_PER_BLOCK], + /// Index if the next to be consumed entry in the buffer. + i: usize, + /// Number of committed entries in the buffer. + committed: usize, +} + +/// An iterator over elements of one Block of a stealer +/// +/// Marks the stolen entries as stolen once the iterator has been consumed. +pub struct StealerBlockIter<'a, E, const ENTRIES_PER_BLOCK: usize> { + /// Stealer Block + stealer_block: &'a Block, + /// Remember how many entries where reserved for the Drop implementation + num_reserved: usize, + /// reserved index of the block. We own the entries from `i..block_reserved` + block_reserved: usize, + /// curr index in the block + i: usize, +} + +unsafe impl Send + for Owner +{ +} + +// todo: is this really needed? +unsafe impl Sync + for Owner +{ +} + +unsafe impl Send + for Stealer +{ +} + +unsafe impl Sync + for Stealer +{ +} + +impl + Owner +{ + /// Try to enqueue `t` into the FIFO queue. + /// + /// If the queue is full, `Err(t)` is returned to the caller. + #[inline(always)] + pub fn enqueue(&mut self, t: E) -> Result<(), E> { + loop { + // SAFETY: `pcache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. + let blk = unsafe { &**self.pcache }; + + // Load the index of the next free queue entry for the producer. `committed` is only written to by the + // single producer, so `Relaxed` reading is fine. + let committed = blk.committed.load(Relaxed); + let committed_idx = committed.raw_index(); + + // Fastpath (the block is not full): Due to the slowpath checks we know that the entire remaining block + // is available to the producer and do not need to check the consumed index in the fastpath. + if let Some(entry_cell) = blk.entries.get(committed_idx) { + // SAFETY: We checked the entry is available for writing and the index can be + // post-incremented unconditionally since `index == NE` is valid and means the block + // is full. + let committed_new = unsafe { + entry_cell.with_mut(|uninit_entry| uninit_entry.write(MaybeUninit::new(t))); + committed.index_add_unchecked(1) + }; + // Synchronizes with `Acquire` ordering on the stealer side. 
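+                // (The matching `Acquire` loads are the stealers' reads of `committed` in
+                // `steal()` and `steal_block()`, which guarantees the entry written above is
+                // visible to any stealer that observes the new index.)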
+ blk.committed.store(committed_new, Release); + #[cfg(feature = "stats")] + self.queue.stats.increment_enqueued(1); + return Ok(()); + } + + /* slow path, move to the next block */ + let nblk = unsafe { &*blk.next() }; + let next = committed.next_version(nblk.is_head()); + + /* check if next block is ready */ + if !self.is_next_block_writable(nblk, next.version()) { + return Err(t); + }; + + /* reset cursor and advance block */ + nblk.committed.store(next, Relaxed); + nblk.stolen.store(next, Relaxed); + // Ensures the writes to `committed` and `stolen` are visible when `reserved` is loaded. + nblk.reserved.store(next, Release); + *self.pcache = nblk; + } + } + + pub unsafe fn enqueue_batch_unchecked( + &mut self, + mut iter: Box + '_>, + ) -> usize { + let mut count = 0; + loop { + // SAFETY: `pcache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. + let blk = unsafe { &**self.pcache }; + + // Load the index of the next free queue entry for the producer. `committed` is only written to by the + // single producer, so `Relaxed` reading is fine. + let committed = blk.committed.load(Relaxed); + let mut new_committed = committed; + + while new_committed.raw_index() < ENTRIES_PER_BLOCK { + // Fastpath (the block is not full): Due to the slowpath checks we know that the entire remaining block + // is available to the producer and do not need to check the consumed index in the + // fastpath. + let entry_opt = iter.next(); + if let Some(entry) = entry_opt { + blk.entries[new_committed.raw_index()].with_mut(|uninit_entry| unsafe { + uninit_entry.write(MaybeUninit::new(entry)) + }); + new_committed = unsafe { new_committed.index_add_unchecked(1) }; + count += 1; + } else { + blk.committed.store(new_committed, Release); + #[cfg(feature = "stats")] + self.queue.stats.increment_enqueued(count); + return count; + } + } + /* slow path, move to the next block */ + let nblk = unsafe { &*blk.next() }; + let next = new_committed.next_version(nblk.is_head()); + + // The caller promises they already confirmed the next block is ready, so we only + // debug assert. + debug_assert!( + self.is_next_block_writable(nblk, next.version()), + "Precondition of unchecked enqueue function violated." + ); + + /* reset cursor and advance block */ + nblk.committed.store(next, Relaxed); + nblk.stolen.store(next, Relaxed); + // The changes to `committed` and `stolen` must be visible when reserved is changed. + nblk.reserved.store(next, Release); + *self.pcache = nblk; + } + } + /// true if the next block is ready for the producer to start writing. + fn is_next_block_writable( + &self, + next_blk: &Block, + next_block_version: usize, + ) -> bool { + let expected_version = next_block_version.wrapping_sub(1); + let consumed = next_blk.consumed.load(Relaxed); + let is_consumed = consumed.index().is_full() && expected_version == consumed.version(); + + // The next block must be already _fully_ consumed, since we do not want to checked the `consumed` index + // in the enqueue fastpath! + if !is_consumed { + return false; + } + // The producer must wait until the next block has no active stealers. + let stolen = next_blk.stolen.load(Acquire); + if !stolen.index().is_full() || stolen.version() != expected_version { + return false; + } + true + } +} + +impl + Owner +{ + /// Try to dequeue the oldest element in the queue. 
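+    ///
+    /// Returns `None` if there is nothing to dequeue; this can also happen while the queue
+    /// still holds entries that are currently reserved by stealers (see
+    /// `tests/blocked_stealer.rs`).
+    ///
+    /// Illustrative sketch of the FIFO behaviour (block geometry is arbitrary):
+    ///
+    /// ```ignore
+    /// let (mut owner, _stealer) = bwosqueue::new::<u32, 4, 4>();
+    /// owner.enqueue(1).unwrap();
+    /// owner.enqueue(2).unwrap();
+    /// assert_eq!(owner.dequeue(), Some(1));
+    /// assert_eq!(owner.dequeue(), Some(2));
+    /// assert_eq!(owner.dequeue(), None);
+    /// ```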
+ #[inline(always)] + pub fn dequeue(&mut self) -> Option { + let (blk, consumed) = self.get_consumer_block()?; + + // We trust that the correct index is passed to us here. + let entry_cell = &blk.entries[consumed.raw_index()]; + // SAFETY: We know there is an entry to dequeue, so we know the entry is a valid initialized `E`. + let item = unsafe { entry_cell.with(|entry| entry.read().assume_init()) }; + // SAFETY: We already checked that `consumed_idx < ENTRIES_PER_BLOCK`. + let new_consumed = unsafe { consumed.index_add_unchecked(1) }; + blk.consumed.store(new_consumed, Relaxed); + #[cfg(feature = "stats")] + self.queue.stats.increment_dequeued(1); + return Some(item); + } + + /// Try to dequeue all remaining committed entries in the current block. + pub fn dequeue_block(&mut self) -> Option> { + let (blk, consumed) = self.get_consumer_block()?; + + let committed = blk.committed.load(Relaxed); + + // We are claiming the tasks **before** reading them out of the buffer. + // This is safe because only the **current** thread is able to push new + // tasks. + // + // There isn't really any need for memory ordering... Relaxed would + // work. This is because all tasks are pushed into the queue from the + // current thread (or memory has been acquired if the local queue handle + // moved). + blk.consumed.store(committed, Relaxed); + + return Some(BlockIter { + buffer: &blk.entries, + i: consumed.raw_index(), + committed: committed.raw_index(), + }); + } + + // returns true on success, false when advancing not possible. + fn try_advance_consumer_block( + &mut self, + next_block: &Block, + curr_consumed: IndexAndVersion, + ) -> bool { + let next_cons_vsn = curr_consumed + .version() + .wrapping_add(next_block.is_head() as usize); + + // The reserved field is updated last in `enqueue()`. It is only updated by the producer + // (`Owner`), so `Relaxed` is sufficient. If the actual reserved version is not equal to the + // expected next consumer version, then the producer has not advanced to the next block yet + // and we must wait. + let next_reserved_vsn = next_block.reserved.load(Relaxed).version(); + if next_reserved_vsn != next_cons_vsn { + debug_assert!(next_reserved_vsn == next_cons_vsn.wrapping_sub(1)); + return false; + } + + /* stop stealers */ + let reserved_new = IndexAndVersion::new(next_cons_vsn, Index::full()); + // todo: Why can this be Relaxed? + let reserved_old = next_block.reserved.swap(reserved_new, Relaxed); + debug_assert_eq!(reserved_old.version(), next_cons_vsn); + let reserved_old_idx = reserved_old.raw_index(); + + // Number of entries that can't be stolen anymore because we stopped stealing. + let num_consumer_owned = ENTRIES_PER_BLOCK.saturating_sub(reserved_old_idx); + // Increase `stolen`, by the number of entries that can't be stolen anymore and are now up to the + // consumer to deqeuue. This ensures that, once the stealers have finished stealing the already reserved + // entries, `nblk.stolen == ENTRIES_PER_BLOCK` holds, i.e. this block is marked as having no active + // stealers, which will allow the producer to the enter this block again (in the next round). + next_block.stolen.fetch_add(num_consumer_owned, Relaxed); + + /* advance the block and try again */ + // The consumer must skip already reserved entries. + next_block.consumed.store(reserved_old, Relaxed); + *self.ccache = next_block; + true + } + + /// Advance consumer to the next block, unless the producer has not reached the block yet. 
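+    ///
+    /// Unlike `try_advance_consumer_block`, this check has no side effects: it only inspects
+    /// the next block's `reserved` version and neither stops stealers nor moves `ccache`.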
+ fn can_advance_consumer_block( + &self, + next_block: &Block, + curr_consumed: IndexAndVersion, + ) -> bool { + let next_cons_vsn = curr_consumed + .version() + .wrapping_add(next_block.is_head() as usize); + // The reserved field is updated last in `enqueue()`. It is only updated by the producer + // (`Owner`), so `Relaxed` is sufficient. If the actual reserved version is not equal to the + // expected next consumer version, then the producer has not advanced to the next block yet + // and we must wait. + let next_reserved_vsn = next_block.reserved.load(Relaxed).version(); + if next_reserved_vsn != next_cons_vsn { + debug_assert!(next_reserved_vsn == next_cons_vsn.wrapping_sub(1)); + return false; + } + true + } + + // /// Advance consumer to the next block, unless the producer has not reached the block yet. + // fn try_advance_consumer_block( + // &mut self, + // next_block: &Block, + // curr_consumed: IndexAndVersion, + // ) -> Result<(), ()> { + // if self.can_advance_consumer_block(next_block, curr_consumed) { + // *self.ccache = next_block; + // Ok(()) + // } else { + // Err(()) + // } + // } + + /// Todo: Ideally we would not have this function. + pub fn has_stealers(&self) -> bool { + let curr_spos = self.spos.load(Relaxed); + // spos increments beyond NUM_BLOCKS to prevent ABA problems. + let start_block_idx = curr_spos % NUM_BLOCKS; + for i in 0..NUM_BLOCKS { + let block_idx = (start_block_idx + i) % NUM_BLOCKS; + let blk: &Block = &self.queue.blocks[block_idx]; + let stolen = blk.stolen.load(Relaxed); + let reserved = blk.reserved.load(Relaxed); + if reserved != stolen { + return true; + } else if !reserved.index().is_full() { + return false; + } + } + false + } + + /// Check if there is a block available for stealing in the queue. + /// + /// Note that stealing may still fail for a number of reasons even if this function returned true + /// Todo: the overhead could be reduced, if we allow this function to return false in some + /// cases when the queue size is low. + #[cfg(feature = "stats")] + pub fn has_stealable_block(&self) -> bool { + let n = self.queue.stats.curr_enqueued(); + // SAFETY: self.ccache always points to a valid Block. + let committed_idx = unsafe { (**self.ccache).committed.load(Relaxed).raw_index() }; + // SAFETY: self.ccache always points to a valid Block. + let consumed_idx = unsafe { (**self.ccache).consumed.load(Relaxed).raw_index() }; + // true if there are more items enqueued in total than enqueued in the current block. + n > (committed_idx - consumed_idx) + } + + /// Returns `true` if enqueuing one block of entries would succeed. + pub fn can_enqueue_block(&self) -> bool { + // Note: the current implementation of this function is overly conservative but fast. + let current_block = unsafe { &*(**self.pcache).next() }; + let committed = current_block.committed.load(Relaxed); + if committed.index().is_empty() { + true + } else { + self.is_next_block_writable(current_block, committed.version()) + } + } + + /// `true` if there is at least one entry that can be dequeued. + /// + /// It is possible that a dequeue can still fail, since the item was stolen after we checked + /// and before the consumer advanced to the block in question. + pub fn can_consume(&self) -> bool { + // SAFETY: `ccache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. 
+ let current_blk_cache = unsafe { &**self.ccache }; + let mut blk = current_blk_cache; + for _ in 0..NUM_BLOCKS + 1 { + // check if the block is fully consumed already + let consumed = blk.consumed.load(Relaxed); + let consumed_idx = consumed.raw_index(); + + // Fastpath (Block is not fully consumed yet) + if consumed_idx < ENTRIES_PER_BLOCK { + // we know the block is not full, but we must first check if there is an entry to + // dequeue. + let committed_idx = blk.committed.load(Relaxed).raw_index(); + if consumed_idx == committed_idx { + return false; + } + + /* There is an entry to dequeue */ + return true; + } + + /* Slow-path */ + + /* Consumer head may never pass the Producer head and Consumer/Stealer tail */ + let nblk = unsafe { &*blk.next() }; + if self.can_advance_consumer_block(nblk, consumed) { + blk = nblk; + } else { + return false; + } + /* We advanced to the next block - loop around and try again */ + } + // Since there is no concurrent enqueuing and the buffer is bounded, we should reach + // one of the exit conditions in at most NUM_BLOCKS iterations. + unreachable!() + } + + fn get_consumer_block( + &mut self, + ) -> Option<( + &Block, + IndexAndVersion, + )> { + // SAFETY: `ccache` always points to a valid `Block` in the queue. We never create a mutable reference + // to a Block, so it is safe to construct a shared reference here. + let current_blk_cache = unsafe { &**self.ccache }; + let mut blk = current_blk_cache; + // The +1 is necessary to advance again to our original starting block, this time with a + // new version. This can happen in the edge-case that all items in the queue where stolen. + for _ in 0..NUM_BLOCKS + 1 { + // check if the block is fully consumed already + let consumed = blk.consumed.load(Relaxed); + let consumed_idx = consumed.raw_index(); + + // Fastpath (Block is not fully consumed yet) + if consumed_idx < ENTRIES_PER_BLOCK { + // we know the block is not full, but we must first check if there is an entry to + // dequeue. + let committed_idx = blk.committed.load(Relaxed).raw_index(); + if consumed_idx == committed_idx { + return None; + } + + /* There is an entry to dequeue */ + return Some((blk, consumed)); + } + + /* Slow-path */ + + /* Consumer head may never pass the Producer head and Consumer/Stealer tail */ + let nblk = unsafe { &*blk.next() }; + if self.try_advance_consumer_block(nblk, consumed) { + blk = nblk; + } else { + return None; + } + /* We advanced to the next block - loop around and try again */ + } + // Since there is no concurrent enqueuing and the buffer is bounded, we should reach + // one of the exit conditions in at most NUM_BLOCKS+1 iterations. + unreachable!() + } +} + +impl Clone + for Stealer +{ + fn clone(&self) -> Self { + Self { + spos: self.spos.clone(), + queue: self.queue.clone(), + } + } +} + +impl + Stealer +{ + /// Try to steal a single item from the queue + #[inline] + pub fn steal(&self) -> Option { + loop { + let (blk, curr_spos) = self.curr_block(); + + /* check if the block is fully reserved */ + let reserved = blk.reserved.load(Acquire); + let reserved_idx = reserved.raw_index(); + + if reserved_idx < ENTRIES_PER_BLOCK { + /* check if we have an entry to occupy */ + let committed = blk.committed.load(Acquire); + let committed_idx = committed.raw_index(); + if reserved_idx == committed_idx { + return None; + } + // SAFETY: We checked before that `reserved_idx` < ENTRIES_PER_BLOCK, so the index + // can't overflow. 
+ let new_reserved = unsafe { reserved.index_add_unchecked(1) }; + let reserve_res = + blk.reserved + .compare_exchange_weak(reserved, new_reserved, Release, Relaxed); + if reserve_res.is_err() { + return None; + } + + /* we got the entry */ + + #[cfg(feature = "stats")] + self.queue.stats.increment_stolen(1); + + // SAFETY: We know the entry is a valid and initialized `E` and is now exclusively owned by us. + let t = + unsafe { blk.entries[reserved_idx].with(|entry| entry.read().assume_init()) }; + // `t` is now owned by us so we mark the stealing as finished. Synchronizes with the Owner Acquire. + let old_stolen = blk.stolen.fetch_add(1, Release); + debug_assert!(old_stolen.raw_index() < ENTRIES_PER_BLOCK); + return Some(t); + } + + // Slow-path: The current block is already fully reserved. Try to advance to the next block + if !self.can_advance(blk, reserved) { + return None; + } + self.try_advance_spos(curr_spos); + } + } + + /// Get the current stealer `Block` and the corresponding stealer position (`spos`) + /// + /// The returned `spos` can be larger than `NUM_BLOCKS` to detect [ABA](https://en.wikipedia.org/wiki/ABA_problem) + /// situations. + fn curr_block(&self) -> (&Block, usize) { + let curr_spos = self.spos.load(Relaxed); + // spos increments beyond NUM_BLOCKS to prevent ABA problems. + let block_idx = curr_spos % NUM_BLOCKS; + let blk: &Block = &self.queue.blocks[block_idx]; + (blk, curr_spos) + } + + /// Try to steal a block from `self`. + /// + /// Tries to steal a full block from `self`. If the block is not fully + /// committed yet it will steal up to and including the last committed entry + /// of that block. + #[inline] + pub fn steal_block(&self) -> Option> { + loop { + let (blk, curr_spos) = self.curr_block(); + + /* check if the block is fully reserved */ + let reserved = blk.reserved.load(Acquire); + let reserved_idx = reserved.raw_index(); + + if reserved_idx < ENTRIES_PER_BLOCK { + /* check if we have an entry to occupy */ + let committed = blk.committed.load(Acquire); + let committed_idx = committed.raw_index(); + if reserved_idx == committed_idx { + return None; + } + + // Try to steal the block up to the latest committed entry + let reserve_res = blk + .reserved + .compare_exchange_weak(reserved, committed, Release, Relaxed); + + if reserve_res.is_err() { + return None; + } + + let num_reserved = committed_idx - reserved_idx; + // From the statistics perspective we consider the reserved range to already be + // stolen, since it is not available for the consumer or other stealers anymore. + #[cfg(feature = "stats")] + self.queue.stats.increment_stolen(num_reserved); + return Some(StealerBlockIter { + stealer_block: blk, + block_reserved: committed_idx, + i: reserved_idx, + num_reserved, + }); + } + + // Slow-path: The current block is already fully reserved. Try to advance to next block + if !self.can_advance(blk, reserved) { + return None; + } + self.try_advance_spos(curr_spos); + } + } + + /// True if the stealer can advance to the next block + fn can_advance( + &self, + curr_block: &Block, + curr_reserved: IndexAndVersion, + ) -> bool { + /* r_head never pass the w_head and r_tail */ + let nblk = unsafe { &*curr_block.next() }; + let next_expect_vsn = curr_reserved.version() + nblk.is_head() as usize; + let next_actual_vsn = nblk.reserved.load(Relaxed).version(); + next_expect_vsn == next_actual_vsn + } + + /// Try and advance `spos` to the next block. + /// + /// We are not interested in the failure case, since the next stealer can just try again. 
+ fn try_advance_spos(&self, curr_spos: usize) { + // Ignore result. Failure means a different stealer succeeded in updating + // the stealer block index. In case of a sporadic failure the next stealer will try again. + let _ = + self.spos + .compare_exchange_weak(curr_spos, curr_spos.wrapping_add(1), Relaxed, Relaxed); + } + + /// The estimated number of entries currently enqueued. + #[cfg(feature = "stats")] + pub fn estimated_queue_entries(&self) -> usize { + self.queue.estimated_len() + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> Iterator for BlockIter<'a, E, ENTRIES_PER_BLOCK> { + type Item = E; + + #[inline] + fn next(&mut self) -> Option { + let i = self.i; + self.i += 1; + if i < self.committed { + self.buffer.get(i).map(|entry_cell| { + entry_cell.with(|entry| { + // SAFETY: we claimed the entries + unsafe { entry.read().assume_init() } + }) + }) + } else { + None + } + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> Iterator + for StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> +{ + type Item = E; + + #[inline] + fn next(&mut self) -> Option { + if self.i < self.block_reserved { + let entry = self.stealer_block.entries[self.i].with(|entry| { + // SAFETY: we claimed the entries + unsafe { entry.read().assume_init() } + }); + self.i += 1; + Some(entry) + } else { + None + } + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> Drop for StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> { + fn drop(&mut self) { + // Ensure `Drop` is called on any items that where not consumed, by consuming the iterator, + // which implicitly dequeues all items + while self.next().is_some() {} + self.stealer_block + .stolen + .fetch_add(self.num_reserved, Release); + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> { + pub fn len(&self) -> usize { + self.block_reserved - self.i + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl<'a, E, const ENTRIES_PER_BLOCK: usize> core::fmt::Debug + for StealerBlockIter<'a, E, ENTRIES_PER_BLOCK> +{ + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!( + "StealerBlockIter over {} entries", + self.block_reserved - self.i + )) + } +} + +/// Create a new BWoS queue and return the [Owner] and a [Stealer] instance +/// +/// `NUM_BLOCKS` must be a power two and at least 2. `ENTRIES_PER_BLOCK` can be freely chosen (non-zero). +/// The total length of the queue is `NUM_BLOCKS * ENTRIES_PER_BLOCK` and must not be more than `usize::MAX`. +/// +/// ## Performance considerations +/// +/// The Owner throughput will improve with a larger `ENTRIES_PER_BLOCK` value. +/// Thieves however will prefer a higher `NUM_BLOCKS` count since it makes it easier to +/// steal a whole block. 
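+///
+/// For example (a sketch; assumes the generic order `new::<E, NUM_BLOCKS, ENTRIES_PER_BLOCK>`,
+/// with concrete sizes chosen arbitrarily):
+///
+/// ```ignore
+/// // Owner-friendly geometry: few, large blocks.
+/// let (owner_a, stealer_a) = bwosqueue::new::<u64, 8, 1024>();
+/// // Stealer-friendly geometry: many, small blocks.
+/// let (owner_b, stealer_b) = bwosqueue::new::<u64, 64, 16>();
+/// ```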
+pub fn new() -> ( + Owner, + Stealer, +) { + assert!(NUM_BLOCKS.checked_mul(ENTRIES_PER_BLOCK).is_some()); + assert!(NUM_BLOCKS.is_power_of_two()); + assert!(NUM_BLOCKS >= 1); + assert!(ENTRIES_PER_BLOCK >= 1); + + let q: Pin>> = BwsQueue::new(); + let first_block = &q.blocks[0]; + + let stealer_position = Arc::new(AtomicUsize::new(0)); + + ( + Owner { + pcache: CachePadded::new(first_block), + ccache: CachePadded::new(first_block), + spos: CachePadded::new(stealer_position.clone()), + queue: q.clone(), + }, + Stealer { + spos: CachePadded::new(stealer_position), + queue: q, + }, + ) +} diff --git a/bwosqueue/src/loom/mocked.rs b/bwosqueue/src/loom/mocked.rs new file mode 100644 index 00000000000..367d59b43a4 --- /dev/null +++ b/bwosqueue/src/loom/mocked.rs @@ -0,0 +1,40 @@ +pub(crate) use loom::*; + +pub(crate) mod sync { + + pub(crate) use loom::sync::MutexGuard; + + #[derive(Debug)] + pub(crate) struct Mutex(loom::sync::Mutex); + + #[allow(dead_code)] + impl Mutex { + #[inline] + pub(crate) fn new(t: T) -> Mutex { + Mutex(loom::sync::Mutex::new(t)) + } + + #[inline] + pub(crate) fn lock(&self) -> MutexGuard<'_, T> { + self.0.lock().unwrap() + } + + #[inline] + pub(crate) fn try_lock(&self) -> Option> { + self.0.try_lock().ok() + } + } + pub(crate) use loom::sync::*; +} + +pub(crate) mod rand { + pub(crate) fn seed() -> u64 { + 1 + } +} + +pub(crate) mod sys { + pub(crate) fn num_cpus() -> usize { + 2 + } +} diff --git a/bwosqueue/src/loom/mod.rs b/bwosqueue/src/loom/mod.rs new file mode 100644 index 00000000000..7925aa630b5 --- /dev/null +++ b/bwosqueue/src/loom/mod.rs @@ -0,0 +1,16 @@ +//! This module abstracts over `loom` and `std::sync` depending on whether we +//! are running tests or not. +//! This module is directly copied from tokio. Everything in this module is subject to the same license as tokio. + +#![allow(unused)] +#![allow(unsafe_op_in_unsafe_fn)] + +#[cfg(not(loom))] +mod std; +#[cfg(not(loom))] +pub(crate) use self::std::*; + +#[cfg(loom)] +mod mocked; +#[cfg(loom)] +pub(crate) use self::mocked::*; diff --git a/bwosqueue/src/loom/std/atomic_ptr.rs b/bwosqueue/src/loom/std/atomic_ptr.rs new file mode 100644 index 00000000000..236645f037b --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_ptr.rs @@ -0,0 +1,34 @@ +use std::fmt; +use std::ops::{Deref, DerefMut}; + +/// `AtomicPtr` providing an additional `load_unsync` function. +pub(crate) struct AtomicPtr { + inner: std::sync::atomic::AtomicPtr, +} + +impl AtomicPtr { + pub(crate) fn new(ptr: *mut T) -> AtomicPtr { + let inner = std::sync::atomic::AtomicPtr::new(ptr); + AtomicPtr { inner } + } +} + +impl Deref for AtomicPtr { + type Target = std::sync::atomic::AtomicPtr; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for AtomicPtr { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +impl fmt::Debug for AtomicPtr { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_u16.rs b/bwosqueue/src/loom/std/atomic_u16.rs new file mode 100644 index 00000000000..c1c531208c2 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u16.rs @@ -0,0 +1,44 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU16` providing an additional `load_unsync` function. 
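+///
+/// Illustrative sketch (the `Deref` impl below forwards to the standard atomic API):
+///
+/// ```ignore
+/// let a = AtomicU16::new(3);
+/// a.store(4, std::sync::atomic::Ordering::Relaxed);
+/// assert_eq!(a.load(std::sync::atomic::Ordering::Relaxed), 4);
+/// ```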
+pub(crate) struct AtomicU16 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU16 {} +unsafe impl Sync for AtomicU16 {} + +impl AtomicU16 { + pub(crate) const fn new(val: u16) -> AtomicU16 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU16::new(val)); + AtomicU16 { inner } + } + + /// Performs an unsynchronized load. + /// + /// # Safety + /// + /// All mutations must have happened before the unsynchronized load. + /// Additionally, there must be no concurrent mutations. + pub(crate) unsafe fn unsync_load(&self) -> u16 { + *(*self.inner.get()).get_mut() + } +} + +impl Deref for AtomicU16 { + type Target = std::sync::atomic::AtomicU16; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU16 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_u32.rs b/bwosqueue/src/loom/std/atomic_u32.rs new file mode 100644 index 00000000000..61f95fb30ce --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u32.rs @@ -0,0 +1,34 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU32` providing an additional `load_unsync` function. +pub(crate) struct AtomicU32 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU32 {} +unsafe impl Sync for AtomicU32 {} + +impl AtomicU32 { + pub(crate) const fn new(val: u32) -> AtomicU32 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU32::new(val)); + AtomicU32 { inner } + } +} + +impl Deref for AtomicU32 { + type Target = std::sync::atomic::AtomicU32; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU32 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_u64.rs b/bwosqueue/src/loom/std/atomic_u64.rs new file mode 100644 index 00000000000..d7a656f2e24 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u64.rs @@ -0,0 +1,86 @@ +//! Implementation of an atomic u64 cell. On 64 bit platforms, this is a +//! re-export of `AtomicU64`. On 32 bit platforms, this is implemented using a +//! `Mutex`. + +// `AtomicU64` can only be used on targets with `target_has_atomic` is 64 or greater. +// Once `cfg_target_has_atomic` feature is stable, we can replace it with +// `#[cfg(target_has_atomic = "64")]`. +// Refs: https://github.com/rust-lang/rust/tree/master/src/librustc_target +//cfg_has_atomic_u64! { +pub(crate) use std::sync::atomic::AtomicU64; +//} +// +// cfg_not_has_atomic_u64! 
{ +// use crate::loom::sync::Mutex; +// use std::sync::atomic::Ordering; +// +// #[derive(Debug)] +// pub(crate) struct AtomicU64 { +// inner: Mutex, +// } +// +// impl AtomicU64 { +// pub(crate) fn new(val: u64) -> Self { +// Self { +// inner: Mutex::new(val), +// } +// } +// +// pub(crate) fn load(&self, _: Ordering) -> u64 { +// *self.inner.lock() +// } +// +// pub(crate) fn store(&self, val: u64, _: Ordering) { +// *self.inner.lock() = val; +// } +// +// pub(crate) fn fetch_add(&self, val: u64, _: Ordering) -> u64 { +// let mut lock = self.inner.lock(); +// let prev = *lock; +// *lock = prev + val; +// prev +// } +// +// pub(crate) fn fetch_or(&self, val: u64, _: Ordering) -> u64 { +// let mut lock = self.inner.lock(); +// let prev = *lock; +// *lock = prev | val; +// prev +// } +// +// pub(crate) fn compare_exchange( +// &self, +// current: u64, +// new: u64, +// _success: Ordering, +// _failure: Ordering, +// ) -> Result { +// let mut lock = self.inner.lock(); +// +// if *lock == current { +// *lock = new; +// Ok(current) +// } else { +// Err(*lock) +// } +// } +// +// pub(crate) fn compare_exchange_weak( +// &self, +// current: u64, +// new: u64, +// success: Ordering, +// failure: Ordering, +// ) -> Result { +// self.compare_exchange(current, new, success, failure) +// } +// } +// +// impl Default for AtomicU64 { +// fn default() -> AtomicU64 { +// Self { +// inner: Mutex::new(0), +// } +// } +// } +// } diff --git a/bwosqueue/src/loom/std/atomic_u8.rs b/bwosqueue/src/loom/std/atomic_u8.rs new file mode 100644 index 00000000000..408aea338c6 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_u8.rs @@ -0,0 +1,34 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops::Deref; + +/// `AtomicU8` providing an additional `load_unsync` function. +pub(crate) struct AtomicU8 { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicU8 {} +unsafe impl Sync for AtomicU8 {} + +impl AtomicU8 { + pub(crate) const fn new(val: u8) -> AtomicU8 { + let inner = UnsafeCell::new(std::sync::atomic::AtomicU8::new(val)); + AtomicU8 { inner } + } +} + +impl Deref for AtomicU8 { + type Target = std::sync::atomic::AtomicU8; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl fmt::Debug for AtomicU8 { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + self.deref().fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/atomic_usize.rs b/bwosqueue/src/loom/std/atomic_usize.rs new file mode 100644 index 00000000000..0d5f36e4310 --- /dev/null +++ b/bwosqueue/src/loom/std/atomic_usize.rs @@ -0,0 +1,56 @@ +use std::cell::UnsafeCell; +use std::fmt; +use std::ops; + +/// `AtomicUsize` providing an additional `load_unsync` function. +pub(crate) struct AtomicUsize { + inner: UnsafeCell, +} + +unsafe impl Send for AtomicUsize {} +unsafe impl Sync for AtomicUsize {} + +impl AtomicUsize { + pub(crate) const fn new(val: usize) -> AtomicUsize { + let inner = UnsafeCell::new(std::sync::atomic::AtomicUsize::new(val)); + AtomicUsize { inner } + } + + /// Performs an unsynchronized load. + /// + /// # Safety + /// + /// All mutations must have happened before the unsynchronized load. + /// Additionally, there must be no concurrent mutations. 
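+    ///
+    /// Illustrative sketch (hypothetical caller holding exclusive `&mut` access):
+    ///
+    /// ```ignore
+    /// let mut counter = AtomicUsize::new(0);
+    /// counter.with_mut(|v| *v = 7);
+    /// // Sound: `&mut counter` rules out concurrent mutation.
+    /// assert_eq!(unsafe { counter.unsync_load() }, 7);
+    /// ```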
+ pub(crate) unsafe fn unsync_load(&self) -> usize { + *(*self.inner.get()).get_mut() + } + + pub(crate) fn with_mut(&mut self, f: impl FnOnce(&mut usize) -> R) -> R { + // safety: we have mutable access + f(unsafe { (*self.inner.get()).get_mut() }) + } +} + +impl ops::Deref for AtomicUsize { + type Target = std::sync::atomic::AtomicUsize; + + fn deref(&self) -> &Self::Target { + // safety: it is always safe to access `&self` fns on the inner value as + // we never perform unsafe mutations. + unsafe { &*self.inner.get() } + } +} + +impl ops::DerefMut for AtomicUsize { + fn deref_mut(&mut self) -> &mut Self::Target { + // safety: we hold `&mut self` + unsafe { &mut *self.inner.get() } + } +} + +impl fmt::Debug for AtomicUsize { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + (**self).fmt(fmt) + } +} diff --git a/bwosqueue/src/loom/std/mod.rs b/bwosqueue/src/loom/std/mod.rs new file mode 100644 index 00000000000..0c70bee74eb --- /dev/null +++ b/bwosqueue/src/loom/std/mod.rs @@ -0,0 +1,108 @@ +#![cfg_attr(any(not(feature = "full"), loom), allow(unused_imports, dead_code))] + +mod atomic_ptr; +mod atomic_u16; +mod atomic_u32; +mod atomic_u64; +mod atomic_u8; +mod atomic_usize; +mod mutex; +#[cfg(feature = "parking_lot")] +mod parking_lot; +mod unsafe_cell; + +pub(crate) mod cell { + pub(crate) use super::unsafe_cell::UnsafeCell; +} + +#[cfg(any( + feature = "net", + feature = "process", + feature = "signal", + feature = "sync", +))] +pub(crate) mod future { + pub(crate) use crate::sync::AtomicWaker; +} + +pub(crate) mod hint { + pub(crate) use std::hint::spin_loop; +} + +pub(crate) mod rand { + use std::collections::hash_map::RandomState; + use std::hash::{BuildHasher, Hash, Hasher}; + use std::sync::atomic::AtomicU32; + use std::sync::atomic::Ordering::Relaxed; + + static COUNTER: AtomicU32 = AtomicU32::new(1); + + pub(crate) fn seed() -> u64 { + let rand_state = RandomState::new(); + + let mut hasher = rand_state.build_hasher(); + + // Hash some unique-ish data to generate some new state + COUNTER.fetch_add(1, Relaxed).hash(&mut hasher); + + // Get the seed + hasher.finish() + } +} + +pub(crate) mod sync { + pub(crate) use std::sync::{Arc, Weak}; + + // Below, make sure all the feature-influenced types are exported for + // internal use. Note however that some are not _currently_ named by + // consuming code. 
+ + #[cfg(feature = "parking_lot")] + #[allow(unused_imports)] + pub(crate) use crate::loom::std::parking_lot::{ + Condvar, Mutex, MutexGuard, RwLock, RwLockReadGuard, WaitTimeoutResult, + }; + + #[cfg(not(feature = "parking_lot"))] + #[allow(unused_imports)] + pub(crate) use std::sync::{Condvar, MutexGuard, RwLock, RwLockReadGuard, WaitTimeoutResult}; + + #[cfg(not(feature = "parking_lot"))] + pub(crate) use crate::loom::std::mutex::Mutex; + + pub(crate) mod atomic { + pub(crate) use crate::loom::std::atomic_ptr::AtomicPtr; + pub(crate) use crate::loom::std::atomic_u16::AtomicU16; + pub(crate) use crate::loom::std::atomic_u32::AtomicU32; + pub(crate) use crate::loom::std::atomic_u64::AtomicU64; + pub(crate) use crate::loom::std::atomic_u8::AtomicU8; + pub(crate) use crate::loom::std::atomic_usize::AtomicUsize; + + pub(crate) use std::sync::atomic::{fence, AtomicBool, Ordering}; + } +} + +pub(crate) mod sys { + #[cfg(feature = "rt-multi-thread")] + pub(crate) fn num_cpus() -> usize { + usize::max(1, num_cpus::get()) + } + + #[cfg(not(feature = "rt-multi-thread"))] + pub(crate) fn num_cpus() -> usize { + 1 + } +} + +pub(crate) mod thread { + #[inline] + pub(crate) fn yield_now() { + std::hint::spin_loop(); + } + + #[allow(unused_imports)] + pub(crate) use std::thread::{ + current, panicking, park, park_timeout, sleep, spawn, Builder, JoinHandle, LocalKey, + Result, Thread, ThreadId, + }; +} diff --git a/bwosqueue/src/loom/std/mutex.rs b/bwosqueue/src/loom/std/mutex.rs new file mode 100644 index 00000000000..3f686e0a78e --- /dev/null +++ b/bwosqueue/src/loom/std/mutex.rs @@ -0,0 +1,31 @@ +use std::sync::{self, MutexGuard, TryLockError}; + +/// Adapter for `std::Mutex` that removes the poisoning aspects +/// from its api. +#[derive(Debug)] +pub(crate) struct Mutex(sync::Mutex); + +#[allow(dead_code)] +impl Mutex { + #[inline] + pub(crate) fn new(t: T) -> Mutex { + Mutex(sync::Mutex::new(t)) + } + + #[inline] + pub(crate) fn lock(&self) -> MutexGuard<'_, T> { + match self.0.lock() { + Ok(guard) => guard, + Err(p_err) => p_err.into_inner(), + } + } + + #[inline] + pub(crate) fn try_lock(&self) -> Option> { + match self.0.try_lock() { + Ok(guard) => Some(guard), + Err(TryLockError::Poisoned(p_err)) => Some(p_err.into_inner()), + Err(TryLockError::WouldBlock) => None, + } + } +} diff --git a/bwosqueue/src/loom/std/parking_lot.rs b/bwosqueue/src/loom/std/parking_lot.rs new file mode 100644 index 00000000000..e3af258d116 --- /dev/null +++ b/bwosqueue/src/loom/std/parking_lot.rs @@ -0,0 +1,184 @@ +//! A minimal adaption of the `parking_lot` synchronization primitives to the +//! equivalent `std::sync` types. +//! +//! This can be extended to additional types/methods as required. + +use std::fmt; +use std::marker::PhantomData; +use std::ops::{Deref, DerefMut}; +use std::sync::LockResult; +use std::time::Duration; + +// All types in this file are marked with PhantomData to ensure that +// parking_lot's send_guard feature does not leak through and affect when Tokio +// types are Send. +// +// See for more info. 
+ +// Types that do not need wrapping +pub(crate) use parking_lot::WaitTimeoutResult; + +#[derive(Debug)] +pub(crate) struct Mutex(PhantomData>, parking_lot::Mutex); + +#[derive(Debug)] +pub(crate) struct RwLock(PhantomData>, parking_lot::RwLock); + +#[derive(Debug)] +pub(crate) struct Condvar(PhantomData, parking_lot::Condvar); + +#[derive(Debug)] +pub(crate) struct MutexGuard<'a, T: ?Sized>( + PhantomData>, + parking_lot::MutexGuard<'a, T>, +); + +#[derive(Debug)] +pub(crate) struct RwLockReadGuard<'a, T: ?Sized>( + PhantomData>, + parking_lot::RwLockReadGuard<'a, T>, +); + +#[derive(Debug)] +pub(crate) struct RwLockWriteGuard<'a, T: ?Sized>( + PhantomData>, + parking_lot::RwLockWriteGuard<'a, T>, +); + +impl Mutex { + #[inline] + pub(crate) fn new(t: T) -> Mutex { + Mutex(PhantomData, parking_lot::Mutex::new(t)) + } + + #[inline] + #[cfg(all(feature = "parking_lot", not(all(loom, test))))] + #[cfg_attr(docsrs, doc(cfg(all(feature = "parking_lot",))))] + pub(crate) const fn const_new(t: T) -> Mutex { + Mutex(PhantomData, parking_lot::const_mutex(t)) + } + + #[inline] + pub(crate) fn lock(&self) -> MutexGuard<'_, T> { + MutexGuard(PhantomData, self.1.lock()) + } + + #[inline] + pub(crate) fn try_lock(&self) -> Option> { + self.1 + .try_lock() + .map(|guard| MutexGuard(PhantomData, guard)) + } + + #[inline] + pub(crate) fn get_mut(&mut self) -> &mut T { + self.1.get_mut() + } + + // Note: Additional methods `is_poisoned` and `into_inner`, can be + // provided here as needed. +} + +impl<'a, T: ?Sized> Deref for MutexGuard<'a, T> { + type Target = T; + fn deref(&self) -> &T { + self.1.deref() + } +} + +impl<'a, T: ?Sized> DerefMut for MutexGuard<'a, T> { + fn deref_mut(&mut self) -> &mut T { + self.1.deref_mut() + } +} + +impl RwLock { + pub(crate) fn new(t: T) -> RwLock { + RwLock(PhantomData, parking_lot::RwLock::new(t)) + } + + pub(crate) fn read(&self) -> LockResult> { + Ok(RwLockReadGuard(PhantomData, self.1.read())) + } + + pub(crate) fn write(&self) -> LockResult> { + Ok(RwLockWriteGuard(PhantomData, self.1.write())) + } +} + +impl<'a, T: ?Sized> Deref for RwLockReadGuard<'a, T> { + type Target = T; + fn deref(&self) -> &T { + self.1.deref() + } +} + +impl<'a, T: ?Sized> Deref for RwLockWriteGuard<'a, T> { + type Target = T; + fn deref(&self) -> &T { + self.1.deref() + } +} + +impl<'a, T: ?Sized> DerefMut for RwLockWriteGuard<'a, T> { + fn deref_mut(&mut self) -> &mut T { + self.1.deref_mut() + } +} + +impl Condvar { + #[inline] + pub(crate) fn new() -> Condvar { + Condvar(PhantomData, parking_lot::Condvar::new()) + } + + #[inline] + pub(crate) fn notify_one(&self) { + self.1.notify_one(); + } + + #[inline] + pub(crate) fn notify_all(&self) { + self.1.notify_all(); + } + + #[inline] + pub(crate) fn wait<'a, T>( + &self, + mut guard: MutexGuard<'a, T>, + ) -> LockResult> { + self.1.wait(&mut guard.1); + Ok(guard) + } + + #[inline] + pub(crate) fn wait_timeout<'a, T>( + &self, + mut guard: MutexGuard<'a, T>, + timeout: Duration, + ) -> LockResult<(MutexGuard<'a, T>, WaitTimeoutResult)> { + let wtr = self.1.wait_for(&mut guard.1, timeout); + Ok((guard, wtr)) + } + + // Note: Additional methods `wait_timeout_ms`, `wait_timeout_until`, + // `wait_until` can be provided here as needed. 
+} + +impl<'a, T: ?Sized + fmt::Display> fmt::Display for MutexGuard<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.1, f) + } +} + +impl<'a, T: ?Sized + fmt::Display> fmt::Display for RwLockReadGuard<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.1, f) + } +} + +impl<'a, T: ?Sized + fmt::Display> fmt::Display for RwLockWriteGuard<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.1, f) + } +} diff --git a/bwosqueue/src/loom/std/unsafe_cell.rs b/bwosqueue/src/loom/std/unsafe_cell.rs new file mode 100644 index 00000000000..66c1d7943e0 --- /dev/null +++ b/bwosqueue/src/loom/std/unsafe_cell.rs @@ -0,0 +1,16 @@ +#[derive(Debug)] +pub(crate) struct UnsafeCell(std::cell::UnsafeCell); + +impl UnsafeCell { + pub(crate) const fn new(data: T) -> UnsafeCell { + UnsafeCell(std::cell::UnsafeCell::new(data)) + } + + pub(crate) fn with(&self, f: impl FnOnce(*const T) -> R) -> R { + f(self.0.get()) + } + + pub(crate) fn with_mut(&self, f: impl FnOnce(*mut T) -> R) -> R { + f(self.0.get()) + } +} diff --git a/bwosqueue/src/metadata.rs b/bwosqueue/src/metadata.rs new file mode 100644 index 00000000000..59315bc819c --- /dev/null +++ b/bwosqueue/src/metadata.rs @@ -0,0 +1,233 @@ +//! Contains metadata for the block configuration + +use crate::loom::sync::atomic::{AtomicUsize, Ordering}; +use core::fmt::{Debug, Formatter}; + +/// A container for the current block index and block version +/// +/// `NE` is the number of elements in a block (index `0..NE`). `index == NE` marks a full block. +/// +/// Bits `0..=NE_LOG_CEIL`, where `NE_LOG_CEIL` is `(NE+1).next_power_of_two()).log2()` +/// are reserved for the index. +/// Bits `(NE_LOG_CEIL + 1)..` are used for the block version. The version field is +/// used to detect [ABA](https://en.wikipedia.org/wiki/ABA_problem) situations when accessing queue entries. +#[repr(transparent)] +#[derive(PartialEq, Eq, Copy, Clone)] +pub(crate) struct IndexAndVersion(usize); + +/// The index of the current element in the block +/// +/// 0 represents an empty block while NE represents a full block. +#[repr(transparent)] +pub(crate) struct Index(usize); + +impl Index { + /// Creates an Index for an empty block + #[inline(always)] + pub(crate) fn empty() -> Self { + Self(0) + } + + /// Creates an Index for a full block + #[inline(always)] + pub(crate) fn full() -> Self { + Self(NUM_ELEMENTS_PER_BLOCK) + } + + /// True if the block is full + #[inline(always)] + pub(crate) fn is_full(&self) -> bool { + self.0 == NUM_ELEMENTS_PER_BLOCK + } + + /// True if the block is empty + #[inline(always)] + pub(crate) fn is_empty(&self) -> bool { + self.0 == 0 + } +} + +// todo: use atomic usize after fixing overflow problem to support 32bit +#[repr(transparent)] +pub(crate) struct AtomicIndexAndVersion(AtomicUsize); + +impl IndexAndVersion<{ NE }> { + // 0 elements per block make no sense + const _ASSERT_NE_GREATER_ZERO: () = assert!(NE > 0); + const MIN_VERSION_BITS: u32 = 1; + // Subtract 1 to get the maximum number representable by that amount of bits and subtract another one to allow for + // representing the full block state (`idx == NE`). 
+ const MAX_NE: usize = 2_usize.pow(usize::BITS - Self::MIN_VERSION_BITS) - 2; + const _ASSERT_NE_MAX: () = assert!(NE <= Self::MAX_NE); + + #[inline(always)] + fn raw(&self) -> usize { + self.0 + } + + #[inline(always)] + fn max_version() -> usize { + let num_version_bits = usize::BITS - Self::ne_log() as u32; + 2_usize.pow(num_version_bits).wrapping_sub(1) + } + + /// Number of bits used for the Number of elements in a block + /// + /// Guaranteed to be at least 1. + #[inline] + fn ne_log() -> usize { + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NE_GREATER_ZERO; + #[allow(clippy::let_unit_value)] + let _ = Self::_ASSERT_NE_MAX; + // (const) integer logarithm is not stable yet, so we need to use floating point and + // rely on the compiler to optimize this away at compile time. + ((NE + 1).next_power_of_two() as f32).log2() as usize + } + + #[inline(always)] + pub(crate) fn new(version: usize, index: Index) -> Self { + debug_assert!(version <= Self::max_version()); + + Self(version.wrapping_shl(Self::ne_log() as u32) | index.0) + } + + #[inline(always)] + fn from_raw(raw: usize) -> Self { + Self(raw) + } + + #[inline(always)] + pub(crate) fn version(&self) -> usize { + self.0.wrapping_shr(Self::ne_log() as u32) + } + + /// Increment the version by one if this is the first block and reset index + #[inline] + pub(crate) fn next_version(&self, is_first_block: bool) -> Self { + let cur_version_shifted = self.0 & Self::version_mask(); + let first_bit_pos_version = Self::ne_log() as u32; + let new_version_shifted = cur_version_shifted + .wrapping_add((is_first_block as usize).wrapping_shl(first_bit_pos_version)); + // index is now zeroed. + Self(new_version_shifted) + } + + /// A bitmask for the bits used for the block index + #[inline(always)] + fn index_mask() -> usize { + // ne_log will be at least 1, so the subtraction will never wrap around + 1_usize.wrapping_shl(Self::ne_log() as u32) - 1 + } + + #[inline(always)] + fn version_mask() -> usize { + !Self::index_mask() + } + + #[inline(always)] + pub(crate) fn index(&self) -> Index { + // We are sure that the index we stored is valid + Index(self.raw_index()) + } + + #[inline(always)] + pub(crate) fn raw_index(&self) -> usize { + self.0 & Self::index_mask() + } + + #[inline(always)] + pub(crate) fn set_full(&self) -> Self { + Self((self.0 & Self::version_mask()) | NE) + } + + /// Increment the Index by `rhs`. + /// + /// # Safety + /// + /// The caller be sure that the result of self + rhs is <= NE. + #[inline(always)] + pub(crate) unsafe fn index_add_unchecked(&self, rhs: usize) -> Self { + debug_assert!(self.raw_index() + rhs <= NE); + Self(self.0.wrapping_add(rhs)) + } +} + +impl Debug for IndexAndVersion<{ NE }> { + fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { + f.debug_struct("IndexAndVersion") + .field("Index", &self.raw_index()) + .field("Version", &self.version()) + .finish() + } +} + +impl AtomicIndexAndVersion<{ NE }> { + #[inline(always)] + pub(crate) fn load(&self, order: Ordering) -> IndexAndVersion { + let v = self.0.load(order); + IndexAndVersion::from_raw(v) + } + + /// Creates a new instance for an `Owner` field (producer or consumer + pub(crate) fn new_owner(is_queue_head: bool) -> Self { + let empty_val: IndexAndVersion = if is_queue_head { + // The first block (head) starts at version one and with an empty index + // to indicate readiness to produce/consume once values where produced. 
+ IndexAndVersion::new(1, Index::empty()) + } else { + // The remaining blocks start one version behind and are marked as fully + // produced/consumed. + IndexAndVersion::new(0, Index::full()) + }; + Self(AtomicUsize::new(empty_val.raw())) + } + + /// Creates a new instance for a `Stealer` field. The main difference to + /// [new_owner](Self::new_owner) is that the stealer is always initialized as full, + /// i.e. not ready for stealing. This is because the queue head is reserved for the + /// consumer and the stealer may not steal from the same block the consumer is on. + pub(crate) fn new_stealer(is_queue_head: bool) -> Self { + let full_val: IndexAndVersion = + IndexAndVersion::new(is_queue_head as usize, Index::full()); + Self(AtomicUsize::new(full_val.raw())) + } + + #[inline(always)] + pub(crate) fn fetch_add(&self, val: usize, order: Ordering) -> IndexAndVersion { + let old = self.0.fetch_add(val, order); + IndexAndVersion::from_raw(old) + } + + #[inline(always)] + pub(crate) fn compare_exchange_weak( + &self, + current: IndexAndVersion, + new: IndexAndVersion, + success: Ordering, + failure: Ordering, + ) -> Result, IndexAndVersion> { + self.0 + .compare_exchange_weak(current.raw(), new.raw(), success, failure) + .map_err(IndexAndVersion::from_raw) + .map(IndexAndVersion::from_raw) + } + + #[inline(always)] + pub(crate) fn store(&self, val: IndexAndVersion, order: Ordering) { + self.0.store(val.raw(), order) + } + + #[inline(always)] + pub(crate) fn swap(&self, val: IndexAndVersion, order: Ordering) -> IndexAndVersion { + let old = self.0.swap(val.raw(), order); + IndexAndVersion::from_raw(old) + } +} + +impl Debug for AtomicIndexAndVersion<{ NE }> { + fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { + let val = self.load(Ordering::SeqCst); + f.write_fmt(format_args!("{:?}", val)) + } +} diff --git a/bwosqueue/tests/blocked_stealer.rs b/bwosqueue/tests/blocked_stealer.rs new file mode 100644 index 00000000000..a806b5973ad --- /dev/null +++ b/bwosqueue/tests/blocked_stealer.rs @@ -0,0 +1,66 @@ +#![cfg(not(loom))] +// A test to check behaviour is sane when the queue is full and empty at the same time +// when waiting on a stealer. +#[test] +fn blocked_stealer() { + const NUM_BLOCKS: usize = 4; + const ENTRIES_PER_BLOCK: usize = 4; + let (mut owner, stealer) = bwosqueue::new::(); + let mut total_enqueues = 0; + for i in 0..ENTRIES_PER_BLOCK + 2 { + owner.enqueue(i).unwrap(); + total_enqueues += 1; + } + let mut stolen_iter = stealer.steal_block().unwrap(); + // We have now reserved the items in the block but not dequeued them yet. 
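+    // (With NUM_BLOCKS = 4 and ENTRIES_PER_BLOCK = 4 the stealer has reserved the two
+    // committed entries of the second block, i.e. the values ENTRIES_PER_BLOCK and
+    // ENTRIES_PER_BLOCK + 1; the first block stays with the consumer, which is why those
+    // two values later come out of `stolen_iter` below.)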
+ + let mut total_dequeued = 0; + // consume first block + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, total_dequeued); + total_dequeued += 1; + } else { + break; + } + } + + // push until full + while owner.enqueue(total_enqueues).is_ok() { + total_enqueues += 1; + } + + loop { + if let Some(val) = owner.dequeue() { + // 2 entries where already reserved by stealer + assert_eq!(val, total_dequeued + 2); + total_dequeued += 1; + } else { + break; + } + } + + // We wrapped around once and are now stuck at the end of the first block waiting on the stealer + assert_eq!(total_enqueues, (NUM_BLOCKS + 1) * ENTRIES_PER_BLOCK); + assert_eq!(total_dequeued, (NUM_BLOCKS + 1) * ENTRIES_PER_BLOCK - 2); + + assert_eq!( + ENTRIES_PER_BLOCK, + stolen_iter.next().expect("No stolen item") + ); + // Stealer is not finished yet, so consumer and producer should still be stuck + assert_eq!(owner.enqueue(42), Err(42)); + assert_eq!(owner.dequeue(), None); + // Let the stealer finish + assert_eq!( + ENTRIES_PER_BLOCK + 1, + stolen_iter.next().expect("No stolen item") + ); + assert_eq!(stolen_iter.next(), None); + // Manually drop to unstuck + drop(stolen_iter); + // Producer and Consumer are now both unstuck, but the queue is still empty. + assert_eq!(owner.dequeue(), None); + assert!(owner.enqueue(1).is_ok()); + assert_eq!(total_enqueues, total_dequeued + 2); +} diff --git a/bwosqueue/tests/loom.rs b/bwosqueue/tests/loom.rs new file mode 100644 index 00000000000..829e289c6e6 --- /dev/null +++ b/bwosqueue/tests/loom.rs @@ -0,0 +1,367 @@ +use std::sync::atomic::Ordering::{Acquire, Relaxed, Release, SeqCst}; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; +use tracing::{event, span, Level}; + +#[cfg(loom)] +use loom::{ + self, model, + thread::{self, JoinHandle}, +}; +#[cfg(not(loom))] +use std::thread::{self, JoinHandle}; + +use bwosqueue::Owner; + +#[cfg(not(loom))] +fn model(f: F) +where + F: FnOnce(), +{ + f(); +} + +type QueueOwner = bwosqueue::Owner; +type QueueStealer = bwosqueue::Stealer; + +struct Stat { + sum: u64, + buf: u64, +} + +impl Stat { + fn new() -> Self { + Self { sum: 0, buf: 1 } + } + fn put(&mut self, owner: &mut QueueOwner) { + if owner.enqueue(self.buf).is_ok() { + event!(Level::INFO, "put succeeded"); + self.sum += self.buf; + self.buf <<= 1; + } else { + event!(Level::INFO, "put failed"); + } + } + + fn get(&mut self, owner: &mut QueueOwner) -> bool { + if let Some(data) = owner.dequeue() { + event!(Level::INFO, "get succeeded"); + self.sum += data; + true + } else { + event!(Level::INFO, "get failed"); + false + } + } + + fn steal(&mut self, stealer: &QueueStealer) { + event!(Level::INFO, "attempting to steal"); + if let Some(data) = stealer.steal() { + event!(Level::INFO, "steal succeeded"); + self.sum += data; + } else { + event!(Level::INFO, "steal failed"); + } + } +} + +fn thread0( + mut q_owner: QueueOwner, + mut enq_stat: Stat, + mut deq_stat: Stat, +) -> (QueueOwner, Stat, Stat) { + let owner = &mut q_owner; + + let span = span!(Level::INFO, "Owner Put A"); + let guard = span.enter(); + for i in 0..3 { + event!(Level::INFO, put_iter = i); + enq_stat.put(owner); + } + drop(guard); + + let span = span!(Level::INFO, "Owner Get B"); + let guard = span.enter(); + for i in 0..2 { + event!(Level::INFO, get_iter = i); + deq_stat.get(owner); + } + drop(guard); + + let span = span!(Level::INFO, "Owner Put C"); + let guard = span.enter(); + for i in 0..4 { + event!(Level::INFO, put_iter = i); + enq_stat.put(owner); + } + drop(guard); + + let span = 
span!(Level::INFO, "Owner Get D"); + let guard = span.enter(); + for i in 0..3 { + event!(Level::INFO, get_iter = i); + deq_stat.get(owner); + } + drop(guard); + + let span = span!(Level::INFO, "Owner Put E"); + let guard = span.enter(); + for i in 0..3 { + event!(Level::INFO, put_iter = i); + enq_stat.put(owner); + } + drop(guard); + + for _ in 0..4 { + deq_stat.get(owner); + } + + (q_owner, enq_stat, deq_stat) +} + +fn thread1(stealer: QueueStealer, mut s1: Stat) -> Stat { + let span = span!(Level::INFO, "Stealer 1"); + let _guard = span.enter(); + s1.steal(&stealer); + event!(Level::INFO, "Steal A done"); + s1 +} + +fn thread2(stealer: QueueStealer, mut s2: Stat) -> Stat { + let span = span!(Level::INFO, "Stealer 2"); + let _guard = span.enter(); + s2.steal(&stealer); + event!(Level::INFO, "Steal B done"); + s2.steal(&stealer); + event!(Level::INFO, "Steal C done"); + s2 +} + +fn test_inner(stealers: usize) { + assert!(stealers <= 2, "We only have 2 stealers implemented"); + let explored_executions = std::sync::Arc::new(std::sync::atomic::AtomicU64::new(0)); + let l_explored_executions = explored_executions.clone(); + println!(); + model(move || { + let current_iteration = l_explored_executions.fetch_add(1, Ordering::Relaxed); + let (owner, s1): (QueueOwner, QueueStealer) = bwosqueue::new(); + let enq_stat = Stat::new(); + let deq_stat = Stat::new(); + let s1_stat = Stat::new(); + let s2_stat = Stat::new(); + + let owner_handle = thread::spawn(move || thread0(owner, enq_stat, deq_stat)); + + let mut stealer_handles = Vec::with_capacity(stealers); + if stealers > 0 { + if stealers > 1 { + let s2 = s1.clone(); + stealer_handles.push(thread::spawn(move || thread2(s2, s2_stat))); + } + stealer_handles.push(thread::spawn(move || thread1(s1, s1_stat))); + } + + let (mut owner, enq_stat, mut deq_stat) = + owner_handle.join().expect("Owner thread panicked"); + let total_stolen: u64 = stealer_handles + .into_iter() + .map(|handle| handle.join().expect("Stealer thread panicked").sum) + .sum(); + while deq_stat.get(&mut owner) {} + assert_eq!(enq_stat.sum, deq_stat.sum + total_stolen); + + if current_iteration > 0 && current_iteration % 50_000 == 0 { + println!("Explored {current_iteration} iterations"); + } + }); + println!( + "Loom model explored {} interleavings.", + explored_executions.load(SeqCst) + ); +} + +#[test] +fn no_stealer() { + test_inner(0); +} + +#[test] +fn one_stealer() { + test_inner(1); +} + +// This test will take a very long time with loom, so ignore it unless specifically requested +#[test] +#[cfg_attr(loom, ignore)] +fn two_stealers() { + test_inner(2); +} + +#[test] +fn steal_block_loom() { + model(|| { + const NB: usize = 4; + const NE: usize = 4; + let (mut owner, stealer) = bwosqueue::new::(); + // explicitly not a loom type, since this only for the test and we do not care about reorderings + let total_dequeues: std::sync::Arc = std::sync::Arc::new(AtomicU32::new(0)); + + let mut total_enq = 0; + while owner.enqueue(5).is_ok() { + total_enq += 1; + } + + let mut handles: [Option>>; NB - 1] = + array_init::array_init(|_| None); + for i in 0..NB - 1 { + let local_stealer = stealer.clone(); + let local_total_dequeues = total_dequeues.clone(); + let handle = thread::spawn(move || { + let (mut dst_owner, _) = bwosqueue::new::(); + // Any ordering of stealers and consumer is possible, so maybe the consumer consumed everything + // already and there is nothing left to steal. + // Stealing could fail sporadically due to steal_block nature, but we can't do much about that. 
+ if let Some(stolen_iter) = local_stealer.steal_block() { + let stolen_len = + unsafe { dst_owner.enqueue_batch_unchecked(Box::new(stolen_iter)) }; + assert!( + stolen_len > 0, + "Successfull steal implies at least one stolen item" + ); + loop { + if let Some(val) = dst_owner.dequeue() { + assert_eq!(val, 5); + local_total_dequeues.fetch_add(1, Relaxed); + } else { + break; + }; + } + } + + dst_owner + }); + handles[i] = Some(handle); + } + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, 5); + total_dequeues.fetch_add(1, Relaxed); + } else { + break; + }; + } + + for handle in handles { + let mut dst_queue = handle + .expect("Handle not initialized") + .join() + .expect("Join failed"); + #[cfg(feature = "stats")] + assert!(!dst_queue.can_consume()); + assert!(dst_queue.dequeue().is_none()); + assert!(dst_queue.dequeue_block().is_none()); + } + + std::sync::atomic::fence(SeqCst); + assert_eq!(total_dequeues.load(SeqCst), total_enq); + + #[cfg(feature = "stats")] + assert!(!owner.can_consume()); + }); +} + +#[test] +fn queue_loom() { + model(|| { + const NB: usize = 4; + const NE: usize = 8; + const ITERATIONS: u32 = 80; + let (mut owner, stealer) = bwosqueue::new::(); + // explicitly not `loom` types, since this only for the test and we do not care about reorderings + let total_dequeues = std::sync::Arc::new(AtomicU32::new(0)); + let finished = std::sync::Arc::new(AtomicBool::new(false)); + + let owner_total_deq = total_dequeues.clone(); + let owner_finished = finished.clone(); + let owner_thread_handle = thread::spawn(move || { + let mut total_enq: u32 = 0; + while total_enq < ITERATIONS { + while total_enq < ITERATIONS && owner.enqueue(5).is_ok() { + total_enq += 1; + } + + loop { + if let Some(res) = owner.dequeue() { + assert_eq!(res, 5); + owner_total_deq.fetch_add(1, Relaxed); + } else { + break; + } + } + } + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, 5); + owner_total_deq.fetch_add(1, Relaxed); + } else { + break; + }; + } + + #[cfg(feature = "stats")] + assert!(!owner.can_consume()); + owner_finished.store(true, Release); + total_enq + }); + + let mut handles: [Option>; 2] = array_init::array_init(|_| None); + for i in 0..2 { + let local_stealer = stealer.clone(); + let local_total_dequeues = total_dequeues.clone(); + let local_finished = finished.clone(); + + let handle = thread::spawn(move || { + let (mut dst_owner, _) = bwosqueue::new::(); + + // Any ordering of stealers and consumer is possible, so maybe the consumer consumed everything + // already and there is nothing left to steal. + // Stealing could fail sporadically due to steal_block nature, but we can't do much about that. 
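+                // Keep stealing until the owner signals completion: the `Acquire` load of
+                // `finished` pairs with the owner's `Release` store, so once it reads `true`
+                // the owner has already enqueued and drained everything still available to it.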
+ while !local_finished.load(Acquire) { + if let Some(stolen_iter) = local_stealer.steal_block() { + let num_stolen = + unsafe { dst_owner.enqueue_batch_unchecked(Box::new(stolen_iter)) }; + assert!( + num_stolen > 0, + "Successful steal implies at least one stolen item" + ); + + loop { + if let Some(val) = dst_owner.dequeue() { + assert_eq!(val, 5); + local_total_dequeues.fetch_add(1, Relaxed); + } else { + break; + }; + } + } + } + dst_owner + }); + handles[i] = Some(handle); + } + + for handle in handles { + let mut dst_queue = handle + .expect("Handle not initialized") + .join() + .expect("Join failed"); + #[cfg(feature = "stats")] + assert!(!dst_queue.can_consume()); + assert!(dst_queue.dequeue().is_none()); + assert!(dst_queue.dequeue_block().is_none()); + } + + let total_enqueued = owner_thread_handle.join().expect("Owner thread failed"); + std::sync::atomic::fence(SeqCst); + assert_eq!(total_dequeues.load(SeqCst), total_enqueued); + }); +} diff --git a/bwosqueue/tests/simple.rs b/bwosqueue/tests/simple.rs new file mode 100644 index 00000000000..d37840b9019 --- /dev/null +++ b/bwosqueue/tests/simple.rs @@ -0,0 +1,71 @@ +//! Simple, single threaded test cases + +#[cfg(not(loom))] +#[test] +fn simple_enqueue_dequeue() { + const NB: usize = 8; + const NE: usize = 1024; + let (mut owner, _) = bwosqueue::new::(); + + let mut i = 0; + while owner.enqueue(i).is_ok() { + i += 1; + } + + i = 0; + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, i); + i += 1; + } else { + break; + }; + } + + // use owner outside of iter to control drop + #[cfg(feature = "stats")] + assert!(!owner.can_consume()) +} + +#[cfg(not(loom))] +#[test] +fn steal_block() { + const NB: usize = 8; + const NE: usize = 1024; + let (mut owner, stealer) = bwosqueue::new::(); + let (mut dst_owner, _) = bwosqueue::new::(); + + let mut i = 0; + while owner.enqueue(i).is_ok() { + i += 1; + } + // steal all blocks except the consumer block + for _ in 0..NB - 1 { + let items = stealer.steal_block().unwrap(); + unsafe { dst_owner.enqueue_batch_unchecked(Box::new(items)) }; + } + + i = 0; + loop { + if let Some(val) = owner.dequeue() { + assert_eq!(val, i); + i += 1; + } else { + break; + }; + } + + #[cfg(feature = "stats")] + assert!(!owner.can_consume()); + + loop { + if let Some(val) = dst_owner.dequeue() { + assert_eq!(val, i); + i += 1; + } else { + break; + }; + } + #[cfg(feature = "stats")] + assert!(!dst_owner.can_consume()); +} diff --git a/bwosqueue/tests/steal_block.rs b/bwosqueue/tests/steal_block.rs new file mode 100644 index 00000000000..90cc858ffb2 --- /dev/null +++ b/bwosqueue/tests/steal_block.rs @@ -0,0 +1,228 @@ +extern crate core; + +use bwosqueue::{Owner, Stealer}; +use core::time; +use core_affinity::CoreId; +use rand; +use rand::Rng; +use std::arch::asm; +use std::sync::atomic::AtomicBool; +use std::sync::atomic::Ordering::{Relaxed, SeqCst}; +use std::sync::Arc; +use std::thread; + +#[derive(Copy, Clone)] +struct TestParams { + num_stealers: usize, + duration: usize, + idle_loop: usize, + push_percentage: usize, + stealer_core: Option, + steal_blocks: bool, +} + +impl Default for TestParams { + fn default() -> Self { + Self { + num_stealers: 0, + duration: 1, + idle_loop: 0, + push_percentage: 50, + stealer_core: None, + steal_blocks: false, + } + } +} +fn owner_thread( + mut owner: Owner, + push_percentage: usize, + stop_signal: Arc, +) -> (usize, usize) { + let mut counter: usize = 0; + let mut rng = rand::thread_rng(); + + let mut enqueued_count = 0; + let mut dequeued_count = 0; + + loop { + 
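+        // Randomly alternate between draining the queue and filling it so that stealers
+        // observe both nearly-empty and nearly-full states; the stop flag is only polled
+        // every 1000 iterations to keep the hot loop cheap.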
if rng.gen_range(1..=100) > push_percentage { + loop { + if let Some(data) = owner.dequeue() { + assert_eq!(data, 12345); + dequeued_count += 1; + } else { + break; + } + } + } else { + while owner.enqueue(12345).is_ok() { + enqueued_count += 1; + } + } + counter = counter.wrapping_add(1); + if counter % 1000 == 0 { + if stop_signal.load(Relaxed) { + break; + } + } + } + // dequeue until empty + loop { + match owner.dequeue() { + Some(data) => { + assert_eq!(data, 12345); + dequeued_count += 1; + } + None => { + break; + } + } + } + + (enqueued_count, dequeued_count) +} + +fn stealer_thread( + stealer: Stealer, + stealer_core: Option, + stop_signal: Arc, + stealer_work_noops: usize, +) -> usize { + if let Some(core_id) = stealer_core { + core_affinity::set_for_current(core_id); + } + let mut num_stolen = 0; + let mut counter: usize = 0; + loop { + if let Some(data) = stealer.steal() { + assert!(data > 0); + num_stolen += 1; + for _ in 0..stealer_work_noops { + unsafe { asm!("nop") } + } + } + counter = counter.wrapping_add(1); + if counter % 1000 == 0 && stop_signal.load(Relaxed) { + break; + } + } + num_stolen +} + +fn steal_block_thread( + stealer: Stealer, + stealer_core: Option, + stop_signal: Arc, + stealer_work_noops: usize, +) -> usize { + if let Some(core_id) = stealer_core { + core_affinity::set_for_current(core_id); + } + let mut num_stolen = 0; + let mut counter: usize = 0; + loop { + if let Some(mut items) = stealer.steal_block() { + assert!(items.len() > 0); + let stolen = items.len(); + num_stolen += items.len(); + for _ in 0..stealer_work_noops { + unsafe { asm!("nop") } + } + // start at one to account for `data` which was not enqueued into the local queue. + let mut local_dequeues = 0; + loop { + if let Some(data) = items.next() { + assert!(data > 0); + local_dequeues += 1; + } else { + break; + } + } + assert_eq!(local_dequeues, stolen); + } + counter = counter.wrapping_add(1); + if counter % 1000 == 0 && stop_signal.load(Relaxed) { + break; + } + } + num_stolen +} + +fn test_queue(params: TestParams) { + let stop = Arc::new(AtomicBool::new(false)); + let (owner, stealer) = bwosqueue::new::(); + + let producer_stop = stop.clone(); + let owner_handle = + thread::spawn(move || owner_thread(owner, params.push_percentage, producer_stop)); + let mut stealer_handles = Vec::with_capacity(params.num_stealers); + for k in 0..params.num_stealers { + let stealer = stealer.clone(); + let stealer_stop = stop.clone(); + + let stealer_core = if let Some(core_id) = params.stealer_core { + Some(CoreId { id: core_id }) + } else { + Some(CoreId { id: k + 1 }) + }; + let stealer_handle = thread::spawn(move || { + if params.steal_blocks { + steal_block_thread(stealer, stealer_core, stealer_stop, params.idle_loop) + } else { + stealer_thread(stealer, stealer_core, stealer_stop, params.idle_loop) + } + }); + stealer_handles.push(stealer_handle); + } + + thread::sleep(time::Duration::from_secs(params.duration as u64)); + println!("Test finished"); + stop.store(true, SeqCst); + let total_stolen: usize = stealer_handles + .into_iter() + .map(|handle| handle.join().expect("Joining stealer failed")) + .sum(); + println!("Waiting for owner to finish"); + let (total_enqueued, total_dequeued) = owner_handle.join().expect("Failed to join owner"); + assert_eq!(total_enqueued, total_dequeued + total_stolen); +} + +#[test] +fn no_stealers_short() { + let p = TestParams { + num_stealers: 0, + duration: 10, + idle_loop: 0, + push_percentage: 50, + stealer_core: None, + steal_blocks: false, + }; + 
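+    // With no stealers the owner must account for every item itself, so the final
+    // assertion in `test_queue` reduces to `total_enqueued == total_dequeued`.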
test_queue(p); +} + +#[test] +fn with_stealers_short() { + let p = TestParams { + num_stealers: 2, + duration: 10, + idle_loop: 0, + push_percentage: 50, + stealer_core: None, + steal_blocks: false, + }; + test_queue(p); +} + +#[test] +#[ignore] +fn with_stealers_long() { + let p = TestParams { + num_stealers: 5, + duration: 100, + idle_loop: 500000, + push_percentage: 70, + stealer_core: None, + steal_blocks: true, + }; + test_queue(p); +} diff --git a/tokio/Cargo.toml b/tokio/Cargo.toml index 7dffee1ab7f..5e82094b195 100644 --- a/tokio/Cargo.toml +++ b/tokio/Cargo.toml @@ -24,7 +24,8 @@ keywords = ["io", "async", "non-blocking", "futures"] [features] # Include nothing by default default = [] - +# todo: behind unstable flag +bwos = ["bwosqueue"] # enable everything full = [ "fs", @@ -99,6 +100,8 @@ autocfg = "1.1" [dependencies] tokio-macros = { version = "~2.1.0", path = "../tokio-macros", optional = true } +bwosqueue = { path = "../bwosqueue", features = ["stats"], optional = true } + pin-project-lite = "0.2.0" # Everything else is optional... diff --git a/tokio/src/runtime/builder.rs b/tokio/src/runtime/builder.rs index dda21a3ae27..523964832ad 100644 --- a/tokio/src/runtime/builder.rs +++ b/tokio/src/runtime/builder.rs @@ -188,11 +188,20 @@ cfg_unstable! { pub(crate) type ThreadNameFn = std::sync::Arc String + Send + Sync + 'static>; +#[derive(Clone, Copy)] +pub(crate) enum MultiThreadFlavor { + Default, + // The size of Bwos (and potentially also the default queue) could be configured in + // the future. + #[cfg(feature = "bwos")] + Bwos, +} + #[derive(Clone, Copy)] pub(crate) enum Kind { CurrentThread, #[cfg(all(feature = "rt-multi-thread", not(tokio_wasi)))] - MultiThread, + MultiThread(MultiThreadFlavor), } impl Builder { @@ -222,7 +231,17 @@ impl Builder { #[cfg_attr(docsrs, doc(cfg(feature = "rt-multi-thread")))] pub fn new_multi_thread() -> Builder { // The number `61` is fairly arbitrary. I believe this value was copied from golang. - Builder::new(Kind::MultiThread, 61, 61) + Builder::new(Kind::MultiThread(MultiThreadFlavor::Default), 61, 61) + } + + /// Returns a new builder with the BWoS multi thread scheduler selected. + /// + /// Configuration methods can be chained on the return value. + #[cfg(all(feature = "rt-multi-thread", feature = "bwos"))] + #[cfg_attr(docsrs, doc(cfg(feature = "bwos")))] + pub fn new_multi_thread_bwos() -> Builder { + // The number `61` is copied from `new_multi_thread()`. + Builder::new(Kind::MultiThread(MultiThreadFlavor::Bwos), 61, 61) } } @@ -649,7 +668,7 @@ impl Builder { match &self.kind { Kind::CurrentThread => self.build_current_thread_runtime(), #[cfg(all(feature = "rt-multi-thread", not(tokio_wasi)))] - Kind::MultiThread => self.build_threaded_runtime(), + Kind::MultiThread(flavor) => self.build_threaded_runtime(*flavor), } } @@ -658,7 +677,7 @@ impl Builder { enable_pause_time: match self.kind { Kind::CurrentThread => true, #[cfg(all(feature = "rt-multi-thread", not(tokio_wasi)))] - Kind::MultiThread => false, + Kind::MultiThread(_) => false, }, enable_io: self.enable_io, enable_time: self.enable_time, @@ -1163,7 +1182,7 @@ cfg_test_util! { cfg_rt_multi_thread! { impl Builder { - fn build_threaded_runtime(&mut self) -> io::Result { + fn build_threaded_runtime(&mut self, flavor: MultiThreadFlavor) -> io::Result { use crate::loom::sys::num_cpus; use crate::runtime::{Config, runtime::Scheduler}; use crate::runtime::scheduler::{self, MultiThread}; @@ -1183,6 +1202,7 @@ cfg_rt_multi_thread! 
{ let (scheduler, handle, launch) = MultiThread::new( core_threads, + flavor, driver, driver_handle, blocking_spawner, diff --git a/tokio/src/runtime/scheduler/multi_thread/mod.rs b/tokio/src/runtime/scheduler/multi_thread/mod.rs index 47cd1f3d7ae..35cdb56ece6 100644 --- a/tokio/src/runtime/scheduler/multi_thread/mod.rs +++ b/tokio/src/runtime/scheduler/multi_thread/mod.rs @@ -11,6 +11,8 @@ pub(crate) use park::{Parker, Unparker}; pub(crate) mod queue; +//pub(crate) mod queue; + mod worker; pub(crate) use worker::Launch; @@ -24,6 +26,7 @@ use crate::runtime::{ }; use crate::util::RngSeedGenerator; +use crate::runtime::builder::MultiThreadFlavor; use std::fmt; use std::future::Future; @@ -35,6 +38,7 @@ pub(crate) struct MultiThread; impl MultiThread { pub(crate) fn new( size: usize, + flavor: MultiThreadFlavor, driver: Driver, driver_handle: driver::Handle, blocking_spawner: blocking::Spawner, @@ -44,6 +48,7 @@ impl MultiThread { let parker = Parker::new(driver); let (handle, launch) = worker::create( size, + flavor, parker, driver_handle, blocking_spawner, diff --git a/tokio/src/runtime/scheduler/multi_thread/queue.rs b/tokio/src/runtime/scheduler/multi_thread/queue.rs index faf56db2e91..71f9f7e9d13 100644 --- a/tokio/src/runtime/scheduler/multi_thread/queue.rs +++ b/tokio/src/runtime/scheduler/multi_thread/queue.rs @@ -1,533 +1,82 @@ -//! Run-queue structures to support a work-stealing scheduler - -use crate::loom::cell::UnsafeCell; -use crate::loom::sync::Arc; -use crate::runtime::task::{self, Inject}; -use crate::runtime::MetricsBatch; - -use std::mem::{self, MaybeUninit}; -use std::ptr; -use std::sync::atomic::Ordering::{AcqRel, Acquire, Relaxed, Release}; - -// Use wider integers when possible to increase ABA resilience. -// -// See issue #5041: . -cfg_has_atomic_u64! { - type UnsignedShort = u32; - type UnsignedLong = u64; - type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU32; - type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU64; -} -cfg_not_has_atomic_u64! { - type UnsignedShort = u16; - type UnsignedLong = u32; - type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU16; - type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU32; -} - -/// Producer handle. May only be used from a single thread. -pub(crate) struct Local { - inner: Arc>, -} - -/// Consumer handle. May be used from many threads. -pub(crate) struct Steal(Arc>); - -pub(crate) struct Inner { - /// Concurrently updated by many threads. - /// - /// Contains two `UnsignedShort` values. The LSB byte is the "real" head of - /// the queue. The `UnsignedShort` in the MSB is set by a stealer in process - /// of stealing values. It represents the first value being stolen in the - /// batch. The `UnsignedShort` indices are intentionally wider than strictly - /// required for buffer indexing in order to provide ABA mitigation and make - /// it possible to distinguish between full and empty buffers. - /// - /// When both `UnsignedShort` values are the same, there is no active - /// stealer. - /// - /// Tracking an in-progress stealer prevents a wrapping scenario. - head: AtomicUnsignedLong, - - /// Only updated by producer thread but read by many threads. - tail: AtomicUnsignedShort, - - /// Elements - buffer: Box<[UnsafeCell>>; LOCAL_QUEUE_CAPACITY]>, -} - -unsafe impl Send for Inner {} -unsafe impl Sync for Inner {} - -#[cfg(not(loom))] -const LOCAL_QUEUE_CAPACITY: usize = 256; - -// Shrink the size of the local queue when using loom. 
This shouldn't impact -// logic, but allows loom to test more edge cases in a reasonable a mount of -// time. -#[cfg(loom)] -const LOCAL_QUEUE_CAPACITY: usize = 4; - -const MASK: usize = LOCAL_QUEUE_CAPACITY - 1; - -// Constructing the fixed size array directly is very awkward. The only way to -// do it is to repeat `UnsafeCell::new(MaybeUninit::uninit())` 256 times, as -// the contents are not Copy. The trick with defining a const doesn't work for -// generic types. -fn make_fixed_size(buffer: Box<[T]>) -> Box<[T; LOCAL_QUEUE_CAPACITY]> { - assert_eq!(buffer.len(), LOCAL_QUEUE_CAPACITY); - - // safety: We check that the length is correct. - unsafe { Box::from_raw(Box::into_raw(buffer).cast()) } -} - -/// Create a new local run-queue -pub(crate) fn local() -> (Steal, Local) { - let mut buffer = Vec::with_capacity(LOCAL_QUEUE_CAPACITY); - - for _ in 0..LOCAL_QUEUE_CAPACITY { - buffer.push(UnsafeCell::new(MaybeUninit::uninit())); +#[cfg(feature = "bwos")] +mod bwosq; + +mod tokioq; + +use crate::runtime::builder::MultiThreadFlavor; +use crate::runtime::task::Inject; +use crate::runtime::{task, MetricsBatch}; + +pub(crate) fn local( + flavor: MultiThreadFlavor, +) -> ( + Box + Send + Sync>, + Box + Send + Sync>, +) { + match flavor { + MultiThreadFlavor::Default => tokioq::local(), + #[cfg(feature = "bwos")] + MultiThreadFlavor::Bwos => bwosq::local(), } - - let inner = Arc::new(Inner { - head: AtomicUnsignedLong::new(0), - tail: AtomicUnsignedShort::new(0), - buffer: make_fixed_size(buffer.into_boxed_slice()), - }); - - let local = Local { - inner: inner.clone(), - }; - - let remote = Steal(inner); - - (remote, local) } -impl Local { +pub(crate) trait Owner: Send + Sync { /// Returns true if the queue has entries that can be stolen. - pub(crate) fn is_stealable(&self) -> bool { - !self.inner.is_empty() - } + fn is_stealable(&self) -> bool; - /// Returns false if there are any entries in the queue - /// - /// Separate to is_stealable so that refactors of is_stealable to "protect" - /// some tasks from stealing won't affect this - pub(crate) fn has_tasks(&self) -> bool { - !self.inner.is_empty() - } + /// Returns true if there are entries in the queue. + fn has_tasks(&self) -> bool; /// Pushes a task to the back of the local queue, skipping the LIFO slot. - pub(crate) fn push_back( + fn push_back( &mut self, - mut task: task::Notified, + task: task::Notified, inject: &Inject, metrics: &mut MetricsBatch, - ) { - let tail = loop { - let head = self.inner.head.load(Acquire); - let (steal, real) = unpack(head); - - // safety: this is the **only** thread that updates this cell. - let tail = unsafe { self.inner.tail.unsync_load() }; - - if tail.wrapping_sub(steal) < LOCAL_QUEUE_CAPACITY as UnsignedShort { - // There is capacity for the task - break tail; - } else if steal != real { - // Concurrently stealing, this will free up capacity, so only - // push the task onto the inject queue - inject.push(task); - return; - } else { - // Push the current task and half of the queue into the - // inject queue. - match self.push_overflow(task, real, tail, inject, metrics) { - Ok(_) => return, - // Lost the race, try again - Err(v) => { - task = v; - } - } - } - }; - - // Map the position to a slot index. - let idx = tail as usize & MASK; - - self.inner.buffer[idx].with_mut(|ptr| { - // Write the task to the slot - // - // Safety: There is only one producer and the above `if` - // condition ensures we don't touch a cell if there is a - // value, thus no consumer. 
- unsafe { - ptr::write((*ptr).as_mut_ptr(), task); - } - }); + ); - // Make the task available. Synchronizes with a load in - // `steal_into2`. - self.inner.tail.store(tail.wrapping_add(1), Release); - } - - /// Moves a batch of tasks into the inject queue. + /// Push a batch of tasks to the back of the local queue /// - /// This will temporarily make some of the tasks unavailable to stealers. - /// Once `push_overflow` is done, a notification is sent out, so if other - /// workers "missed" some of the tasks during a steal, they will get - /// another opportunity. - #[inline(never)] - fn push_overflow( + /// # Safety: + /// + /// The caller must ensure that the queue has enough capacity to accept + /// all tasks, e.g. by calling `can_enqueue` beforehand. + unsafe fn push_back_batch_unchecked( &mut self, - task: task::Notified, - head: UnsignedShort, - tail: UnsignedShort, - inject: &Inject, + tasks: Box> + '_>, metrics: &mut MetricsBatch, - ) -> Result<(), task::Notified> { - /// How many elements are we taking from the local queue. - /// - /// This is one less than the number of tasks pushed to the inject - /// queue as we are also inserting the `task` argument. - const NUM_TASKS_TAKEN: UnsignedShort = (LOCAL_QUEUE_CAPACITY / 2) as UnsignedShort; - - assert_eq!( - tail.wrapping_sub(head) as usize, - LOCAL_QUEUE_CAPACITY, - "queue is not full; tail = {}; head = {}", - tail, - head - ); - - let prev = pack(head, head); - - // Claim a bunch of tasks - // - // We are claiming the tasks **before** reading them out of the buffer. - // This is safe because only the **current** thread is able to push new - // tasks. - // - // There isn't really any need for memory ordering... Relaxed would - // work. This is because all tasks are pushed into the queue from the - // current thread (or memory has been acquired if the local queue handle - // moved). - if self - .inner - .head - .compare_exchange( - prev, - pack( - head.wrapping_add(NUM_TASKS_TAKEN), - head.wrapping_add(NUM_TASKS_TAKEN), - ), - Release, - Relaxed, - ) - .is_err() - { - // We failed to claim the tasks, losing the race. Return out of - // this function and try the full `push` routine again. The queue - // may not be full anymore. - return Err(task); - } - - /// An iterator that takes elements out of the run queue. - struct BatchTaskIter<'a, T: 'static> { - buffer: &'a [UnsafeCell>>; LOCAL_QUEUE_CAPACITY], - head: UnsignedLong, - i: UnsignedLong, - } - impl<'a, T: 'static> Iterator for BatchTaskIter<'a, T> { - type Item = task::Notified; - - #[inline] - fn next(&mut self) -> Option> { - if self.i == UnsignedLong::from(NUM_TASKS_TAKEN) { - None - } else { - let i_idx = self.i.wrapping_add(self.head) as usize & MASK; - let slot = &self.buffer[i_idx]; - - // safety: Our CAS from before has assumed exclusive ownership - // of the task pointers in this range. - let task = slot.with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); - - self.i += 1; - Some(task) - } - } - } - - // safety: The CAS above ensures that no consumer will look at these - // values again, and we are the only producer. - let batch_iter = BatchTaskIter { - buffer: &self.inner.buffer, - head: head as UnsignedLong, - i: 0, - }; - inject.push_batch(batch_iter.chain(std::iter::once(task))); - - // Add 1 to factor in the task currently being scheduled. - metrics.incr_overflow_count(); - - Ok(()) - } - - /// Pops a task from the local queue. 
- pub(crate) fn pop(&mut self) -> Option> { - let mut head = self.inner.head.load(Acquire); - - let idx = loop { - let (steal, real) = unpack(head); - - // safety: this is the **only** thread that updates this cell. - let tail = unsafe { self.inner.tail.unsync_load() }; - - if real == tail { - // queue is empty - return None; - } - - let next_real = real.wrapping_add(1); + ); - // If `steal == real` there are no concurrent stealers. Both `steal` - // and `real` are updated. - let next = if steal == real { - pack(next_real, next_real) - } else { - assert_ne!(steal, next_real); - pack(steal, next_real) - }; - - // Attempt to claim a task. - let res = self - .inner - .head - .compare_exchange(head, next, AcqRel, Acquire); + /// Ok ( if enqueuing of `num` items will succeed. + /// + /// Returns an optional hint how many items can be enqueued. + fn can_enqueue(&self, num: u16) -> Result<(), Option>; - match res { - Ok(_) => break real as usize & MASK, - Err(actual) => head = actual, - } - }; + /// Pop one task from the front of the queue. + fn pop(&mut self) -> Option>; - Some(self.inner.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) - } + // /// approximate length of the queue + // fn len(&self) -> usize { + // todo!() + // } } -impl Steal { - pub(crate) fn is_empty(&self) -> bool { - self.0.is_empty() - } +pub(crate) trait Stealer: Send + Sync { + // Todo: `is_empty()` is hard to implement for BWoS, since + // the stealer doesn't really have access to this kind of information, + // at least not in an inexpensive way, that doesn't interfere with the + // owner. Check if the upper layers really need this function! + fn is_empty(&self) -> bool; /// Steals half the tasks from self and place them into `dst`. - pub(crate) fn steal_into( + fn steal_into( &self, - dst: &mut Local, + dst: &mut dyn Owner, dst_metrics: &mut MetricsBatch, - ) -> Option> { - // Safety: the caller is the only thread that mutates `dst.tail` and - // holds a mutable reference. - let dst_tail = unsafe { dst.inner.tail.unsync_load() }; - - // To the caller, `dst` may **look** empty but still have values - // contained in the buffer. If another thread is concurrently stealing - // from `dst` there may not be enough capacity to steal. - let (steal, _) = unpack(dst.inner.head.load(Acquire)); - - if dst_tail.wrapping_sub(steal) > LOCAL_QUEUE_CAPACITY as UnsignedShort / 2 { - // we *could* try to steal less here, but for simplicity, we're just - // going to abort. - return None; - } - - // Steal the tasks into `dst`'s buffer. This does not yet expose the - // tasks in `dst`. - let mut n = self.steal_into2(dst, dst_tail); - - if n == 0 { - // No tasks were stolen - return None; - } + ) -> Option>; - dst_metrics.incr_steal_count(n as u16); - dst_metrics.incr_steal_operations(); - - // We are returning a task here - n -= 1; - - let ret_pos = dst_tail.wrapping_add(n); - let ret_idx = ret_pos as usize & MASK; - - // safety: the value was written as part of `steal_into2` and not - // exposed to stealers, so no other thread can access it. - let ret = dst.inner.buffer[ret_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); - - if n == 0 { - // The `dst` queue is empty, but a single task was stolen - return Some(ret); - } - - // Make the stolen items available to consumers - dst.inner.tail.store(dst_tail.wrapping_add(n), Release); - - Some(ret) + cfg_metrics! { + /// Number of tasks in the queue. + fn len(&self) -> usize; } - - // Steal tasks from `self`, placing them into `dst`. 
Returns the number of - // tasks that were stolen. - fn steal_into2(&self, dst: &mut Local, dst_tail: UnsignedShort) -> UnsignedShort { - let mut prev_packed = self.0.head.load(Acquire); - let mut next_packed; - - let n = loop { - let (src_head_steal, src_head_real) = unpack(prev_packed); - let src_tail = self.0.tail.load(Acquire); - - // If these two do not match, another thread is concurrently - // stealing from the queue. - if src_head_steal != src_head_real { - return 0; - } - - // Number of available tasks to steal - let n = src_tail.wrapping_sub(src_head_real); - let n = n - n / 2; - - if n == 0 { - // No tasks available to steal - return 0; - } - - // Update the real head index to acquire the tasks. - let steal_to = src_head_real.wrapping_add(n); - assert_ne!(src_head_steal, steal_to); - next_packed = pack(src_head_steal, steal_to); - - // Claim all those tasks. This is done by incrementing the "real" - // head but not the steal. By doing this, no other thread is able to - // steal from this queue until the current thread completes. - let res = self - .0 - .head - .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); - - match res { - Ok(_) => break n, - Err(actual) => prev_packed = actual, - } - }; - - assert!( - n <= LOCAL_QUEUE_CAPACITY as UnsignedShort / 2, - "actual = {}", - n - ); - - let (first, _) = unpack(next_packed); - - // Take all the tasks - for i in 0..n { - // Compute the positions - let src_pos = first.wrapping_add(i); - let dst_pos = dst_tail.wrapping_add(i); - - // Map to slots - let src_idx = src_pos as usize & MASK; - let dst_idx = dst_pos as usize & MASK; - - // Read the task - // - // safety: We acquired the task with the atomic exchange above. - let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); - - // Write the task to the new slot - // - // safety: `dst` queue is empty and we are the only producer to - // this queue. - dst.inner.buffer[dst_idx] - .with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); - } - - let mut prev_packed = next_packed; - - // Update `src_head_steal` to match `src_head_real` signalling that the - // stealing routine is complete. - loop { - let head = unpack(prev_packed).1; - next_packed = pack(head, head); - - let res = self - .0 - .head - .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); - - match res { - Ok(_) => return n, - Err(actual) => { - let (actual_steal, actual_real) = unpack(actual); - - assert_ne!(actual_steal, actual_real); - - prev_packed = actual; - } - } - } - } -} - -cfg_metrics! { - impl Steal { - pub(crate) fn len(&self) -> usize { - self.0.len() as _ - } - } -} - -impl Clone for Steal { - fn clone(&self) -> Steal { - Steal(self.0.clone()) - } -} - -impl Drop for Local { - fn drop(&mut self) { - if !std::thread::panicking() { - assert!(self.pop().is_none(), "queue not empty"); - } - } -} - -impl Inner { - fn len(&self) -> UnsignedShort { - let (_, head) = unpack(self.head.load(Acquire)); - let tail = self.tail.load(Acquire); - - tail.wrapping_sub(head) - } - - fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -/// Split the head value into the real head and the index a stealer is working -/// on. 
-fn unpack(n: UnsignedLong) -> (UnsignedShort, UnsignedShort) { - let real = n & UnsignedShort::MAX as UnsignedLong; - let steal = n >> (mem::size_of::() * 8); - - (steal as UnsignedShort, real as UnsignedShort) -} - -/// Join the two head values -fn pack(steal: UnsignedShort, real: UnsignedShort) -> UnsignedLong { - (real as UnsignedLong) | ((steal as UnsignedLong) << (mem::size_of::() * 8)) -} - -#[test] -fn test_local_queue_capacity() { - assert!(LOCAL_QUEUE_CAPACITY - 1 <= u8::MAX as usize); } diff --git a/tokio/src/runtime/scheduler/multi_thread/queue/bwosq.rs b/tokio/src/runtime/scheduler/multi_thread/queue/bwosq.rs new file mode 100644 index 00000000000..868fd647b13 --- /dev/null +++ b/tokio/src/runtime/scheduler/multi_thread/queue/bwosq.rs @@ -0,0 +1,148 @@ +use std::convert::TryInto; + +use crate::runtime::scheduler::multi_thread::queue::Owner as OwnerTrait; +use crate::runtime::task::{self, Inject, Notified}; +use crate::runtime::MetricsBatch; +use bwosqueue::{Owner, Stealer}; + +// todo: Discuss using const generics or runtime values. Benchmark performance difference. +const NUM_BLOCKS: usize = 8; +const ELEMENTS_PER_BLOCK: usize = 32; + +/// Producer handle. May only be used from a single thread. +pub(crate) struct Local { + inner: Owner, NUM_BLOCKS, ELEMENTS_PER_BLOCK>, +} + +/// Consumer handle. May be used from many threads. +pub(crate) struct Steal(Stealer, NUM_BLOCKS, ELEMENTS_PER_BLOCK>); + +/// Create a new local run-queue +pub(crate) fn local() -> ( + Box + Send + Sync>, + Box + Send + Sync>, +) { + let (owner, stealer) = bwosqueue::new::, NUM_BLOCKS, ELEMENTS_PER_BLOCK>(); + + let local = Local { inner: owner }; + + let remote = Steal(stealer); + + (Box::new(remote), Box::new(local)) +} + +impl super::Owner for Local { + /// Returns true if the queue has entries that can be stolen. + fn is_stealable(&self) -> bool { + self.inner.has_stealable_block() + } + + /// Returns true if there are entries in the queue. + fn has_tasks(&self) -> bool { + self.inner.can_consume() + } + + /// Pushes a task to the back of the local queue, skipping the LIFO slot. + fn push_back( + &mut self, + task: task::Notified, + inject: &Inject, + metrics: &mut MetricsBatch, + ) { + if let Err(t) = self.inner.enqueue(task) { + inject.push(t); + // note: the current implementation is slow + // if self.inner.has_stealers() { + // inject.push(t); + // } else { + // // push overflow of old queue + // if let Some(block_iter) = self.inner.dequeue_block() { + // // could use `and_then` to chain block dequeues a couple of times if + // // successfull, if we want to steal more than one block + // inject.push_batch(block_iter.chain(std::iter::once(t))) + // } else { + // // Give up and use inject queue. + // inject.push(t) + // } + // } + // Add 1 to factor in the task currently being scheduled. + metrics.incr_overflow_count(); + }; + } + + unsafe fn push_back_batch_unchecked( + &mut self, + tasks: Box> + '_>, + _metrics: &mut MetricsBatch, + ) { + let _num_enqueued = self.inner.enqueue_batch_unchecked(tasks); + } + + fn can_enqueue(&self, num: u16) -> Result<(), Option> { + // todo: trait signature probably needs to be modified. 
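+        // BWoS can only answer this cheaply at block granularity: `can_enqueue_block()`
+        // reports whether a whole block of `ELEMENTS_PER_BLOCK` slots is writable, so
+        // `num` is effectively ignored and no capacity hint is returned on failure.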
+ if self.inner.can_enqueue_block() { + Ok(()) + } else { + Err(None) + } + } + + fn pop(&mut self) -> Option> { + self.inner.dequeue() + } +} + +impl Drop for Local { + fn drop(&mut self) { + if !std::thread::panicking() { + assert!(self.pop().is_none(), "queue not empty"); + } + } +} + +impl super::Stealer for Steal { + fn is_empty(&self) -> bool { + self.0.estimated_queue_entries() == 0 + } + + /// Steals one block from self and place them into `dst`. + fn steal_into( + &self, + dst: &mut dyn OwnerTrait, + dst_metrics: &mut MetricsBatch, + ) -> Option> { + // We know `dst` is empty, so we expect this to enqueue to succeed in most cases. + // In the rare case that the `dst` queue is at the same time also full, because the + // producer is blocked waiting on a stealer we only attempt to steal a single task + // todo: can_enqueue... + if dst.can_enqueue(ELEMENTS_PER_BLOCK as u16).is_err() { + return self.0.steal(); + } + + if let Some(mut stolen_tasks) = self.0.steal_block() { + let num_stolen = stolen_tasks.len(); + let first = stolen_tasks.next(); + debug_assert!(first.is_some()); + unsafe { dst.push_back_batch_unchecked(Box::new(stolen_tasks), dst_metrics) } + dst_metrics.incr_steal_count(num_stolen.try_into().unwrap()); + dst_metrics.incr_steal_operations(); + first + } else { + None + } + } +} + +impl Clone for Steal { + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +cfg_metrics! { + impl Steal { + pub(crate) fn len(&self) -> usize { + self.0.estimated_queue_entries() + } + } +} diff --git a/tokio/src/runtime/scheduler/multi_thread/queue/tokioq.rs b/tokio/src/runtime/scheduler/multi_thread/queue/tokioq.rs new file mode 100644 index 00000000000..644b39c5668 --- /dev/null +++ b/tokio/src/runtime/scheduler/multi_thread/queue/tokioq.rs @@ -0,0 +1,706 @@ +//! Run-queue structures to support a work-stealing scheduler + +use crate::loom::cell::UnsafeCell; +use crate::loom::sync::Arc; +use crate::runtime::task::{self, Inject, Notified}; +use crate::runtime::MetricsBatch; + +use crate::runtime::scheduler::multi_thread::queue::{Owner, Stealer}; +use std::mem::{self, MaybeUninit}; +use std::ptr; +use std::sync::atomic::Ordering::{AcqRel, Acquire, Relaxed, Release}; + +// Use wider integers when possible to increase ABA resilience. +// +// See issue #5041: . +cfg_has_atomic_u64! { + type UnsignedShort = u32; + type UnsignedLong = u64; + type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU32; + type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU64; +} +cfg_not_has_atomic_u64! { + type UnsignedShort = u16; + type UnsignedLong = u32; + type AtomicUnsignedShort = crate::loom::sync::atomic::AtomicU16; + type AtomicUnsignedLong = crate::loom::sync::atomic::AtomicU32; +} + +/// Producer handle. May only be used from a single thread. +pub(crate) struct Local { + inner: Arc>, +} + +/// Consumer handle. May be used from many threads. +pub(crate) struct Steal(Arc>); + +pub(crate) struct Inner { + /// Concurrently updated by many threads. + /// + /// Contains two `UnsignedShort` values. The LSB byte is the "real" head of + /// the queue. The `UnsignedShort` in the MSB is set by a stealer in process + /// of stealing values. It represents the first value being stolen in the + /// batch. The `UnsignedShort` indices are intentionally wider than strictly + /// required for buffer indexing in order to provide ABA mitigation and make + /// it possible to distinguish between full and empty buffers. 
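+    /// As a rough sketch of the encoding (for the 64-bit variant): `pack(steal, real)`
+    /// below stores `real` in the low 32 bits and `steal` in the high 32 bits, so
+    /// `pack(3, 5) == (3 << 32) | 5`, and `unpack(pack(3, 5)) == (3, 5)`.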
+ /// + /// When both `UnsignedShort` values are the same, there is no active + /// stealer. + /// + /// Tracking an in-progress stealer prevents a wrapping scenario. + head: AtomicUnsignedLong, + + /// Only updated by producer thread but read by many threads. + tail: AtomicUnsignedShort, + + /// Elements + buffer: Box<[UnsafeCell>>; LOCAL_QUEUE_CAPACITY]>, +} + +unsafe impl Send for Inner {} +unsafe impl Sync for Inner {} + +#[cfg(not(loom))] +const LOCAL_QUEUE_CAPACITY: usize = 256; + +// Shrink the size of the local queue when using loom. This shouldn't impact +// logic, but allows loom to test more edge cases in a reasonable a mount of +// time. +#[cfg(loom)] +const LOCAL_QUEUE_CAPACITY: usize = 4; + +const MASK: usize = LOCAL_QUEUE_CAPACITY - 1; + +// Constructing the fixed size array directly is very awkward. The only way to +// do it is to repeat `UnsafeCell::new(MaybeUninit::uninit())` 256 times, as +// the contents are not Copy. The trick with defining a const doesn't work for +// generic types. +fn make_fixed_size(buffer: Box<[T]>) -> Box<[T; LOCAL_QUEUE_CAPACITY]> { + assert_eq!(buffer.len(), LOCAL_QUEUE_CAPACITY); + + // safety: We check that the length is correct. + unsafe { Box::from_raw(Box::into_raw(buffer).cast()) } +} + +#[clippy::has_significant_drop] +struct StealerIterator<'a, T: 'static> { + stealer: &'a Steal, + // `tail` index of the stealer in the queue. Should not change + stolen_tail: UnsignedShort, + /// current position in the iterator + cur_pos: UnsignedShort, + /// Head of the stealer (one element past the last reserved item) + head: UnsignedShort, +} + +impl<'a, T> StealerIterator<'a, T> { + fn is_empty(&self) -> bool { + // tail will always be behind head, but head could have wrapped around already, + // so calculate `new_tail` before comparing with head. + let new_tail = self.stolen_tail.wrapping_add(self.cur_pos); + new_tail >= self.head + } +} + +impl<'a, T> Iterator for StealerIterator<'a, T> { + type Item = task::Notified; + + fn next(&mut self) -> Option { + // tail will always be behind head, but head could have wrapped around already, + // so calculate `new_tail` before comparing with head. + let new_tail = self.stolen_tail.wrapping_add(self.cur_pos); + if new_tail < self.head { + let idx = (new_tail as usize) & MASK; + let task = self.stealer.0.buffer[idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + self.cur_pos += 1; + Some(task) + } else { + None + } + } +} + +impl<'a, T> Drop for StealerIterator<'a, T> { + fn drop(&mut self) { + debug_assert!(self.is_empty()); + // This is the value of head if no further enqueues happened concurrently. + let mut prev_packed = pack(self.stolen_tail, self.head); + + let mut new_real = self.head; + + // Update `head_steal` to match `head_real` signalling that the + // stealing routine is complete. + loop { + let next_packed = pack(new_real, new_real); + + let res = + self.stealer + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => return (), + Err(actual) => { + let (actual_steal, actual_real) = unpack(actual); + + assert_ne!(actual_steal, actual_real); + // We don't concurrently steal, so the actual steal shouldn't have changed. 
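+                    // A failed CAS here means the owner popped entries while this steal was
+                    // pending and advanced `real`; adopt the observed `real` and retry until
+                    // both halves can be set to the same value.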
+ debug_assert_eq!(self.stolen_tail, actual_steal); + prev_packed = actual; + new_real = actual_real; + } + } + } + } +} + +/// Create a new local run-queue +pub(crate) fn local() -> ( + Box + Send + Sync>, + Box + Send + Sync>, +) { + let mut buffer = Vec::with_capacity(LOCAL_QUEUE_CAPACITY); + + for _ in 0..LOCAL_QUEUE_CAPACITY { + buffer.push(UnsafeCell::new(MaybeUninit::uninit())); + } + + let inner = Arc::new(Inner { + head: AtomicUnsignedLong::new(0), + tail: AtomicUnsignedShort::new(0), + buffer: make_fixed_size(buffer.into_boxed_slice()), + }); + + let local = Local { + inner: inner.clone(), + }; + + let remote = Steal(inner); + + ( + Box::new(remote) as Box + Send + Sync>, + Box::new(local) as Box + Send + Sync>, + ) +} + +impl Local { + /// Moves a batch of tasks into the inject queue. + /// + /// This will temporarily make some of the tasks unavailable to stealers. + /// Once `push_overflow` is done, a notification is sent out, so if other + /// workers "missed" some of the tasks during a steal, they will get + /// another opportunity. + #[inline(never)] + fn push_overflow( + &mut self, + task: task::Notified, + head: UnsignedShort, + tail: UnsignedShort, + inject: &Inject, + metrics: &mut MetricsBatch, + ) -> Result<(), task::Notified> { + /// How many elements are we taking from the local queue. + /// + /// This is one less than the number of tasks pushed to the inject + /// queue as we are also inserting the `task` argument. + const NUM_TASKS_TAKEN: UnsignedShort = (LOCAL_QUEUE_CAPACITY / 2) as UnsignedShort; + + assert_eq!( + tail.wrapping_sub(head) as usize, + LOCAL_QUEUE_CAPACITY, + "queue is not full; tail = {}; head = {}", + tail, + head + ); + + let prev = pack(head, head); + + // Claim a bunch of tasks + // + // We are claiming the tasks **before** reading them out of the buffer. + // This is safe because only the **current** thread is able to push new + // tasks. + // + // There isn't really any need for memory ordering... Relaxed would + // work. This is because all tasks are pushed into the queue from the + // current thread (or memory has been acquired if the local queue handle + // moved). + if self + .inner + .head + .compare_exchange( + prev, + pack( + head.wrapping_add(NUM_TASKS_TAKEN), + head.wrapping_add(NUM_TASKS_TAKEN), + ), + Release, + Relaxed, + ) + .is_err() + { + // We failed to claim the tasks, losing the race. Return out of + // this function and try the full `push` routine again. The queue + // may not be full anymore. + return Err(task); + } + + /// An iterator that takes elements out of the run queue. + struct BatchTaskIter<'a, T: 'static> { + buffer: &'a [UnsafeCell>>; LOCAL_QUEUE_CAPACITY], + head: UnsignedLong, + i: UnsignedLong, + } + impl<'a, T: 'static> Iterator for BatchTaskIter<'a, T> { + type Item = task::Notified; + + #[inline] + fn next(&mut self) -> Option> { + if self.i == UnsignedLong::from(NUM_TASKS_TAKEN) { + None + } else { + let i_idx = self.i.wrapping_add(self.head) as usize & MASK; + let slot = &self.buffer[i_idx]; + + // safety: Our CAS from before has assumed exclusive ownership + // of the task pointers in this range. + let task = slot.with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + + self.i += 1; + Some(task) + } + } + } + + // safety: The CAS above ensures that no consumer will look at these + // values again, and we are the only producer. 
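+        // `BatchTaskIter` reads the claimed slots lazily; chaining the overflowing `task`
+        // onto it lets `inject.push_batch` move everything over in a single call.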
+ let batch_iter = BatchTaskIter { + buffer: &self.inner.buffer, + head: head as UnsignedLong, + i: 0, + }; + inject.push_batch(batch_iter.chain(std::iter::once(task))); + + // Add 1 to factor in the task currently being scheduled. + metrics.incr_overflow_count(); + + Ok(()) + } +} + +impl super::Owner for Local { + /// Returns true if the queue has entries that can be stolen. + fn is_stealable(&self) -> bool { + !self.inner.is_empty() + } + + /// Returns false if there are any entries in the queue + /// + /// Separate to is_stealable so that refactors of is_stealable to "protect" + /// some tasks from stealing won't affect this + fn has_tasks(&self) -> bool { + !self.inner.is_empty() + } + + /// Pushes a task to the back of the local queue, skipping the LIFO slot. + fn push_back( + &mut self, + mut task: task::Notified, + inject: &Inject, + metrics: &mut MetricsBatch, + ) { + let tail = loop { + let head = self.inner.head.load(Acquire); + let (steal, real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if tail.wrapping_sub(steal) < LOCAL_QUEUE_CAPACITY as UnsignedShort { + // There is capacity for the task + break tail; + } else if steal != real { + // Concurrently stealing, this will free up capacity, so only + // push the task onto the inject queue + inject.push(task); + return; + } else { + // Push the current task and half of the queue into the + // inject queue. + match self.push_overflow(task, real, tail, inject, metrics) { + Ok(_) => return, + // Lost the race, try again + Err(v) => { + task = v; + } + } + } + }; + + // Map the position to a slot index. + let idx = tail as usize & MASK; + + self.inner.buffer[idx].with_mut(|ptr| { + // Write the task to the slot + // + // Safety: There is only one producer and the above `if` + // condition ensures we don't touch a cell if there is a + // value, thus no consumer. + unsafe { + ptr::write((*ptr).as_mut_ptr(), task); + } + }); + + // Make the task available. Synchronizes with a load in + // `steal_into2`. + self.inner.tail.store(tail.wrapping_add(1), Release); + } + + // fn len(&self) -> usize { + // // Safety: We own the queue and thus are the only ones that could potentially mutate + // // `inner.tail`. + // let dst_tail = unsafe { self.inner.tail.unsync_load() }; + // + // // To the caller, `dst` may **look** empty but still have values + // // contained in the buffer. If another thread is concurrently stealing + // // from `dst` there may not be enough capacity to steal. + // let (steal, real_head) = unpack(self.inner.head.load(Acquire)); + // } + + #[deny(unsafe_op_in_unsafe_fn)] + unsafe fn push_back_batch_unchecked( + &mut self, + tasks: Box> + '_>, + metrics: &mut MetricsBatch, + ) { + // Safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + let mut count = 0; + for task in tasks { + let idx = tail.wrapping_add(count) as usize & MASK; + // Write the task to the new slot + // + // Safety: We are the queue Owner and the caller assures the queue has sufficient capacity. 
+ self.inner.buffer[idx].with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); + count += 1; + } + + metrics.incr_steal_count(count as u16); + metrics.incr_steal_operations(); + + // Make the stolen items available to consumers + self.inner.tail.store(tail.wrapping_add(count), Release); + } + + fn can_enqueue(&self, num: u16) -> Result<(), Option> { + // Safety: We own the queue and thus are the only ones that could potentially mutate + // `inner.tail`. + let tail = unsafe { self.inner.tail.unsync_load() }; + + // To the caller, `dst` may **look** empty but still have values + // contained in the buffer. If another thread is concurrently stealing + // from `dst` there may not be enough capacity to steal. + let (steal, _real_head) = unpack(self.inner.head.load(Acquire)); + + // `steal` is behind `real_head` when there is an in-progress steal, otherwise it is + // equal to `real_head`. `tail` - `steal` is the amount of queue slots currently used. + // `tail` is always larger then `steal`, since the counter is monotonically increasing, + // at least until it wraps around at `UnsignedShort::MAX`. wrapping_sub always gives the + // correct difference. + let capacity = LOCAL_QUEUE_CAPACITY as UnsignedShort - (tail.wrapping_sub(steal)); + if capacity > num as UnsignedShort { + Ok(()) + } else { + Err(Some(capacity as u16)) + } + } + + /// Pops a task from the local queue. + fn pop(&mut self) -> Option> { + let mut head = self.inner.head.load(Acquire); + + let idx = loop { + let (steal, real) = unpack(head); + + // safety: this is the **only** thread that updates this cell. + let tail = unsafe { self.inner.tail.unsync_load() }; + + if real == tail { + // queue is empty + return None; + } + + let next_real = real.wrapping_add(1); + + // If `steal == real` there are no concurrent stealers. Both `steal` + // and `real` are updated. + let next = if steal == real { + pack(next_real, next_real) + } else { + assert_ne!(steal, next_real); + pack(steal, next_real) + }; + + // Attempt to claim a task. + let res = self + .inner + .head + .compare_exchange(head, next, AcqRel, Acquire); + + match res { + Ok(_) => break real as usize & MASK, + Err(actual) => head = actual, + } + }; + + Some(self.inner.buffer[idx].with(|ptr| unsafe { ptr::read(ptr).assume_init() })) + } +} + +impl super::Stealer for Steal { + fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Steals half the tasks from self and place them into `dst`. + fn steal_into( + &self, + dst: &mut dyn Owner, + dst_metrics: &mut MetricsBatch, + ) -> Option> { + if dst.can_enqueue(LOCAL_QUEUE_CAPACITY as u16 / 2).is_err() { + // we *could* try to steal less here, but for simplicity, we're just + // going to abort. + return None; + } + + let mut stolen_tasks = self.steal_half()?; + + // We take the first task from the iterator to directly return it. + let first = stolen_tasks.next()?; + + if stolen_tasks.is_empty() { + dst_metrics.incr_steal_count(1); + dst_metrics.incr_steal_operations(); + return Some(first); + } else { + // Safety: We checked that `dst` has sufficient capacity, and we are the owner + // thread, so the capacity can only have increased in the meantime. + unsafe { dst.push_back_batch_unchecked(Box::new(stolen_tasks), dst_metrics) } + } + + Some(first) + } + + cfg_metrics! 
{ + fn len(&self) -> usize { + self.0.len() as _ + } + } +} + +impl Steal { + fn steal_half(&self) -> Option> { + let mut prev_packed = self.0.head.load(Acquire); + let mut next_packed; + + let (steal_head, real_head) = loop { + let (src_head_steal, src_head_real) = unpack(prev_packed); + let src_tail = self.0.tail.load(Acquire); + // If these two do not match, another thread is concurrently + // stealing from the queue. + if src_head_steal != src_head_real { + return None; + } + // Number of available tasks to steal + let n = src_tail.wrapping_sub(src_head_real); + let n = n - n / 2; + + if n == 0 { + // No tasks available to steal + return None; + } + // Update the real head index to acquire the tasks. + let steal_to = src_head_real.wrapping_add(n); + assert_ne!(src_head_steal, steal_to); + next_packed = pack(src_head_steal, steal_to); + + // Claim all those tasks. This is done by incrementing the "real" + // head but not the steal. By doing this, no other thread is able to + // steal from this queue until the current thread completes. + let res = self + .0 + .head + .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + + match res { + Ok(_) => { + break (src_head_steal, steal_to); + } + Err(actual) => prev_packed = actual, + } + }; + + let n = real_head.wrapping_sub(steal_head); + assert!( + n <= LOCAL_QUEUE_CAPACITY as UnsignedShort / 2, + "actual = {}", + n + ); + + Some(StealerIterator { + stealer: &self, + stolen_tail: steal_head, + cur_pos: 0, + head: real_head, + }) + } + + // // Steal tasks from `self`, placing them into `dst`. Returns the number of + // // tasks that were stolen. + // fn steal_into2(&self, dst: &mut Local, dst_tail: UnsignedShort) -> UnsignedShort { + // let mut prev_packed = self.0.head.load(Acquire); + // let mut next_packed; + // + // let n = loop { + // let (src_head_steal, src_head_real) = unpack(prev_packed); + // let src_tail = self.0.tail.load(Acquire); + // + // // If these two do not match, another thread is concurrently + // // stealing from the queue. + // if src_head_steal != src_head_real { + // return 0; + // } + // + // // Number of available tasks to steal + // let n = src_tail.wrapping_sub(src_head_real); + // let n = n - n / 2; + // + // if n == 0 { + // // No tasks available to steal + // return 0; + // } + // + // // Update the real head index to acquire the tasks. + // let steal_to = src_head_real.wrapping_add(n); + // assert_ne!(src_head_steal, steal_to); + // next_packed = pack(src_head_steal, steal_to); + // + // // Claim all those tasks. This is done by incrementing the "real" + // // head but not the steal. By doing this, no other thread is able to + // // steal from this queue until the current thread completes. + // let res = self + // .0 + // .head + // .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + // + // match res { + // Ok(_) => break n, + // Err(actual) => prev_packed = actual, + // } + // }; + // + // assert!( + // n <= LOCAL_QUEUE_CAPACITY as UnsignedShort / 2, + // "actual = {}", + // n + // ); + // + // let (first, _) = unpack(next_packed); + // + // // Take all the tasks + // for i in 0..n { + // // Compute the positions + // let src_pos = first.wrapping_add(i); + // let dst_pos = dst_tail.wrapping_add(i); + // + // // Map to slots + // let src_idx = src_pos as usize & MASK; + // let dst_idx = dst_pos as usize & MASK; + // + // // Read the task + // // + // // safety: We acquired the task with the atomic exchange above. 
+ // let task = self.0.buffer[src_idx].with(|ptr| unsafe { ptr::read((*ptr).as_ptr()) }); + // + // // Write the task to the new slot + // // + // // safety: `dst` queue is empty and we are the only producer to + // // this queue. + // dst.inner.buffer[dst_idx] + // .with_mut(|ptr| unsafe { ptr::write((*ptr).as_mut_ptr(), task) }); + // } + // + // let mut prev_packed = next_packed; + // + // // Update `src_head_steal` to match `src_head_real` signalling that the + // // stealing routine is complete. + // loop { + // let head = unpack(prev_packed).1; + // next_packed = pack(head, head); + // + // let res = self + // .0 + // .head + // .compare_exchange(prev_packed, next_packed, AcqRel, Acquire); + // + // match res { + // Ok(_) => return n, + // Err(actual) => { + // let (actual_steal, actual_real) = unpack(actual); + // + // assert_ne!(actual_steal, actual_real); + // + // prev_packed = actual; + // } + // } + // } + // } +} + +impl Clone for Steal { + fn clone(&self) -> Steal { + Steal(self.0.clone()) + } +} + +impl Inner { + fn len(&self) -> UnsignedShort { + let (_, head) = unpack(self.head.load(Acquire)); + let tail = self.tail.load(Acquire); + + tail.wrapping_sub(head) + } + + fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// Split the head value into the real head and the index a stealer is working +/// on. +fn unpack(n: UnsignedLong) -> (UnsignedShort, UnsignedShort) { + let real = n & UnsignedShort::MAX as UnsignedLong; + let steal = n >> (mem::size_of::() * 8); + + (steal as UnsignedShort, real as UnsignedShort) +} + +/// Join the two head values +fn pack(steal: UnsignedShort, real: UnsignedShort) -> UnsignedLong { + (real as UnsignedLong) | ((steal as UnsignedLong) << (mem::size_of::() * 8)) +} + +impl Drop for Local { + fn drop(&mut self) { + if !std::thread::panicking() { + assert!(self.pop().is_none(), "queue not empty"); + } + } +} + +#[test] +fn test_local_queue_capacity() { + assert!(LOCAL_QUEUE_CAPACITY - 1 <= u8::MAX as usize); +} diff --git a/tokio/src/runtime/scheduler/multi_thread/worker.rs b/tokio/src/runtime/scheduler/multi_thread/worker.rs index c59f19e59d5..936eff697f4 100644 --- a/tokio/src/runtime/scheduler/multi_thread/worker.rs +++ b/tokio/src/runtime/scheduler/multi_thread/worker.rs @@ -67,6 +67,7 @@ use crate::runtime::{ use crate::util::atomic_cell::AtomicCell; use crate::util::rand::{FastRand, RngSeedGenerator}; +use crate::runtime::builder::MultiThreadFlavor; use std::cell::RefCell; use std::time::Duration; @@ -95,7 +96,7 @@ struct Core { lifo_slot: Option, /// The worker-local run queue. - run_queue: queue::Local>, + run_queue: Box> + Send + Sync>, /// True if the worker is currently searching for more work. Searching /// involves attempting to steal from other workers. @@ -153,7 +154,7 @@ pub(super) struct Shared { /// Used to communicate with a worker from other threads. struct Remote { /// Steals tasks from this worker. 
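+    /// Boxed as a trait object so a worker can be backed by either the default queue or,
+    /// behind the `bwos` feature, the BWoS queue, selected via `MultiThreadFlavor`.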
- steal: queue::Steal>, + steal: Box> + Send + Sync>, /// Unparks the associated worker thread unpark: Unparker, @@ -187,6 +188,7 @@ scoped_thread_local!(static CURRENT: Context); pub(super) fn create( size: usize, + flavor: MultiThreadFlavor, park: Parker, driver_handle: driver::Handle, blocking_spawner: blocking::Spawner, @@ -199,7 +201,7 @@ pub(super) fn create( // Create the local queues for _ in 0..size { - let (steal, run_queue) = queue::local(); + let (steal, run_queue) = queue::local(flavor); let park = park.clone(); let unpark = park.unpark(); @@ -636,7 +638,7 @@ impl Core { let target = &worker.handle.shared.remotes[i]; if let Some(task) = target .steal - .steal_into(&mut self.run_queue, &mut self.metrics) + .steal_into(&mut *self.run_queue, &mut self.metrics) { return Some(task); } diff --git a/tokio/src/runtime/tests/queue.rs b/tokio/src/runtime/tests/queue.rs index ac80fa7332f..ec059d2ebb3 100644 --- a/tokio/src/runtime/tests/queue.rs +++ b/tokio/src/runtime/tests/queue.rs @@ -2,6 +2,7 @@ use crate::runtime::scheduler::multi_thread::queue; use crate::runtime::task::{self, Inject, Schedule, Task}; use crate::runtime::MetricsBatch; +use crate::runtime::builder::MultiThreadFlavor; use std::thread; use std::time::Duration; @@ -28,7 +29,7 @@ fn metrics_batch() -> MetricsBatch { #[test] fn fits_256() { - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let mut metrics = metrics_batch(); @@ -48,7 +49,7 @@ fn fits_256() { #[test] fn overflow() { - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let mut metrics = metrics_batch(); @@ -78,8 +79,8 @@ fn overflow() { fn steal_batch() { let mut metrics = metrics_batch(); - let (steal1, mut local1) = queue::local(); - let (_, mut local2) = queue::local(); + let (steal1, mut local1) = queue::local(MultiThreadFlavor::Default); + let (_, mut local2) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); for _ in 0..4 { @@ -87,7 +88,7 @@ fn steal_batch() { local1.push_back(task, &inject, &mut metrics); } - assert!(steal1.steal_into(&mut local2, &mut metrics).is_some()); + assert!(steal1.steal_into(&mut *local2, &mut metrics).is_some()); cfg_metrics! 
{ assert_metrics!(metrics, steal_count == 2); @@ -114,6 +115,8 @@ const fn normal_or_miri(normal: usize, miri: usize) -> usize { } } +// todo: stolen increments by one here, so counting seems incorrect even before our queue +// joins the picture #[test] fn stress1() { const NUM_ITER: usize = 5; @@ -125,16 +128,16 @@ fn stress1() { let mut metrics = metrics_batch(); for _ in 0..NUM_ITER { - let (steal, mut local) = queue::local(); + let (steal, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let th = thread::spawn(move || { let mut metrics = metrics_batch(); - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let mut n = 0; for _ in 0..NUM_STEAL { - if steal.steal_into(&mut local, &mut metrics).is_some() { + if steal.steal_into(&mut *local, &mut metrics).is_some() { n += 1; } @@ -188,16 +191,16 @@ fn stress2() { let mut metrics = metrics_batch(); for _ in 0..NUM_ITER { - let (steal, mut local) = queue::local(); + let (steal, mut local) = queue::local(MultiThreadFlavor::Default); let inject = Inject::new(); let th = thread::spawn(move || { let mut stats = metrics_batch(); - let (_, mut local) = queue::local(); + let (_, mut local) = queue::local(MultiThreadFlavor::Default); let mut n = 0; for _ in 0..NUM_STEAL { - if steal.steal_into(&mut local, &mut stats).is_some() { + if steal.steal_into(&mut *local, &mut stats).is_some() { n += 1; }