Skip to content

Commit

Permalink
Auto merge of #44963 - JLockerman:fix_spsc, r=alexcrichton
Browse files Browse the repository at this point in the history
Improve performance of spsc_queue and stream.

This PR makes two main changes:

1. It switches the `spsc_queue` node caching strategy from keeping a shared
counter of the number of nodes in the cache to keeping a consumer only counter
of the number of node eligible to be cached.
2. It separates the consumer and producers fields of `spsc_queue` and `stream` into
a producer cache line and consumer cache line.

Overall, it speeds up `mpsc` in `spsc` mode by 2-10x.
Variance is higher than I'd like (that 2-10x speedup is on one benchmark), I believe this is due to the drop check in `send` (`fn stream::Queue::send:107`). I think this check can be combined with the sleep detection code into a version which only uses 1 shared variable, and only one atomic access per `send`, but I haven't looked through the select implementation enough to be sure.

The code currently assumes a cache line size of 64 bytes. I added a CacheAligned newtype in `mpsc` which I expect to reuse for `shared`. It doesn't really belong there, it would probably be best put in `core::sync::atomic`, but putting it in `core` would involve making it public, which I thought would require an RFC.

Benchmark runner is [here](https://github.com/JLockerman/queues/tree/3eca46279c53eb75833c5ecd416de2ac220bd022/shootout), benchmarks [here](https://github.com/JLockerman/queues/blob/3eca46279c53eb75833c5ecd416de2ac220bd022/queue_bench/src/lib.rs#L170-L293).

Fixes #44512.
  • Loading branch information
bors committed Oct 11, 2017
2 parents cbf5d39 + bb7945e commit a47c9f8
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 107 deletions.
2 changes: 2 additions & 0 deletions src/libstd/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@
#![feature(allow_internal_unstable)]
#![feature(align_offset)]
#![feature(asm)]
#![feature(attr_literals)]
#![feature(box_syntax)]
#![feature(cfg_target_has_atomic)]
#![feature(cfg_target_thread_local)]
Expand Down Expand Up @@ -290,6 +291,7 @@
#![feature(prelude_import)]
#![feature(rand)]
#![feature(raw)]
#![feature(repr_align)]
#![feature(repr_simd)]
#![feature(rustc_attrs)]
#![cfg_attr(not(stage0), feature(rustc_const_unstable))]
Expand Down
37 changes: 37 additions & 0 deletions src/libstd/sync/mpsc/cache_aligned.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use ops::{Deref, DerefMut};

#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(align(64))]
pub(super) struct Aligner;

#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(super) struct CacheAligned<T>(pub T, pub Aligner);

impl<T> Deref for CacheAligned<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}

impl<T> DerefMut for CacheAligned<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

impl<T> CacheAligned<T> {
pub(super) fn new(t: T) -> Self {
CacheAligned(t, Aligner)
}
}
2 changes: 2 additions & 0 deletions src/libstd/sync/mpsc/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ mod sync;
mod mpsc_queue;
mod spsc_queue;

mod cache_aligned;

/// The receiving half of Rust's [`channel`][] (or [`sync_channel`]) type.
/// This half can only be owned by one thread.
///
Expand Down
153 changes: 90 additions & 63 deletions src/libstd/sync/mpsc/spsc_queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,51 +22,71 @@ use core::cell::UnsafeCell;

use sync::atomic::{AtomicPtr, AtomicUsize, Ordering};

use super::cache_aligned::CacheAligned;

// Node within the linked list queue of messages to send
struct Node<T> {
// FIXME: this could be an uninitialized T if we're careful enough, and
// that would reduce memory usage (and be a bit faster).
// is it worth it?
value: Option<T>, // nullable for re-use of nodes
cached: bool, // This node goes into the node cache
next: AtomicPtr<Node<T>>, // next node in the queue
}

/// The single-producer single-consumer queue. This structure is not cloneable,
/// but it can be safely shared in an Arc if it is guaranteed that there
/// is only one popper and one pusher touching the queue at any one point in
/// time.
pub struct Queue<T> {
pub struct Queue<T, ProducerAddition=(), ConsumerAddition=()> {
// consumer fields
consumer: CacheAligned<Consumer<T, ConsumerAddition>>,

// producer fields
producer: CacheAligned<Producer<T, ProducerAddition>>,
}

struct Consumer<T, Addition> {
tail: UnsafeCell<*mut Node<T>>, // where to pop from
tail_prev: AtomicPtr<Node<T>>, // where to pop from
cache_bound: usize, // maximum cache size
cached_nodes: AtomicUsize, // number of nodes marked as cachable
addition: Addition,
}

// producer fields
struct Producer<T, Addition> {
head: UnsafeCell<*mut Node<T>>, // where to push to
first: UnsafeCell<*mut Node<T>>, // where to get new nodes from
tail_copy: UnsafeCell<*mut Node<T>>, // between first/tail

// Cache maintenance fields. Additions and subtractions are stored
// separately in order to allow them to use nonatomic addition/subtraction.
cache_bound: usize,
cache_additions: AtomicUsize,
cache_subtractions: AtomicUsize,
addition: Addition,
}

unsafe impl<T: Send> Send for Queue<T> { }
unsafe impl<T: Send, P: Send + Sync, C: Send + Sync> Send for Queue<T, P, C> { }

unsafe impl<T: Send> Sync for Queue<T> { }
unsafe impl<T: Send, P: Send + Sync, C: Send + Sync> Sync for Queue<T, P, C> { }

impl<T> Node<T> {
fn new() -> *mut Node<T> {
Box::into_raw(box Node {
value: None,
cached: false,
next: AtomicPtr::new(ptr::null_mut::<Node<T>>()),
})
}
}

impl<T> Queue<T> {
/// Creates a new queue.
impl<T, ProducerAddition, ConsumerAddition> Queue<T, ProducerAddition, ConsumerAddition> {

/// Creates a new queue. With given additional elements in the producer and
/// consumer portions of the queue.
///
/// Due to the performance implications of cache-contention,
/// we wish to keep fields used mainly by the producer on a separate cache
/// line than those used by the consumer.
/// Since cache lines are usually 64 bytes, it is unreasonably expensive to
/// allocate one for small fields, so we allow users to insert additional
/// fields into the cache lines already allocated by this for the producer
/// and consumer.
///
/// This is unsafe as the type system doesn't enforce a single
/// consumer-producer relationship. It also allows the consumer to `pop`
Expand All @@ -83,19 +103,28 @@ impl<T> Queue<T> {
/// cache (if desired). If the value is 0, then the cache has
/// no bound. Otherwise, the cache will never grow larger than
/// `bound` (although the queue itself could be much larger.
pub unsafe fn new(bound: usize) -> Queue<T> {
pub unsafe fn with_additions(
bound: usize,
producer_addition: ProducerAddition,
consumer_addition: ConsumerAddition,
) -> Self {
let n1 = Node::new();
let n2 = Node::new();
(*n1).next.store(n2, Ordering::Relaxed);
Queue {
tail: UnsafeCell::new(n2),
tail_prev: AtomicPtr::new(n1),
head: UnsafeCell::new(n2),
first: UnsafeCell::new(n1),
tail_copy: UnsafeCell::new(n1),
cache_bound: bound,
cache_additions: AtomicUsize::new(0),
cache_subtractions: AtomicUsize::new(0),
consumer: CacheAligned::new(Consumer {
tail: UnsafeCell::new(n2),
tail_prev: AtomicPtr::new(n1),
cache_bound: bound,
cached_nodes: AtomicUsize::new(0),
addition: consumer_addition
}),
producer: CacheAligned::new(Producer {
head: UnsafeCell::new(n2),
first: UnsafeCell::new(n1),
tail_copy: UnsafeCell::new(n1),
addition: producer_addition
}),
}
}

Expand All @@ -109,35 +138,25 @@ impl<T> Queue<T> {
assert!((*n).value.is_none());
(*n).value = Some(t);
(*n).next.store(ptr::null_mut(), Ordering::Relaxed);
(**self.head.get()).next.store(n, Ordering::Release);
*self.head.get() = n;
(**self.producer.head.get()).next.store(n, Ordering::Release);
*(&self.producer.head).get() = n;
}
}

unsafe fn alloc(&self) -> *mut Node<T> {
// First try to see if we can consume the 'first' node for our uses.
// We try to avoid as many atomic instructions as possible here, so
// the addition to cache_subtractions is not atomic (plus we're the
// only one subtracting from the cache).
if *self.first.get() != *self.tail_copy.get() {
if self.cache_bound > 0 {
let b = self.cache_subtractions.load(Ordering::Relaxed);
self.cache_subtractions.store(b + 1, Ordering::Relaxed);
}
let ret = *self.first.get();
*self.first.get() = (*ret).next.load(Ordering::Relaxed);
if *self.producer.first.get() != *self.producer.tail_copy.get() {
let ret = *self.producer.first.get();
*self.producer.0.first.get() = (*ret).next.load(Ordering::Relaxed);
return ret;
}
// If the above fails, then update our copy of the tail and try
// again.
*self.tail_copy.get() = self.tail_prev.load(Ordering::Acquire);
if *self.first.get() != *self.tail_copy.get() {
if self.cache_bound > 0 {
let b = self.cache_subtractions.load(Ordering::Relaxed);
self.cache_subtractions.store(b + 1, Ordering::Relaxed);
}
let ret = *self.first.get();
*self.first.get() = (*ret).next.load(Ordering::Relaxed);
*self.producer.0.tail_copy.get() =
self.consumer.tail_prev.load(Ordering::Acquire);
if *self.producer.first.get() != *self.producer.tail_copy.get() {
let ret = *self.producer.first.get();
*self.producer.0.first.get() = (*ret).next.load(Ordering::Relaxed);
return ret;
}
// If all of that fails, then we have to allocate a new node
Expand All @@ -153,27 +172,27 @@ impl<T> Queue<T> {
// sentinel from where we should start popping from. Hence, look at
// tail's next field and see if we can use it. If we do a pop, then
// the current tail node is a candidate for going into the cache.
let tail = *self.tail.get();
let tail = *self.consumer.tail.get();
let next = (*tail).next.load(Ordering::Acquire);
if next.is_null() { return None }
assert!((*next).value.is_some());
let ret = (*next).value.take();

*self.tail.get() = next;
if self.cache_bound == 0 {
self.tail_prev.store(tail, Ordering::Release);
*self.consumer.0.tail.get() = next;
if self.consumer.cache_bound == 0 {
self.consumer.tail_prev.store(tail, Ordering::Release);
} else {
// FIXME: this is dubious with overflow.
let additions = self.cache_additions.load(Ordering::Relaxed);
let subtractions = self.cache_subtractions.load(Ordering::Relaxed);
let size = additions - subtractions;

if size < self.cache_bound {
self.tail_prev.store(tail, Ordering::Release);
self.cache_additions.store(additions + 1, Ordering::Relaxed);
let cached_nodes = self.consumer.cached_nodes.load(Ordering::Relaxed);
if cached_nodes < self.consumer.cache_bound && !(*tail).cached {
self.consumer.cached_nodes.store(cached_nodes, Ordering::Relaxed);
(*tail).cached = true;
}

if (*tail).cached {
self.consumer.tail_prev.store(tail, Ordering::Release);
} else {
(*self.tail_prev.load(Ordering::Relaxed))
.next.store(next, Ordering::Relaxed);
(*self.consumer.tail_prev.load(Ordering::Relaxed))
.next.store(next, Ordering::Relaxed);
// We have successfully erased all references to 'tail', so
// now we can safely drop it.
let _: Box<Node<T>> = Box::from_raw(tail);
Expand All @@ -194,17 +213,25 @@ impl<T> Queue<T> {
// This is essentially the same as above with all the popping bits
// stripped out.
unsafe {
let tail = *self.tail.get();
let tail = *self.consumer.tail.get();
let next = (*tail).next.load(Ordering::Acquire);
if next.is_null() { None } else { (*next).value.as_mut() }
}
}

pub fn producer_addition(&self) -> &ProducerAddition {
&self.producer.addition
}

pub fn consumer_addition(&self) -> &ConsumerAddition {
&self.consumer.addition
}
}

impl<T> Drop for Queue<T> {
impl<T, ProducerAddition, ConsumerAddition> Drop for Queue<T, ProducerAddition, ConsumerAddition> {
fn drop(&mut self) {
unsafe {
let mut cur = *self.first.get();
let mut cur = *self.producer.first.get();
while !cur.is_null() {
let next = (*cur).next.load(Ordering::Relaxed);
let _n: Box<Node<T>> = Box::from_raw(cur);
Expand All @@ -224,7 +251,7 @@ mod tests {
#[test]
fn smoke() {
unsafe {
let queue = Queue::new(0);
let queue = Queue::with_additions(0, (), ());
queue.push(1);
queue.push(2);
assert_eq!(queue.pop(), Some(1));
Expand All @@ -241,7 +268,7 @@ mod tests {
#[test]
fn peek() {
unsafe {
let queue = Queue::new(0);
let queue = Queue::with_additions(0, (), ());
queue.push(vec![1]);

// Ensure the borrowchecker works
Expand All @@ -264,7 +291,7 @@ mod tests {
#[test]
fn drop_full() {
unsafe {
let q: Queue<Box<_>> = Queue::new(0);
let q: Queue<Box<_>> = Queue::with_additions(0, (), ());
q.push(box 1);
q.push(box 2);
}
Expand All @@ -273,7 +300,7 @@ mod tests {
#[test]
fn smoke_bound() {
unsafe {
let q = Queue::new(0);
let q = Queue::with_additions(0, (), ());
q.push(1);
q.push(2);
assert_eq!(q.pop(), Some(1));
Expand All @@ -295,7 +322,7 @@ mod tests {
}

unsafe fn stress_bound(bound: usize) {
let q = Arc::new(Queue::new(bound));
let q = Arc::new(Queue::with_additions(bound, (), ()));

let (tx, rx) = channel();
let q2 = q.clone();
Expand Down
Loading

0 comments on commit a47c9f8

Please sign in to comment.