executor: rewrite the work-stealing thread pool (tokio-rs#1657)

This patch is a ground up rewrite of the existing work-stealing thread pool. The goal is to reduce overhead while simplifying code when possible. At a high level, the following architectural changes were made: - The local run queues were switched for bounded circle buffer queues. - Reduce cross-thread synchronization. - Refactor task constructs to use a single allocation and always include a join handle (tokio-rs#887). - Simplify logic around putting workers to sleep and waking them up. **Local run queues** Move away from crossbeam's implementation of the Chase-Lev deque. This implementation included unnecessary overhead as it supported capabilities that are not needed for the work-stealing thread pool. Instead, a fixed size circle buffer is used for the local queue. When the local queue is full, half of the tasks contained in it are moved to the global run queue. **Reduce cross-thread synchronization** This is done via many small improvements. Primarily, an upper bound is placed on the number of concurrent stealers. Limiting the number of stealers results in lower contention. Secondly, the rate at which workers are notified and woken up is throttled. This also reduces contention by preventing many threads from racing to steal work. **Refactor task structure** Now that Tokio is able to target a rust version that supports `std::alloc` as well as `std::task`, the pool is able to optimize how the task structure is laid out. Now, a single allocation per task is required and a join handle is always provided enabling the spawner to retrieve the result of the task (tokio-rs#887). **Simplifying logic** When possible, complexity is reduced in the implementation. This is done by using locks and other simpler constructs in cold paths. The set of sleeping workers is now represented as a `Mutex<VecDeque<usize>>`. Instead of optimizing access to this structure, we reduce the amount the pool must access this structure. Secondly, we have (temporarily) removed `threadpool::blocking`. This capability will come back later, but the original implementation was way more complicated than necessary. **Results** The thread pool benchmarks have improved significantly: Old thread pool: ``` test chained_spawn ... bench: 2,019,796 ns/iter (+/- 302,168) test ping_pong ... bench: 1,279,948 ns/iter (+/- 154,365) test spawn_many ... bench: 10,283,608 ns/iter (+/- 1,284,275) test yield_many ... bench: 21,450,748 ns/iter (+/- 1,201,337) ``` New thread pool: ``` test chained_spawn ... bench: 147,943 ns/iter (+/- 6,673) test ping_pong ... bench: 537,744 ns/iter (+/- 20,928) test spawn_many ... bench: 7,454,898 ns/iter (+/- 283,449) test yield_many ... bench: 16,771,113 ns/iter (+/- 733,424) ``` Real-world benchmarks improve significantly as well. This is testing the hyper hello world server using: `wrk -t1 -c50 -d10`: Old scheduler: ``` Running 10s test @ http://127.0.0.1:3000 1 threads and 50 connections Thread Stats Avg Stdev Max +/- Stdev Latency 371.53us 99.05us 1.97ms 60.53% Req/Sec 114.61k 8.45k 133.85k 67.00% 1139307 requests in 10.00s, 95.61MB read Requests/sec: 113923.19 Transfer/sec: 9.56MB ``` New scheduler: ``` Running 10s test @ http://127.0.0.1:3000 1 threads and 50 connections Thread Stats Avg Stdev Max +/- Stdev Latency 275.05us 69.81us 1.09ms 73.57% Req/Sec 153.17k 10.68k 171.51k 71.00% 1522671 requests in 10.00s, 127.79MB read Requests/sec: 152258.70 Transfer/sec: 12.78MB ```
Kestrer · Oct 19, 2019 · ed5a94e · ed5a94e
1 parent 2a18132
commit ed5a94e
Show file tree

Hide file tree

Showing 100 changed files with 7,408 additions and 6,795 deletions.
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -65,7 +65,7 @@ jobs:
       tokio-codec: []
       tokio-executor:
         - current-thread
-        - threadpool
+        - thread-pool
       tokio-io:
         - util
       tokio-sync:
@@ -94,6 +94,13 @@ jobs:
         - tokio-no-features
         - tokio-with-net
 
+# Run loom tests
+- template: ci/azure-loom.yml
+  parameters:
+    rust: beta
+    crates:
+      - tokio-executor
+
 # Try cross compiling
 - template: ci/azure-cross-compile.yml
   parameters:

diff --git a/ci/azure-loom.yml b/ci/azure-loom.yml
@@ -0,0 +1,18 @@
+jobs:
+- job: loom
+  displayName: Loom tests
+  pool:
+    vmImage: ubuntu-16.04
+
+  steps:
+  - template: azure-install-rust.yml
+    parameters:
+      rust_version: ${{ parameters.rust }}
+
+  - ${{ each crate in parameters.crates }}:
+    - script: RUSTFLAGS="--cfg loom" cargo test --lib --release
+      env:
+        LOOM_MAX_PREEMPTIONS: 2
+        CI: 'True'
+      displayName: test ${{ crate }}
+      workingDirectory: $(Build.SourcesDirectory)/${{ crate }}
diff --git a/tokio-executor/Cargo.toml b/tokio-executor/Cargo.toml
@@ -23,40 +23,31 @@ categories = ["concurrency", "asynchronous"]
 [features]
 blocking = ["tokio-sync", "lazy_static"]
 current-thread = ["crossbeam-channel"]
-threadpool = [
-  "tokio-sync",
-  "crossbeam-deque",
-  "crossbeam-queue",
-  "crossbeam-utils",
-  "futures-core-preview",
-  "num_cpus",
-  "lazy_static",
-  "slab",
-]
+thread-pool = ["num_cpus"]
 
 [dependencies]
+futures-util-preview = { version = "=0.3.0-alpha.19", features = ["channel"] }
 tokio-sync = { version = "=0.2.0-alpha.6", optional = true, path = "../tokio-sync" }
 
 tracing = { version = "0.1.5", optional = true }
-futures-util-preview = { version = "=0.3.0-alpha.19", features = ["channel"] }
 
 # current-thread dependencies
 crossbeam-channel = { version = "0.3.8", optional = true }
 
 # threadpool dependencies
-crossbeam-deque = { version = "0.7.0", optional = true }
-crossbeam-queue = { version = "0.1.0", optional = true }
-crossbeam-utils = { version = "0.6.4", optional = true }
-futures-core-preview = { version = "=0.3.0-alpha.19", optional = true }
 num_cpus = { version = "1.2", optional = true }
+
+# blocking
+futures-core-preview = { version = "=0.3.0-alpha.19", optional = true }
 lazy_static = { version = "1", optional = true }
-slab = { version = "0.4.1", optional = true }
 
 [dev-dependencies]
 tokio = { version = "=0.2.0-alpha.6", path = "../tokio" }
+tokio-sync = { version = "=0.2.0-alpha.6", path = "../tokio-sync" }
 tokio-test = { version = "=0.2.0-alpha.6", path = "../tokio-test" }
 
 futures-core-preview = "=0.3.0-alpha.19"
+loom = { version = "0.2.9", features = ["futures", "checkpoint"] }
 rand = "0.7"
 
 [package.metadata.docs.rs]

diff --git a/tokio-executor/benches/blocking.rs b/tokio-executor/benches/blocking.rs
diff --git a/tokio-executor/benches/thread_pool.rs b/tokio-executor/benches/thread_pool.rs
@@ -0,0 +1,161 @@
+#![feature(test)]
+
+extern crate test;
+
+use tokio_executor::thread_pool::{Builder, Spawner, ThreadPool};
+use tokio_sync::oneshot;
+
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::atomic::AtomicUsize;
+use std::sync::atomic::Ordering::Relaxed;
+use std::sync::{mpsc, Arc};
+use std::task::{Context, Poll};
+
+struct Backoff(usize);
+
+impl Future for Backoff {
+    type Output = ();
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
+        if self.0 == 0 {
+            Poll::Ready(())
+        } else {
+            self.0 -= 1;
+            cx.waker().wake_by_ref();
+            Poll::Pending
+        }
+    }
+}
+
+const NUM_THREADS: usize = 6;
+
+#[bench]
+fn spawn_many(b: &mut test::Bencher) {
+    const NUM_SPAWN: usize = 10_000;
+
+    let threadpool = Builder::new().num_threads(NUM_THREADS).build();
+
+    let (tx, rx) = mpsc::sync_channel(1000);
+    let rem = Arc::new(AtomicUsize::new(0));
+
+    b.iter(|| {
+        rem.store(NUM_SPAWN, Relaxed);
+
+        for _ in 0..NUM_SPAWN {
+            let tx = tx.clone();
+            let rem = rem.clone();
+
+            threadpool.spawn(async move {
+                if 1 == rem.fetch_sub(1, Relaxed) {
+                    tx.send(()).unwrap();
+                }
+            });
+        }
+
+        let _ = rx.recv().unwrap();
+    });
+}
+
+#[bench]
+fn yield_many(b: &mut test::Bencher) {
+    const NUM_YIELD: usize = 1_000;
+    const TASKS_PER_CPU: usize = 50;
+
+    let threadpool = Builder::new().num_threads(NUM_THREADS).build();
+
+    let tasks = TASKS_PER_CPU * num_cpus::get_physical();
+    let (tx, rx) = mpsc::sync_channel(tasks);
+
+    b.iter(move || {
+        for _ in 0..tasks {
+            let tx = tx.clone();
+
+            threadpool.spawn(async move {
+                let backoff = Backoff(NUM_YIELD);
+                backoff.await;
+                tx.send(()).unwrap();
+            });
+        }
+
+        for _ in 0..tasks {
+            let _ = rx.recv().unwrap();
+        }
+    });
+}
+
+#[bench]
+fn ping_pong(b: &mut test::Bencher) {
+    const NUM_PINGS: usize = 1_000;
+
+    let threadpool = Builder::new().num_threads(NUM_THREADS).build();
+
+    let (done_tx, done_rx) = mpsc::sync_channel(1000);
+    let rem = Arc::new(AtomicUsize::new(0));
+
+    b.iter(|| {
+        let done_tx = done_tx.clone();
+        let rem = rem.clone();
+        rem.store(NUM_PINGS, Relaxed);
+
+        let spawner = threadpool.spawner().clone();
+
+        threadpool.spawn(async move {
+            for _ in 0..NUM_PINGS {
+                let rem = rem.clone();
+                let done_tx = done_tx.clone();
+
+                let spawner2 = spawner.clone();
+
+                spawner.spawn(async move {
+                    let (tx1, rx1) = oneshot::channel();
+                    let (tx2, rx2) = oneshot::channel();
+
+                    spawner2.spawn(async move {
+                        rx1.await.unwrap();
+                        tx2.send(()).unwrap();
+                    });
+
+                    tx1.send(()).unwrap();
+                    rx2.await.unwrap();
+
+                    if 1 == rem.fetch_sub(1, Relaxed) {
+                        done_tx.send(()).unwrap();
+                    }
+                });
+            }
+        });
+
+        done_rx.recv().unwrap();
+    });
+}
+
+#[bench]
+fn chained_spawn(b: &mut test::Bencher) {
+    const ITER: usize = 1_000;
+
+    let threadpool = Builder::new().num_threads(NUM_THREADS).build();
+
+    fn iter(spawner: Spawner, done_tx: mpsc::SyncSender<()>, n: usize) {
+        if n == 0 {
+            done_tx.send(()).unwrap();
+        } else {
+            let s2 = spawner.clone();
+            spawner.spawn(async move {
+                iter(s2, done_tx, n - 1);
+            });
+        }
+    }
+
+    let (done_tx, done_rx) = mpsc::sync_channel(1000);
+
+    b.iter(move || {
+        let done_tx = done_tx.clone();
+        let spawner = threadpool.spawner().clone();
+        threadpool.spawn(async move {
+            iter(spawner, done_tx, ITER);
+        });
+
+        done_rx.recv().unwrap();
+    });
+}