diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..1f02ed3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,37 @@ +--- +name: Bug report +about: Found a bug? Report it here. +title: "[BUG]" +labels: bug +assignees: bshoshany + +--- + +**Describe the bug** + +A clear and concise description of what the bug is. + +**Minimal working example** + +A short but complete program that can be compiled to reproduce the error. Paste the program between the two code fences. If it's too long or requires multiple files, attach the file(s) instead. + +```cpp +``` + +**Behavior** + +What behavior did you expect to get? What actually happened? If the code failed to compile, please include the full output of the compiler. + +**System information** + +* CPU model, architecture, # of cores and threads: +* Operating system: +* Name and version of C++ compiler: +* Full command used for compiling, including all compiler flags: +* Thread pool library version: + +(Please note that only the latest version of the thread pool library is supported.) + +**Additional information** + +Include any additional information here. diff --git a/.github/ISSUE_TEMPLATE/failed-tests.md b/.github/ISSUE_TEMPLATE/failed-tests.md new file mode 100644 index 0000000..1d20f23 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/failed-tests.md @@ -0,0 +1,26 @@ +--- +name: Failed tests +about: The provided automated tests failed on your system? Report it here. +title: "[TEST]" +labels: bug +assignees: bshoshany + +--- + +**System information** + +* CPU model, architecture, # of cores and threads: +* Operating system: +* Name and version of C++ compiler: +* Full command used for compiling, including all compiler flags: +* Thread pool library version: + +(Please note that only the latest version of the thread pool library is supported.) + +**Log file** + +Please attach the log file generated by the automated test program to this issue. 
+ +**Additional information** + +Include any additional information here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..7137cea --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,23 @@ +--- +name: Feature request +about: Want a new feature? Suggest it here. +title: "[REQ]" +labels: enhancement +assignees: bshoshany + +--- + +**Describe the new feature** + +A clear and concise description of the feature you want. + +**Code example** + +An example of code that utilizes the suggested feature. Paste or write it between the two code fences. + +```cpp +``` + +**Additional information** + +Include any additional information here. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..cabbe5a --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,20 @@ +**Pull request policy (please read)** + +> Contributions are always welcome. However, I release my projects in cumulative updates after editing and testing them locally on my system, so my policy is not to accept any pull requests. If you open a pull request, and I decide to incorporate your suggestion into the project, I will first modify your code to comply with the project's coding conventions (formatting, syntax, naming, comments, programming practices, etc.), and perform some tests to ensure that the change doesn't break anything. I will then merge it into the next release of the project, possibly together with some other changes. The new release will also include a note in `CHANGELOG.md` with a link to your pull request, and modifications to the documentation in `README.md` as needed. + +**Describe the changes** + +What does your pull request fix or add to the library? + +**Testing** + +Have you tested the new code using the provided automated test program and/or performed any other tests to ensure that it works correctly? 
If so, please provide information about the test system(s): + +* CPU model, architecture, # of cores and threads: +* Operating system: +* Name and version of C++ compiler: +* Full command used for compiling, including all compiler flags: + +**Additional information** + +Include any additional information here. diff --git a/BS_thread_pool.hpp b/BS_thread_pool.hpp new file mode 100644 index 0000000..b06700c --- /dev/null +++ b/BS_thread_pool.hpp @@ -0,0 +1,516 @@ +#pragma once + +/** + * @file BS_thread_pool.hpp + * @author Barak Shoshany (baraksh@gmail.com) (http://baraksh.com) + * @version 3.0.0 + * @date 2022-05-30 + * @copyright Copyright (c) 2022 Barak Shoshany. Licensed under the MIT license. If you use this library in software of any kind, please provide a link to the GitHub repository https://github.com/bshoshany/thread-pool in the source code and documentation. If you use this library in published research, please cite it as follows: Barak Shoshany, "A C++17 Thread Pool for High-Performance Scientific Computing", doi:10.5281/zenodo.4742687, arXiv:2105.00613 (May 2021) + * + * @brief BS::thread_pool: a fast, lightweight, and easy-to-use C++17 thread pool library. This header file contains the entire library, including the main BS::thread_pool class and the helper classes BS::multi_future, BS:synced_stream, and BS::timer. 
// ============================================================================================= //
// BS::thread_pool v3.0.0 — reconstructed from a garbled diff. The original text lost every
// angle-bracketed token (#include <...> header names, template parameter lists, template
// arguments such as std::vector<T>) as well as its newlines; all such tokens are restored
// below. One genuine concurrency bug is also fixed — see NOTE(fix) in worker().
// ============================================================================================= //

#define BS_THREAD_POOL_VERSION "v3.0.0 (2022-05-30)"

#include <atomic>             // std::atomic
#include <chrono>             // std::chrono
#include <condition_variable> // std::condition_variable
#include <exception>          // std::current_exception
#include <functional>         // std::function
#include <future>             // std::future, std::promise
#include <iostream>           // std::cout, std::ostream
#include <memory>             // std::make_shared, std::make_unique, std::shared_ptr, std::unique_ptr
#include <mutex>              // std::mutex, std::scoped_lock, std::unique_lock
#include <queue>              // std::queue
#include <thread>             // std::thread
#include <type_traits>        // std::common_type_t, std::decay_t, std::invoke_result_t, std::is_void_v
#include <utility>            // std::move, std::swap
#include <vector>             // std::vector

namespace BS
{
// The unsigned integer type the standard library uses to report the hardware concurrency.
using concurrency_t = std::invoke_result_t<decltype(std::thread::hardware_concurrency)>;

// ============================================================================================= //
//                                   Begin class multi_future                                    //

/**
 * @brief A helper class to facilitate waiting for and/or getting the results of multiple futures at once.
 */
template <typename T>
class multi_future
{
public:
    /**
     * @brief Construct a multi_future object with the given number of futures.
     *
     * @param num_futures_ The desired number of futures to store.
     */
    explicit multi_future(const size_t num_futures_ = 0) : f(num_futures_) {}

    /**
     * @brief Get the results from all the futures stored in this multi_future object.
     *
     * @return A vector containing the results.
     */
    std::vector<T> get()
    {
        std::vector<T> results(f.size());
        for (size_t i = 0; i < f.size(); ++i)
            results[i] = f[i].get();
        return results;
    }

    /**
     * @brief Wait for all the futures stored in this multi_future object.
     */
    void wait() const
    {
        for (size_t i = 0; i < f.size(); ++i)
            f[i].wait();
    }

    /**
     * @brief A vector to store the futures.
     */
    std::vector<std::future<T>> f;
};

//                                    End class multi_future                                     //
// ============================================================================================= //

// ============================================================================================= //
//                                   Begin class thread_pool                                     //

/**
 * @brief A fast, lightweight, and easy-to-use C++17 thread pool class.
 */
class thread_pool
{
public:
    // ============================
    // Constructors and destructors
    // ============================

    /**
     * @brief Construct a new thread pool.
     *
     * @param thread_count_ The number of threads to use. The default value is the total number of hardware threads available, as reported by the implementation. This is usually determined by the number of cores in the CPU. If a core is hyperthreaded, it will count as two threads. A value of zero also falls back to the hardware concurrency.
     */
    explicit thread_pool(const concurrency_t thread_count_ = std::thread::hardware_concurrency()) : thread_count(thread_count_ ? thread_count_ : std::thread::hardware_concurrency()), threads(std::make_unique<std::thread[]>(thread_count_ ? thread_count_ : std::thread::hardware_concurrency()))
    {
        create_threads();
    }

    /**
     * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads. Note that if the variable paused is set to true, then any tasks still in the queue will never be executed.
     */
    ~thread_pool()
    {
        wait_for_tasks();
        destroy_threads();
    }

    // =======================
    // Public member functions
    // =======================

    /**
     * @brief Get the number of tasks currently waiting in the queue to be executed by the threads.
     *
     * @return The number of queued tasks.
     */
    size_t get_tasks_queued() const
    {
        const std::scoped_lock tasks_lock(tasks_mutex);
        return tasks.size();
    }

    /**
     * @brief Get the number of tasks currently being executed by the threads.
     *
     * @return The number of running tasks.
     */
    size_t get_tasks_running() const
    {
        const std::scoped_lock tasks_lock(tasks_mutex);
        return tasks_total - tasks.size();
    }

    /**
     * @brief Get the total number of unfinished tasks: either still in the queue, or running in a thread. Note that get_tasks_total() == get_tasks_queued() + get_tasks_running().
     *
     * @return The total number of tasks.
     */
    size_t get_tasks_total() const
    {
        return tasks_total;
    }

    /**
     * @brief Get the number of threads in the pool.
     *
     * @return The number of threads.
     */
    concurrency_t get_thread_count() const
    {
        return thread_count;
    }

    /**
     * @brief Parallelize a loop by automatically splitting it into blocks and submitting each block separately to the queue.
     *
     * @tparam F The type of the function to loop through.
     * @tparam T1 The type of the first index in the loop. Should be a signed or unsigned integer.
     * @tparam T2 The type of the index after the last index in the loop. Should be a signed or unsigned integer. If T1 is not the same as T2, a common type will be automatically inferred.
     * @tparam T The common type of T1 and T2.
     * @tparam R The return value of the loop function F (can be void).
     * @param first_index The first index in the loop.
     * @param index_after_last The index after the last index in the loop. The loop will iterate from first_index to (index_after_last - 1) inclusive. In other words, it will be equivalent to "for (T i = first_index; i < index_after_last; ++i)". Note that if first_index == index_after_last, no blocks will be submitted.
     * @param loop The function to loop through. Will be called once per block. Should take exactly two arguments: the first index in the block and the index after the last index in the block. loop(start, end) should typically involve a loop of the form "for (T i = start; i < end; ++i)".
     * @param num_blocks The maximum number of blocks to split the loop into. The default is to use the number of threads in the pool.
     * @return A multi_future object that can be used to wait for all the blocks to finish. If the loop function returns a value, the multi_future object can be used to obtain the values returned by each block.
     */
    template <typename F, typename T1, typename T2, typename T = std::common_type_t<T1, T2>, typename R = std::invoke_result_t<std::decay_t<F>, T, T>>
    multi_future<R> parallelize_loop(const T1& first_index, const T2& index_after_last, const F& loop, size_t num_blocks = 0)
    {
        T first_index_T = static_cast<T>(first_index);
        T index_after_last_T = static_cast<T>(index_after_last);
        if (first_index_T == index_after_last_T)
            return multi_future<R>();
        if (index_after_last_T < first_index_T)
            std::swap(index_after_last_T, first_index_T);
        if (num_blocks == 0)
            num_blocks = thread_count;
        const size_t total_size = static_cast<size_t>(index_after_last_T - first_index_T);
        size_t block_size = static_cast<size_t>(total_size / num_blocks);
        if (block_size == 0)
        {
            block_size = 1;
            num_blocks = total_size > 1 ? total_size : 1;
        }
        multi_future<R> mf(num_blocks);
        for (size_t i = 0; i < num_blocks; ++i)
        {
            const T start = (static_cast<T>(i * block_size) + first_index_T);
            const T end = (i == num_blocks - 1) ? index_after_last_T : (static_cast<T>((i + 1) * block_size) + first_index_T);
            mf.f[i] = submit(loop, start, end);
        }
        return mf;
    }

    /**
     * @brief Push a function with zero or more arguments, but no return value, into the task queue.
     *
     * @tparam F The type of the function.
     * @tparam A The types of the arguments.
     * @param task The function to push.
     * @param args The arguments to pass to the function.
     */
    template <typename F, typename... A>
    void push_task(const F& task, const A&... args)
    {
        {
            const std::scoped_lock tasks_lock(tasks_mutex);
            if constexpr (sizeof...(args) == 0)
                tasks.push(std::function<void()>(task));
            else
                tasks.push(std::function<void()>([task, args...] { task(args...); }));
        }
        ++tasks_total;
        task_available_cv.notify_one();
    }

    /**
     * @brief Reset the number of threads in the pool. Waits for all currently running tasks to be completed, then destroys all threads in the pool and creates a new thread pool with the new number of threads. Any tasks that were waiting in the queue before the pool was reset will then be executed by the new threads. If the pool was paused before resetting it, the new pool will be paused as well.
     *
     * @param thread_count_ The number of threads to use. The default value is the total number of hardware threads available, as reported by the implementation. This is usually determined by the number of cores in the CPU. If a core is hyperthreaded, it will count as two threads.
     */
    void reset(const concurrency_t thread_count_ = std::thread::hardware_concurrency())
    {
        const bool was_paused = paused;
        paused = true;
        wait_for_tasks();
        destroy_threads();
        thread_count = thread_count_ ? thread_count_ : std::thread::hardware_concurrency();
        threads = std::make_unique<std::thread[]>(thread_count);
        paused = was_paused;
        create_threads();
    }

    /**
     * @brief Submit a function with zero or more arguments into the task queue. If the function has a return value, get a future for the eventual returned value. If the function has no return value, get an std::future<void> which can be used to wait until the task finishes.
     *
     * @tparam F The type of the function.
     * @tparam A The types of the zero or more arguments to pass to the function.
     * @tparam R The return type of the function (can be void).
     * @param task The function to submit.
     * @param args The zero or more arguments to pass to the function.
     * @return A future to be used later to wait for the function to finish executing and/or obtain its returned value if it has one.
     */
    template <typename F, typename... A, typename R = std::invoke_result_t<std::decay_t<F>, std::decay_t<A>...>>
    std::future<R> submit(const F& task, const A&... args)
    {
        std::shared_ptr<std::promise<R>> task_promise = std::make_shared<std::promise<R>>();
        push_task(
            [task, args..., task_promise]
            {
                try
                {
                    if constexpr (std::is_void_v<R>)
                    {
                        task(args...);
                        task_promise->set_value();
                    }
                    else
                    {
                        task_promise->set_value(task(args...));
                    }
                }
                catch (...)
                {
                    try
                    {
                        // Propagate the task's exception through the future. set_exception itself
                        // can throw (e.g. if the promise is somehow already satisfied); swallow
                        // that to keep the worker thread alive.
                        task_promise->set_exception(std::current_exception());
                    }
                    catch (...)
                    {
                    }
                }
            });
        return task_promise->get_future();
    }

    /**
     * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those that are currently running in the threads and those that are still waiting in the queue. However, if the pool is paused, this function only waits for the currently running tasks (otherwise it would wait forever). Note: To wait for just one specific task, use submit() instead, and call the wait() member function of the generated future.
     */
    void wait_for_tasks()
    {
        waiting = true;
        std::unique_lock<std::mutex> tasks_lock(tasks_mutex);
        task_done_cv.wait(tasks_lock, [this] { return (tasks_total == (paused ? tasks.size() : 0)); });
        waiting = false;
    }

    // ===========
    // Public data
    // ===========

    /**
     * @brief An atomic variable indicating whether the workers should pause. When set to true, the workers temporarily stop retrieving new tasks out of the queue, although any tasks already executed will keep running until they are finished. Set to false again to resume retrieving tasks.
     */
    std::atomic<bool> paused = false;

private:
    // ========================
    // Private member functions
    // ========================

    /**
     * @brief Create the threads in the pool and assign a worker to each thread.
     */
    void create_threads()
    {
        running = true;
        for (concurrency_t i = 0; i < thread_count; ++i)
        {
            threads[i] = std::thread(&thread_pool::worker, this);
        }
    }

    /**
     * @brief Destroy the threads in the pool.
     */
    void destroy_threads()
    {
        running = false;
        task_available_cv.notify_all();
        for (concurrency_t i = 0; i < thread_count; ++i)
        {
            threads[i].join();
        }
    }

    /**
     * @brief A worker function to be assigned to each thread in the pool. Waits until it is notified by push_task() that a task is available, and then retrieves the task from the queue and executes it. Once the task finishes, the worker notifies wait_for_tasks() in case it is waiting.
     */
    void worker()
    {
        while (running)
        {
            std::function<void()> task;
            std::unique_lock<std::mutex> tasks_lock(tasks_mutex);
            task_available_cv.wait(tasks_lock, [&] { return !tasks.empty() || !running; });
            if (running && !paused)
            {
                task = std::move(tasks.front());
                tasks.pop();
                tasks_lock.unlock();
                task();
                // NOTE(fix): the original decremented tasks_total and notified task_done_cv
                // WITHOUT holding tasks_mutex. wait_for_tasks() evaluates its predicate under
                // that mutex, so an unlocked decrement+notify could land between the waiter's
                // predicate check and its actual sleep — a classic lost wakeup that deadlocks
                // wait_for_tasks(). Re-acquiring the lock before the decrement and notify makes
                // the handoff race-free (the upstream project shipped the same fix in a later
                // release). tasks_lock is destroyed at the end of this iteration, releasing
                // the mutex before the next wait.
                tasks_lock.lock();
                --tasks_total;
                if (waiting)
                    task_done_cv.notify_one();
            }
        }
    }

    // ============
    // Private data
    // ============

    /**
     * @brief An atomic variable indicating to the workers to keep running. When set to false, the workers permanently stop working.
     */
    std::atomic<bool> running = false;

    /**
     * @brief A condition variable used to notify worker() that a new task has become available.
     */
    std::condition_variable task_available_cv = {};

    /**
     * @brief A condition variable used to notify wait_for_tasks() that a task is done.
     */
    std::condition_variable task_done_cv = {};

    /**
     * @brief A queue of tasks to be executed by the threads.
     */
    std::queue<std::function<void()>> tasks = {};

    /**
     * @brief An atomic variable to keep track of the total number of unfinished tasks - either still in the queue, or running in a thread.
     */
    std::atomic<size_t> tasks_total = 0;

    /**
     * @brief A mutex to synchronize access to the task queue by different threads.
     */
    mutable std::mutex tasks_mutex = {};

    /**
     * @brief The number of threads in the pool.
     */
    concurrency_t thread_count = 0;

    /**
     * @brief A smart pointer to manage the memory allocated for the threads.
     */
    std::unique_ptr<std::thread[]> threads = nullptr;

    /**
     * @brief An atomic variable indicating that wait_for_tasks() is active and expects to be notified whenever a task is done.
     */
    std::atomic<bool> waiting = false;
};

//                                    End class thread_pool                                      //
// ============================================================================================= //

// ============================================================================================= //
//                                   Begin class synced_stream                                   //

/**
 * @brief A helper class to synchronize printing to an output stream by different threads.
 */
class synced_stream
{
public:
    /**
     * @brief Construct a new synced stream.
     *
     * @param out_stream_ The output stream to print to. The default value is std::cout.
     */
    explicit synced_stream(std::ostream& out_stream_ = std::cout) : out_stream(out_stream_) {}

    /**
     * @brief Print any number of items into the output stream. Ensures that no other threads print to this stream simultaneously, as long as they all exclusively use the same synced_stream object to print.
     *
     * @tparam T The types of the items.
     * @param items The items to print.
     */
    template <typename... T>
    void print(const T&... items)
    {
        const std::scoped_lock lock(stream_mutex);
        (out_stream << ... << items);
    }

    /**
     * @brief Print any number of items into the output stream, followed by a newline character. Ensures that no other threads print to this stream simultaneously, as long as they all exclusively use the same synced_stream object to print.
     *
     * @tparam T The types of the items.
     * @param items The items to print.
     */
    template <typename... T>
    void println(const T&... items)
    {
        print(items..., '\n');
    }

private:
    /**
     * @brief The output stream to print to.
     */
    std::ostream& out_stream;

    /**
     * @brief A mutex to synchronize printing.
     */
    mutable std::mutex stream_mutex = {};
};

//                                   End class synced_stream                                     //
// ============================================================================================= //

// ============================================================================================= //
//                                      Begin class timer                                        //

/**
 * @brief A helper class to measure execution time for benchmarking purposes.
 */
class timer
{
public:
    /**
     * @brief Start (or restart) measuring time.
     */
    void start()
    {
        start_time = std::chrono::steady_clock::now();
    }

    /**
     * @brief Stop measuring time and store the elapsed time since start().
     */
    void stop()
    {
        elapsed_time = std::chrono::steady_clock::now() - start_time;
    }

    /**
     * @brief Get the number of milliseconds that have elapsed between start() and stop().
     *
     * @return The number of milliseconds.
     */
    std::chrono::milliseconds::rep ms() const
    {
        return (std::chrono::duration_cast<std::chrono::milliseconds>(elapsed_time)).count();
    }

private:
    /**
     * @brief The time point when measuring started.
     */
    std::chrono::time_point<std::chrono::steady_clock> start_time = std::chrono::steady_clock::now();

    /**
     * @brief The duration that has elapsed between start() and stop().
     */
    std::chrono::duration<double> elapsed_time = std::chrono::duration<double>::zero();
};

//                                       End class timer                                         //
// ============================================================================================= //

} // namespace BS

// ============================================================================================= //
// Begin file: BS_thread_pool_test.cpp (header comment reconstructed as line comments; the file
// continues in the next chunk)
//
// @file BS_thread_pool_test.cpp
// @author Barak Shoshany (baraksh@gmail.com) (http://baraksh.com)
// @version 3.0.0
// @date 2022-05-30
// @copyright Copyright (c) 2022 Barak Shoshany. Licensed under the MIT license.
If you use this library in software of any kind, please provide a link to the GitHub repository https://github.com/bshoshany/thread-pool in the source code and documentation. If you use this library in published research, please cite it as follows: Barak Shoshany, "A C++17 Thread Pool for High-Performance Scientific Computing", doi:10.5281/zenodo.4742687, arXiv:2105.00613 (May 2021) + * + * @brief BS::thread_pool: a fast, lightweight, and easy-to-use C++17 thread pool library. This program tests all aspects of the library, but is not needed in order to use the library. + */ + +// Get rid of annoying MSVC warning. +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#endif + +#include // std::min, std::min_element, std::sort, std::unique +#include // std::atomic +#include // std::chrono +#include // std::abs, std::llround, std::round, std::sqrt +#include // std::localtime, std::strftime, std::time_t +#include // std::exception +#include // std::ofstream +#include // std::future +#include // std::setprecision, std::setw +#include // std::fixed +#include // std::cout +#include // std::numeric_limits +#include // std::mt19937_64, std::random_device, std::uniform_int_distribution, std::uniform_real_distribution +#include // std::runtime_error +#include // std::string, std::to_string +#include // std::this_thread, std::thread +#include // std::pair +#include // std::begin, std::end, std::vector + +// Include the header file for the thread pool library. +#include "BS_thread_pool.hpp" + +// ================ +// Global variables +// ================ + +// Set to false to disable output to a log file. +constexpr bool output_log = true; + +// Set to false to disable testing. +constexpr bool enable_tests = true; + +// Set to false to disable the benchmarks. +constexpr bool enable_benchmarks = true; + +// Two global synced_streams objects. One prints to std::cout, and the other to a file. 
+BS::synced_stream sync_cout(std::cout); +std::ofstream log_file; +BS::synced_stream sync_file(log_file); + +// A global thread pool object to be used throughout the test. +BS::thread_pool pool; + +// A global random_device object to be used to seed some random number generators. +std::random_device rd; + +// Global variables to measure how many checks succeeded and how many failed. +size_t tests_succeeded = 0; +size_t tests_failed = 0; + +// ================ +// Helper functions +// ================ + +/** + * @brief Print any number of items into both std::cout and the log file, syncing both independently. + * + * @tparam T The types of the items. + * @param items The items to print. + */ +template +void dual_print(const T&... items) +{ + sync_cout.print(items...); + if (output_log) + sync_file.print(items...); +} + +/** + * @brief Print any number of items into both std::cout and the log file, followed by a newline character, syncing both independently. + * + * @tparam T The types of the items. + * @param items The items to print. + */ +template +void dual_println(const T&... items) +{ + dual_print(items..., '\n'); +} + +/** + * @brief Print a stylized header. + * + * @param text The text of the header. Will appear between two lines. + * @param symbol The symbol to use for the lines. Default is '='. + */ +void print_header(const std::string& text, const char symbol = '=') +{ + dual_println(); + dual_println(std::string(text.length(), symbol)); + dual_println(text); + dual_println(std::string(text.length(), symbol)); +} + +/** + * @brief Get a string representing the current time. + * + * @return The string. + */ +std::string get_time() +{ + const std::time_t t = std::time(nullptr); + char time_string[32]; + std::strftime(time_string, sizeof(time_string), "%Y-%m-%d_%H.%M.%S", std::localtime(&t)); + return std::string(time_string); +} + +/** + * @brief Check if a condition is met, report the result, and keep count of the total number of successes and failures. 
+ * + * @param condition The condition to check. + */ +void check(const bool condition) +{ + if (condition) + { + dual_println("-> PASSED!"); + ++tests_succeeded; + } + else + { + dual_println("-> FAILED!"); + ++tests_failed; + } +} + +// ========================================= +// Functions to verify the number of threads +// ========================================= + +/** + * @brief Store the ID of the current thread in memory. Waits for a short time to ensure it does not get evaluated by more than one thread. + * + * @param location A pointer to the location where the thread ID should be stored. + */ +void store_ID(std::thread::id* location) +{ + *location = std::this_thread::get_id(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); +} + +/** + * @brief Count the number of unique threads in the thread pool to ensure that the correct number of individual threads was created. Pushes a number of tasks equal to four times the thread count into the thread pool, and count the number of unique thread IDs returned by the tasks. + */ +BS::concurrency_t count_unique_threads() +{ + std::vector thread_IDs(pool.get_thread_count() * 4); + BS::multi_future futures; + for (std::thread::id& id : thread_IDs) + futures.f.push_back(pool.submit(store_ID, &id)); + futures.wait(); + std::sort(thread_IDs.begin(), thread_IDs.end()); + BS::concurrency_t unique_threads = (BS::concurrency_t)(std::unique(thread_IDs.begin(), thread_IDs.end()) - thread_IDs.begin()); + return unique_threads; +} + +/** + * @brief Check that the constructor works. 
+ */ +void check_constructor() +{ + dual_println("Checking that the thread pool reports a number of threads equal to the hardware concurrency..."); + check(pool.get_thread_count() == std::thread::hardware_concurrency()); + dual_println("Checking that the manually counted number of unique thread IDs is equal to the reported number of threads..."); + check(pool.get_thread_count() == count_unique_threads()); +} + +/** + * @brief Check that reset() works. + */ +void check_reset() +{ + pool.reset(std::thread::hardware_concurrency() / 2); + dual_println("Checking that after reset() the thread pool reports a number of threads equal to half the hardware concurrency..."); + check(pool.get_thread_count() == std::thread::hardware_concurrency() / 2); + dual_println("Checking that after reset() the manually counted number of unique thread IDs is equal to the reported number of threads..."); + check(pool.get_thread_count() == count_unique_threads()); + pool.reset(std::thread::hardware_concurrency()); + dual_println("Checking that after a second reset() the thread pool reports a number of threads equal to the hardware concurrency..."); + check(pool.get_thread_count() == std::thread::hardware_concurrency()); + dual_println("Checking that after a second reset() the manually counted number of unique thread IDs is equal to the reported number of threads..."); + check(pool.get_thread_count() == count_unique_threads()); +} + +// ======================================= +// Functions to verify submission of tasks +// ======================================= + +/** + * @brief Check that push_task() works. 
+ */ +void check_push_task() +{ + dual_println("Checking that push_task() works for a function with no arguments or return value..."); + { + bool flag = false; + pool.push_task([&flag] { flag = true; }); + pool.wait_for_tasks(); + check(flag); + } + dual_println("Checking that push_task() works for a function with one argument and no return value..."); + { + bool flag = false; + pool.push_task([](bool* flag_) { *flag_ = true; }, &flag); + pool.wait_for_tasks(); + check(flag); + } + dual_println("Checking that push_task() works for a function with two arguments and no return value..."); + { + bool flag1 = false; + bool flag2 = false; + pool.push_task([](bool* flag1_, bool* flag2_) { *flag1_ = *flag2_ = true; }, &flag1, &flag2); + pool.wait_for_tasks(); + check(flag1 && flag2); + } +} + +/** + * @brief Check that submit() works. + */ +void check_submit() +{ + dual_println("Checking that submit() works for a function with no arguments or return value..."); + { + bool flag = false; + pool.submit([&flag] { flag = true; }).wait(); + check(flag); + } + dual_println("Checking that submit() works for a function with one argument and no return value..."); + { + bool flag = false; + pool.submit([](bool* flag_) { *flag_ = true; }, &flag).wait(); + check(flag); + } + dual_println("Checking that submit() works for a function with two arguments and no return value..."); + { + bool flag1 = false; + bool flag2 = false; + pool.submit([](bool* flag1_, bool* flag2_) { *flag1_ = *flag2_ = true; }, &flag1, &flag2).wait(); + check(flag1 && flag2); + } + dual_println("Checking that submit() works for a function with no arguments and a return value..."); + { + bool flag = false; + std::future my_future = pool.submit( + [&flag] + { + flag = true; + return 42; + }); + check(my_future.get() == 42 && flag); + } + dual_println("Checking that submit() works for a function with one argument and a return value..."); + { + bool flag = false; + std::future my_future = pool.submit( + [](bool* flag_) 
+ { + *flag_ = true; + return 42; + }, + &flag); + check(my_future.get() == 42 && flag); + } + dual_println("Checking that submit() works for a function with two arguments and a return value..."); + { + bool flag1 = false; + bool flag2 = false; + std::future my_future = pool.submit( + [](bool* flag1_, bool* flag2_) + { + *flag1_ = *flag2_ = true; + return 42; + }, + &flag1, &flag2); + check(my_future.get() == 42 && flag1 && flag2); + } +} + +/** + * @brief Check that wait_for_tasks() works. + */ +void check_wait_for_tasks() +{ + const BS::concurrency_t n = pool.get_thread_count() * 10; + std::vector> flags(n); + for (BS::concurrency_t i = 0; i < n; ++i) + pool.push_task( + [&flags, i] + { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + flags[i] = true; + }); + pool.wait_for_tasks(); + bool all_flags = true; + for (BS::concurrency_t i = 0; i < n; ++i) + all_flags = all_flags && flags[i]; + check(all_flags); +} + +// ======================================== +// Functions to verify loop parallelization +// ======================================== + +/** + * @brief Check that parallelize_loop() works for a specific number of indices split over a specific number of tasks, with no return value. + * + * @param start The first index in the loop. + * @param end The last index in the loop plus 1. + * @param num_tasks The number of tasks. + */ +void check_parallelize_loop_no_return(const int64_t random_start, int64_t random_end, const BS::concurrency_t num_tasks) +{ + if (random_start == random_end) + ++random_end; + dual_println("Verifying that a loop from ", random_start, " to ", random_end, " with ", num_tasks, num_tasks == 1 ? 
" task" : " tasks", " modifies all indices..."); + const size_t num_indices = static_cast(std::abs(random_end - random_start)); + const int64_t offset = std::min(random_start, random_end); + std::vector> flags(num_indices); + pool.parallelize_loop( + random_start, random_end, + [&flags, offset](const int64_t start, const int64_t end) + { + for (int64_t i = start; i < end; ++i) + flags[(size_t)(i - offset)] = true; + }, + num_tasks) + .wait(); + bool all_flags = true; + for (size_t i = 0; i < num_indices; ++i) + all_flags = all_flags && flags[i]; + check(all_flags); +} + +/** + * @brief Check that parallelize_loop() works for a specific number of indices split over a specific number of tasks, with a return value. + * + * @param start The first index in the loop. + * @param end The last index in the loop plus 1. + * @param num_tasks The number of tasks. + */ +void check_parallelize_loop_return(const int64_t random_start, int64_t random_end, const BS::concurrency_t num_tasks) +{ + if (random_start == random_end) + ++random_end; + dual_println("Verifying that a loop from ", random_start, " to ", random_end, " with ", num_tasks, num_tasks == 1 ? " task" : " tasks", " correctly sums all indices..."); + const std::vector sums_vector = pool.parallelize_loop( + random_start, random_end, + [](const int64_t start, const int64_t end) + { + int64_t total = 0; + for (int64_t i = start; i < end; ++i) + total += i; + return total; + }, + num_tasks) + .get(); + int64_t sum = 0; + for (const int64_t& s : sums_vector) + sum += s; + check(sum * 2 == std::abs(random_start - random_end) * (random_start + random_end - 1)); +} + +/** + * @brief Check that parallelize_loop() works using several different random values for the range of indices and number of tasks. 
+ */ +void check_parallelize_loop() +{ + std::mt19937_64 mt(rd()); + std::uniform_int_distribution index_dist(-1000000, 1000000); + std::uniform_int_distribution task_dist(1, pool.get_thread_count()); + constexpr uint64_t n = 10; + for (uint64_t i = 0; i < n; ++i) + check_parallelize_loop_no_return(index_dist(mt), index_dist(mt), task_dist(mt)); + for (uint64_t i = 0; i < n; ++i) + check_parallelize_loop_return(index_dist(mt), index_dist(mt), task_dist(mt)); +} + +// =============================================== +// Functions to verify task monitoring and control +// =============================================== + +/** + * @brief Check that task monitoring works. + */ +void check_task_monitoring() +{ + BS::concurrency_t n = std::min(std::thread::hardware_concurrency(), 4); + dual_println("Resetting pool to ", n, " threads."); + pool.reset(n); + dual_println("Submitting ", n * 3, " tasks."); + std::vector> release(n * 3); + for (BS::concurrency_t i = 0; i < n * 3; ++i) + pool.push_task( + [&release, i] + { + while (!release[i]) + std::this_thread::yield(); + dual_println("Task ", i, " released."); + }); + constexpr std::chrono::milliseconds sleep_time(300); + std::this_thread::sleep_for(sleep_time); + dual_println("After submission, should have: ", n * 3, " tasks total, ", n, " tasks running, ", n * 2, " tasks queued..."); + check(pool.get_tasks_total() == n * 3 && pool.get_tasks_running() == n && pool.get_tasks_queued() == n * 2); + for (BS::concurrency_t i = 0; i < n; ++i) + release[i] = true; + std::this_thread::sleep_for(sleep_time); + dual_println("After releasing ", n, " tasks, should have: ", n * 2, " tasks total, ", n, " tasks running, ", n, " tasks queued..."); + check(pool.get_tasks_total() == n * 2 && pool.get_tasks_running() == n && pool.get_tasks_queued() == n); + for (BS::concurrency_t i = n; i < n * 2; ++i) + release[i] = true; + std::this_thread::sleep_for(sleep_time); + dual_println("After releasing ", n, " more tasks, should have: ", n, " tasks 
total, ", n, " tasks running, ", 0, " tasks queued..."); + check(pool.get_tasks_total() == n && pool.get_tasks_running() == n && pool.get_tasks_queued() == 0); + for (BS::concurrency_t i = n * 2; i < n * 3; ++i) + release[i] = true; + std::this_thread::sleep_for(sleep_time); + dual_println("After releasing the final ", n, " tasks, should have: ", 0, " tasks total, ", 0, " tasks running, ", 0, " tasks queued..."); + check(pool.get_tasks_total() == 0 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == 0); + dual_println("Resetting pool to ", std::thread::hardware_concurrency(), " threads."); + pool.reset(std::thread::hardware_concurrency()); +} + +/** + * @brief Check that pausing works. + */ +void check_pausing() +{ + BS::concurrency_t n = std::min(std::thread::hardware_concurrency(), 4); + dual_println("Resetting pool to ", n, " threads."); + pool.reset(n); + dual_println("Pausing pool."); + pool.paused = true; + dual_println("Submitting ", n * 3, " tasks, each one waiting for 200ms."); + for (BS::concurrency_t i = 0; i < n * 3; ++i) + pool.push_task( + [i] + { + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + dual_println("Task ", i, " done."); + }); + dual_println("Immediately after submission, should have: ", n * 3, " tasks total, ", 0, " tasks running, ", n * 3, " tasks queued..."); + check(pool.get_tasks_total() == n * 3 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n * 3); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + dual_println("300ms later, should still have: ", n * 3, " tasks total, ", 0, " tasks running, ", n * 3, " tasks queued..."); + check(pool.get_tasks_total() == n * 3 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n * 3); + dual_println("Unpausing pool."); + pool.paused = false; + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + dual_println("300ms later, should have: ", n * 2, " tasks total, ", n, " tasks running, ", n, " tasks queued..."); + 
check(pool.get_tasks_total() == n * 2 && pool.get_tasks_running() == n && pool.get_tasks_queued() == n); + dual_println("Pausing pool and using wait_for_tasks() to wait for the running tasks."); + pool.paused = true; + pool.wait_for_tasks(); + dual_println("After waiting, should have: ", n, " tasks total, ", 0, " tasks running, ", n, " tasks queued..."); + check(pool.get_tasks_total() == n && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + dual_println("200ms later, should still have: ", n, " tasks total, ", 0, " tasks running, ", n, " tasks queued..."); + check(pool.get_tasks_total() == n && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n); + dual_println("Unpausing pool and using wait_for_tasks() to wait for all tasks."); + pool.paused = false; + pool.wait_for_tasks(); + dual_println("After waiting, should have: ", 0, " tasks total, ", 0, " tasks running, ", 0, " tasks queued..."); + check(pool.get_tasks_total() == 0 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == 0); + dual_println("Resetting pool to ", std::thread::hardware_concurrency(), " threads."); + pool.reset(std::thread::hardware_concurrency()); +} + +// ====================================== +// Functions to verify exception handling +// ====================================== + +/** + * @brief Check that exception handling works. 
+ */ +void check_exceptions() +{ + bool caught = false; + std::future my_future = pool.submit([] { throw std::runtime_error("Exception thrown!"); }); + try + { + my_future.get(); + } + catch (const std::exception& e) + { + if (e.what() == std::string("Exception thrown!")) + caught = true; + } + check(caught); +} + +// ===================================== +// Functions to verify vector operations +// ===================================== + +/** + * @brief Check that parallelized vector operations work as expected by calculating the sum of two randomized vectors of a specific size in two ways, single-threaded and multithreaded, and comparing the results. + */ +void check_vector_of_size(const size_t vector_size, const BS::concurrency_t num_tasks) +{ + std::vector vector_1(vector_size); + std::vector vector_2(vector_size); + std::mt19937_64 mt(rd()); + std::uniform_int_distribution vector_dist(-1000000, 1000000); + for (size_t i = 0; i < vector_size; ++i) + { + vector_1[i] = vector_dist(mt); + vector_2[i] = vector_dist(mt); + } + dual_println("Adding two vectors with ", vector_size, " elements using ", num_tasks, " tasks..."); + std::vector sum_single(vector_size); + for (size_t i = 0; i < vector_size; ++i) + sum_single[i] = vector_1[i] + vector_2[i]; + std::vector sum_multi(vector_size); + pool.parallelize_loop( + 0, vector_size, + [&sum_multi, &vector_1, &vector_2](const size_t start, const size_t end) + { + for (size_t i = start; i < end; ++i) + sum_multi[i] = vector_1[i] + vector_2[i]; + }, + num_tasks) + .wait(); + bool vectors_equal = true; + for (size_t i = 0; i < vector_size; ++i) + vectors_equal = vectors_equal && (sum_single[i] == sum_multi[i]); + check(vectors_equal); +} + +/** + * @brief Check that parallelized vector operations work as expected by calculating the sum of two randomized vectors in two ways, single-threaded and multithreaded, and comparing the results. 
+ */ +void check_vectors() +{ + pool.reset(); + std::mt19937_64 mt(rd()); + std::uniform_int_distribution size_dist(0, 1000000); + std::uniform_int_distribution task_dist(1, pool.get_thread_count()); + for (size_t i = 0; i < 10; ++i) + check_vector_of_size(size_dist(mt), task_dist(mt)); +} + +// ================== +// Main test function +// ================== + +/** + * @brief Test that various aspects of the library are working as expected. + */ +void do_tests() +{ + print_header("Checking that the constructor works:"); + check_constructor(); + + print_header("Checking that reset() works:"); + check_reset(); + + print_header("Checking that push_task() works:"); + check_push_task(); + + print_header("Checking that submit() works:"); + check_submit(); + + print_header("Checking that wait_for_tasks() works..."); + check_wait_for_tasks(); + + print_header("Checking that parallelize_loop() works:"); + check_parallelize_loop(); + + print_header("Checking that task monitoring works:"); + check_task_monitoring(); + + print_header("Checking that pausing works:"); + check_pausing(); + + print_header("Checking that exception handling works:"); + check_exceptions(); + + print_header("Testing that vector operations produce the expected results:"); + check_vectors(); +} + +// ========================== +// Functions for benchmarking +// ========================== + +/** + * @brief Print the timing of a specific test. + * + * @param num_tasks The number of tasks. + * @param mean_sd An std::pair containing the mean as the first member and standard deviation as the second member. 
+ */ +void print_timing(const BS::concurrency_t num_tasks, const std::pair& mean_sd) +{ + if (num_tasks == 0) + dual_print("Single-threaded"); + else if (num_tasks == 1) + dual_print("With 1 task"); + else + dual_print("With ", std::setw(4), num_tasks, " tasks"); + dual_println(", mean execution time was ", std::setw(6), mean_sd.first, " ms with standard deviation ", std::setw(4), mean_sd.second, " ms."); +} + +/** + * @brief Calculate and print the speedup obtained by multithreading. + * + * @param timings A vector of the timings corresponding to different numbers of tasks. + */ +void print_speedup(const std::vector& timings, const BS::concurrency_t try_tasks[]) +{ + const std::vector::const_iterator min_el = std::min_element(std::begin(timings), std::end(timings)); + const double max_speedup = std::round(timings[0] / *min_el * 10) / 10; + const BS::concurrency_t num_tasks = try_tasks[min_el - std::begin(timings)]; + dual_println("Maximum speedup obtained by multithreading vs. single-threading: ", max_speedup, "x, using ", num_tasks, " tasks."); +} + +/** + * @brief Calculate the mean and standard deviation of a set of integers. + * + * @param timings The integers. + * @return An std::pair containing the mean as the first member and standard deviation as the second member. + */ +std::pair analyze(const std::vector& timings) +{ + double mean = 0; + for (size_t i = 0; i < timings.size(); ++i) + mean += static_cast(timings[i]) / static_cast(timings.size()); + double variance = 0; + for (size_t i = 0; i < timings.size(); ++i) + variance += (static_cast(timings[i]) - mean) * (static_cast(timings[i]) - mean) / static_cast(timings.size()); + const double sd = std::sqrt(variance); + return std::pair(mean, sd); +} + +/** + * @brief Generate a seed. The std::mt19937_64 in each task will be seeded using this function in order to avoid depleting the entropy of the random_device. + * + * @return A random unsigned 64-bit integer. 
+ */ +uint64_t generate_seed() +{ + static std::mt19937_64 mt(rd()); + return mt(); +} + +/** + * @brief Benchmark multithreaded performance by generating random vectors. + */ +void check_performance() +{ + // Reset the pool to ensure that we have a fresh start. + pool.reset(); + + // Set the formatting of floating point numbers. + dual_print(std::fixed, std::setprecision(1)); + + // Initialize a random distribution to randomize vectors with arbitrary floating point values. + const double range = std::sqrt(std::numeric_limits::max()); + std::uniform_real_distribution vector_dist(-range, range); + + // Initialize a timer object to measure execution time. + BS::timer tmr; + + // Store the number of available hardware threads for easy access. + const BS::concurrency_t thread_count = pool.get_thread_count(); + dual_println("Using ", thread_count, " threads."); + + // Define the number of tasks to try in each run of the test (0 = single-threaded). + const BS::concurrency_t try_tasks[] = {0, thread_count / 4, thread_count / 2, thread_count, thread_count * 2, thread_count * 4}; + + // The size of the vectors to use for the test. + constexpr size_t vector_size = 500; + + // How many times to repeat each run of the test in order to collect reliable statistics. + constexpr size_t repeat = 20; + dual_println("Each test will be repeated ", repeat, " times to collect reliable statistics."); + + // The target duration of the single-threaded test in milliseconds. The total time spent on the test in the single-threaded case will be approximately equal to repeat * target_ms. + constexpr std::chrono::milliseconds::rep target_ms = 300; + + // Vectors to store statistics. + std::vector different_n_timings; + std::vector same_n_timings; + + // Test how many vectors we need to generate to roughly achieve the target duration. 
+ size_t num_vectors = 1; + do + { + num_vectors *= 2; + std::vector> vectors(num_vectors, std::vector(vector_size)); + std::mt19937_64 test_mt(rd()); + tmr.start(); + for (size_t i = 0; i < num_vectors; ++i) + { + for (size_t j = 0; j < vector_size; ++j) + vectors[i][j] = vector_dist(test_mt); + } + tmr.stop(); + } while (tmr.ms() < target_ms); + num_vectors = static_cast(std::llround(static_cast(num_vectors) * static_cast(target_ms) / static_cast(tmr.ms()))); + + // Initialize the desired number of vectors. + std::vector> vectors(num_vectors, std::vector(vector_size)); + + // Perform the test. + dual_println("\nGenerating ", num_vectors, " random vectors with ", vector_size, " elements each:"); + for (BS::concurrency_t n : try_tasks) + { + for (size_t r = 0; r < repeat; ++r) + { + tmr.start(); + if (n > 1) + { + pool.parallelize_loop( + 0, num_vectors, + [&vector_dist, &vectors](const size_t start, const size_t end) + { + std::mt19937_64 multi_mt(generate_seed()); + for (size_t i = start; i < end; ++i) + { + for (size_t j = 0; j < vector_size; ++j) + vectors[i][j] = vector_dist(multi_mt); + } + }, + n) + .wait(); + } + else + { + std::mt19937_64 single_mt(generate_seed()); + for (size_t i = 0; i < num_vectors; ++i) + { + for (size_t j = 0; j < vector_size; ++j) + vectors[i][j] = vector_dist(single_mt); + } + } + tmr.stop(); + same_n_timings.push_back(tmr.ms()); + } + std::pair mean_sd = analyze(same_n_timings); + print_timing(n, mean_sd); + different_n_timings.push_back(mean_sd.first); + same_n_timings.clear(); + } + print_speedup(different_n_timings, try_tasks); +} + +int main() +{ + const std::string log_filename = "BS_thread_pool_test-" + get_time() + ".log"; + if (output_log) + log_file.open(log_filename); + + dual_println("A C++17 Thread Pool for High-Performance Scientific Computing"); + dual_println("(c) 2022 Barak Shoshany (baraksh@gmail.com) (http://baraksh.com)"); + dual_println("GitHub: https://github.com/bshoshany/thread-pool\n"); + + 
dual_println("Thread pool library version is ", BS_THREAD_POOL_VERSION, "."); + dual_println("Hardware concurrency is ", std::thread::hardware_concurrency(), "."); + if (output_log) + dual_println("Generating log file: ", log_filename, ".\n"); + + dual_println("Important: Please do not run any other applications, especially multithreaded applications, in parallel with this test!"); + + if (enable_tests) + do_tests(); + + if (tests_failed == 0) + { + if (enable_tests) + print_header("SUCCESS: Passed all " + std::to_string(tests_succeeded) + " checks!", '+'); + if (enable_benchmarks) + { + print_header("Performing benchmarks:"); + check_performance(); + print_header("Thread pool performance test completed!", '+'); + } + return EXIT_SUCCESS; + } + else + { + print_header("FAILURE: Passed " + std::to_string(tests_succeeded) + " checks, but failed " + std::to_string(tests_failed) + "!", '+'); + dual_println("\nPlease submit a bug report at https://github.com/bshoshany/thread-pool/issues including the exact specifications of your system (OS, CPU, compiler, etc.) 
and the generated log file."); + return EXIT_FAILURE; + } +} diff --git a/CHANGELOG.md b/CHANGELOG.md index c6bfaf4..c230097 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,56 +1,150 @@ -# A C++17 Thread Pool for High-Performance Scientific Computing +[![DOI:10.5281/zenodo.4742687](https://zenodo.org/badge/DOI/10.5281/zenodo.4742687.svg)](https://doi.org/10.5281/zenodo.4742687) +[![arXiv:2105.00613](https://img.shields.io/badge/arXiv-2105.00613-b31b1b.svg)](https://arxiv.org/abs/2105.00613) +[![License: MIT](https://img.shields.io/github/license/bshoshany/thread-pool)](https://github.com/bshoshany/thread-pool/blob/master/LICENSE.txt) +![Language: C++17](https://img.shields.io/badge/Language-C%2B%2B17-yellow) +![File size in bytes](https://img.shields.io/github/size/bshoshany/thread-pool/BS_thread_pool.hpp) +![GitHub last commit](https://img.shields.io/github/last-commit/bshoshany/thread-pool) +[![GitHub repo stars](https://img.shields.io/github/stars/bshoshany/thread-pool?style=social)](https://github.com/bshoshany/thread-pool) +[![Twitter @BarakShoshany](https://img.shields.io/twitter/follow/BarakShoshany?style=social)](https://twitter.com/BarakShoshany) +[![Open in Visual Studio Code](https://img.shields.io/badge/-Open%20in%20Visual%20Studio%20Code-007acc)](https://vscode.dev/github/bshoshany/thread-pool) + +# `BS::thread_pool`: a fast, lightweight, and easy-to-use C++17 thread pool library + +By Barak Shoshany ([baraksh@gmail.com](mailto:baraksh@gmail.com)) ([https://baraksh.com/](https://baraksh.com/)) + +* [Version history](#version-history) + * [v3.0.0 (2022-05-30)](#v300-2022-05-30) + * [v2.0.0 (2021-08-14)](#v200-2021-08-14) + * [v1.9 (2021-07-29)](#v19-2021-07-29) + * [v1.8 (2021-07-28)](#v18-2021-07-28) + * [v1.7 (2021-06-02)](#v17-2021-06-02) + * [v1.6 (2021-05-26)](#v16-2021-05-26) + * [v1.5 (2021-05-07)](#v15-2021-05-07) + * [v1.4 (2021-05-05)](#v14-2021-05-05) + * [v1.3 (2021-05-03)](#v13-2021-05-03) + * [v1.2 (2021-04-29)](#v12-2021-04-29) + * 
[v1.1 (2021-04-24)](#v11-2021-04-24) + * [v1.0 (2021-01-15)](#v10-2021-01-15) ## Version history -* v2.0.0 (2021-08-14) - * From now on, version numbers will adhere to the [Semantic Versioning](https://semver.org/) specification in the format **major.minor.patch**. - * A file named `thread_pool_test.cpp` has been added to the package. It will perform automated tests of all aspects of the package, and benchmark some multithreaded matrix operations. Please run it on your system and [submit a bug report](https://github.com/bshoshany/thread-pool/issues) if any of the tests fail. In addition, the code is thoroughly documented, and is meant to serve as an extensive example of how to properly use the package. - * The package is now available through [vcpkg](https://github.com/microsoft/vcpkg). Instructions for how to install it have been added to `README.md`. See [this pull request](https://github.com/bshoshany/thread-pool/pull/18). - * The package now defines a macro `THREAD_POOL_VERSION`, which returns the version number and release date of the thread pool library as a string. - * `parallelize_loop()` has undergone some major changes (and is now incompatible with v1.x): - * The second argument is now the index **after** the last index, instead of the last index itself. This is more consistent with C++ conventions (e.g. standard library algorithms) where the range is always `[first, last)`. For example, for an array with `n` indices, instead of `parallelize_loop(0, n - 1, ...)` you should now write `parallelize_loop(0, n, ...)`. - * The `loop` function is now only called once per block, instead of once per index, as was the case before. This should provide a performance boost due to significantly reducing the number of function calls, and it also allows you to conserve resources by using them only once per block instead of once per index (an example can be found in the `random_matrix_generator` class in `thread_pool_test.cpp`). 
It also means that `loop` now takes two arguments: the first index in the block and the index after the last index in the block. Thus, `loop(start, end)` should typically involve a loop of the form `for (T i = start; i < end; i++)`. - * The first and last indices can now be of two different integer types. Previously, `parallelize_loop(0, i, ...)` did not work if `i` was not an `int`, because `0` was interpreted as `int`, and the two arguments had to be of the same type. Therefore, one had to use casting, e.g. `parallelize_loop((size_t)0, i)`, to make it work. Now this is no longer necessary; the common type is inferred automatically using `std::common_type_t`. -* v1.9 (2021-07-29) - * Fixed a bug in `reset()` which caused it to create the wrong number of threads. -* v1.8 (2021-07-28) - * The version history has become too long to be included in `README.md`, so I moved it to a separate file, `CHANGELOG.md`. - * A button to open this repository directly in Visual Studio Code has been added to the badges in `README.md`. - * An internal variable named `promise` has been renamed to `task_promise` to avoid any potential errors in case the user invokes `using namespace std`. - * `submit()` now catches exceptions thrown by the submitted task and forwards them to the future. See [this issue](https://github.com/bshoshany/thread-pool/issues/14). - * Eliminated compiler warnings that appeared when using the `-Weffc++` flag in GCC. See [this pull request](https://github.com/bshoshany/thread-pool/pull/17). -* v1.7 (2021-06-02) - * Fixed a bug in `parallelize_loop()` which prevented it from actually running loops in parallel, see [this issue](https://github.com/bshoshany/thread-pool/issues/11). -* v1.6 (2021-05-26) - * Since MSVC does not interpret `and` as `&&` by default, the previous release did not compile with MSVC unless the `/permissive-` or `/Za` compiler flags were used. 
This has been fixed in this version, and the code now successfully compiles with GCC, Clang, and MSVC. See [this pull request](https://github.com/bshoshany/thread-pool/pull/10). -* v1.5 (2021-05-07) - * This library now has a DOI for citation purposes. Information on how to cite it in publications has been added to the source code and to `README.md`. - * Added GitHub badges to `README.md`. -* v1.4 (2021-05-05) - * Added three new public member functions to monitor the tasks submitted to the pool: - * `get_tasks_queued()` gets the number of tasks currently waiting in the queue to be executed by the threads. - * `get_tasks_running()` gets the number of tasks currently being executed by the threads. - * `get_tasks_total()` gets the total number of unfinished tasks - either still in the queue, or running in a thread. - * Note that `get_tasks_running() == get_tasks_total() - get_tasks_queued()`. - * Renamed the private member variable `tasks_waiting` to `tasks_total` to make its purpose clearer. - * Added an option to temporarily pause the workers: - * When public member variable `paused` is set to `true`, the workers temporarily stop popping new tasks out of the queue, although any tasks already executed will keep running until they are done. Set to `false` again to resume popping tasks. - * While the workers are paused, `wait_for_tasks()` will wait for the running tasks instead of all tasks (otherwise it would wait forever). - * By utilizing the new pausing mechanism, `reset()` can now change the number of threads on-the-fly while there are still tasks waiting in the queue. The new thread pool will resume executing tasks from the queue once it is created. - * `parallelize_loop()` and `wait_for_tasks()` now have the same behavior as the worker function with regards to waiting for tasks to complete. 
If the relevant tasks are not yet complete, then before checking again, they will sleep for `sleep_duration` microseconds, unless that variable is set to zero, in which case they will call `std::this_thread::yield()`. This should improve performance and reduce CPU usage. - * Merged [this commit](https://github.com/bshoshany/thread-pool/pull/8): Fixed weird error when using MSVC and including `windows.h`. - * The `README.md` file has been reorganized and expanded. -* v1.3 (2021-05-03) - * Fixed [this issue](https://github.com/bshoshany/thread-pool/issues/3): Removed `std::move` from the `return` statement in `push_task()`. This previously generated a `-Wpessimizing-move` warning in Clang. The assembly code generated by the compiler seems to be the same before and after this change, presumably because the compiler eliminates the `std::move` automatically, but this change gets rid of the Clang warning. - * Fixed [this issue](https://github.com/bshoshany/thread-pool/issues/5): Removed a debugging message printed to `std::cout`, which was left in the code by mistake. - * Fixed [this issue](https://github.com/bshoshany/thread-pool/issues/6): `parallelize_loop()` no longer sends references for the variables `start` and `stop` when calling `push_task()`, which may lead to undefined behavior. - * A companion paper is now published at arXiv:2105.00613, including additional information such as performance tests on systems with up to 80 hardware threads. The `README.md` has been updated, and it is now roughly identical in content to the paper. -* v1.2 (2021-04-29) - * The worker function, which controls the execution of tasks by each thread, now sleeps by default instead of yielding. Previously, when the worker could not find any tasks in the queue, it called `std::this_thread::yield()` and then tried again. However, this caused the workers to have high CPU usage when idle, [as reported by some users](https://github.com/bshoshany/thread-pool/issues/1). 
Now, when the worker function cannot find a task to run, it instead sleeps for a duration given by the public member variable `sleep_duration` (in microseconds) before checking the queue again. The default value is `1000` microseconds, which I found to be optimal in terms of both CPU usage and performance, but your own optimal value may be different. - * If the constructor is called with an argument of zero for the number of threads, then the default value, `std::thread::hardware_concurrency()`, is used instead. - * Added a simple helper class, `timer`, which can be used to measure execution time for benchmarking purposes. - * Improved and expanded the documentation. -* v1.1 (2021-04-24) - * Cosmetic changes only. Fixed a typo in the Doxygen comments and added a link to the GitHub repository. -* v1.0 (2021-01-15) - * Initial release. +### v3.0.0 (2022-05-30) + +* This is a major new release with many changes and improvements! Please note that code written using previous releases will need to be slightly modified to work with the new release. The changes needed to migrate to the new API are explicitly indicated below for your convenience. +* Breaking changes to the library header file: + * The header file has been renamed to `BS_thread_pool.hpp` to avoid potential conflict with other thread pool libraries. + * **API migration:** The library must now be included by invoking `#include "BS_thread_pool.hpp"`. + * All the definitions in the library, including the `thread_pool` class and the helper classes, are now located in the namespace `BS`. This namespace will also be used for my other C++ projects, and is intended to ensure consistency between my projects while avoiding potential name conflicts with other libraries. + * **API migration:** The thread pool class should now be invoked as `BS::thread_pool`. Alternatively, it is possible to employ `using BS::thread_pool` or even `using namespace BS` and then invoke `thread_pool` directly. 
Same for the `BS::synced_stream` and `BS::timer` helper classes. + * The macro `THREAD_POOL_VERSION`, which contains the version number and release date of the library, has been renamed to `BS_THREAD_POOL_VERSION` to avoid potential conflicts. + * **API migration:** The version must now be read from the macro `BS_THREAD_POOL_VERSION`. + * The public member `sleep_duration` has been removed. The thread pool now uses condition variables instead of sleep to facilitate waiting. This significantly improves performance (by 10%-50% in my testing), drastically decreases idle CPU utilization, and eliminates the need to set an optimal sleep time. This was a highly-requested change; see [issue #1](https://github.com/bshoshany/thread-pool/issues/1), [issue #12](https://github.com/bshoshany/thread-pool/issues/12), and [pull request #23](https://github.com/bshoshany/thread-pool/pull/23). + * **API migration:** Remove any code that relates to the public member `sleep_duration`. + * The template specializations for `submit()` have been merged. Now instead of two versions, one for functions with a return value and one for functions without a return value, there is just one version, which can accept any function. This makes the code more compact (and elegant). If a function with no return value is submitted, an `std::future<void>` is returned (the previous version returned an `std::future<bool>`). + * **API migration:** To wait for a task with no return value, simply call `wait()` or `get()` on the corresponding `std::future<void>`. + * `parallelize_loop()` now returns a future in the form of a new `BS::multi_future` helper class template. The member function `wait()` of this future allows waiting until all of the loop's blocks finish executing. In previous versions, calling `parallelize_loop()` both parallelized the loop and waited for the blocks to finish; now it is possible to do other stuff while the loop executes. 
+ * **API migration:** Since `parallelize_loop()` no longer automatically blocks, you should either store the result in a `BS::multi_future` object and call its `wait()` member function, or simply call `parallelize_loop().wait()` to reproduce the old behavior. +* Non-breaking changes to the library header file: + * It is now possible to use `parallelize_loop()` with functions that have return values and get these values from all blocks at once through the `get()` member function of the `BS::multi_future`. + * The template specializations for `push_task()` have been merged. Now instead of two versions, one for functions with arguments and one for functions without arguments, there is just one version, which can accept any function. + * Constructors have been made `explicit`. See [issue #28](https://github.com/bshoshany/thread-pool/issues/28). + * `submit()` now uses `std::make_shared` instead of `new` to create the shared pointer. This means only one memory allocation is performed instead of two, which should improve performance. In addition, all unique pointers are now created using `std::make_unique`. + * A new helper class template, `BS::multi_future`, has been added. It's basically just a wrapper around `std::vector<std::future<T>>`. This class is used by the new implementation of `parallelize_loop()` to allow waiting for the entire loop, consisting of multiple tasks with their corresponding futures, to finish executing. + * `BS::multi_future` can also be used independently to handle multiple futures at once. For example, you can now keep track of several groups of tasks by storing their futures inside separate `BS::multi_future` objects and use either `wait()` to wait for all tasks in a specific group to finish or `get()` to get an `std::vector` with the return values of every task in the group. + * Integer types are now chosen in a smarter way to improve portability, allow for better compatibility with 32-bit systems, and prevent potential conversion errors. 
+ * Added a new type, `BS::concurrency_t`, equal to the return type of `std::thread::hardware_concurrency()`. This is probably pointless, since the C++ standard requires this to be `unsigned int`, but it seems to me to make the code slightly more portable, in case some non-conforming compiler chooses to use a different integer type. + * C-style casts have been converted to C++ cast expressions for added clarity. + * Miscellaneous minor optimizations and style improvements. +* Changes to the test program: + * The program has been renamed to `BS_thread_pool_test.cpp` to avoid potential conflict with other thread pool libraries. + * The program now returns `EXIT_FAILURE` if any of the tests failed, for automation purposes. See [pull request #42](https://github.com/bshoshany/thread-pool/pull/42). + * Fixed incorrect check order in `check_task_monitoring()`. See [pull request #43](https://github.com/bshoshany/thread-pool/pull/43). + * Added a new test for `parallelize_loop()` with a return value. + * Improved some of the tests to make them more reliable. For example, `count_unique_threads()` now uses futures (stored in a `BS::multi_future` object). + * The program now uses `std::vector` instead of matrices, for both consistency checks and benchmarks, in order to simplify the code and considerably reduce its length. + * The benchmarks have been simplified. There's now only one test: filling a specific number of vectors of fixed size with random values. This may be replaced with something more practical in a future release, but at least on the systems I've tested on, it does demonstrate a very significant multi-threading speedup. + * In addition to multi-threaded tests with different numbers of tasks, the benchmark now also includes a single-threaded test. This allows for more accurate benchmarks compared to previous versions, as the (slight) parallelization overhead is now taken into account when calculating the maximum speedup.
+ * The program decides how many vectors to use for benchmarking by testing how many are needed to reach a target duration in the single-threaded test. This ensures that the test takes approximately the same amount of time on different systems, and is thus more consistent and portable. + * Miscellaneous minor optimizations and style improvements. +* Changes to `README.md`: + * Many sections have been rewritten and/or polished. + * Explanations and examples of all the new features have been added. + * Added an acknowledgements section. +* Miscellaneous changes: + * Added a `CITATION.bib` file (in BibTeX format) to the GitHub repository. You can use it to easily cite this package if you use it in any research papers. + * Added a `CITATION.cff` file (in YAML format) to the GitHub repository. This should add [an option to get a citation in different formats](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files) directly from GitHub repository by clicking on "cite this repository" on the sidebar to the right. + * Added templates for GitHub issues and pull requests. + +### v2.0.0 (2021-08-14) + +* From now on, version numbers will adhere to the [Semantic Versioning](https://semver.org/) specification in the format **major.minor.patch**. +* A file named `thread_pool_test.cpp` has been added to the package. It will perform automated tests of all aspects of the package, and benchmark some multithreaded matrix operations. Please run it on your system and [submit a bug report](https://github.com/bshoshany/thread-pool/issues) if any of the tests fail. In addition, the code is thoroughly documented, and is meant to serve as an extensive example of how to properly use the package. +* The package is now available through [vcpkg](https://github.com/microsoft/vcpkg). Instructions for how to install it have been added to `README.md`. 
See [this pull request](https://github.com/bshoshany/thread-pool/pull/18). +* The package now defines a macro `THREAD_POOL_VERSION`, which returns the version number and release date of the thread pool library as a string. +* `parallelize_loop()` has undergone some major changes (and is now incompatible with v1.x): + * The second argument is now the index **after** the last index, instead of the last index itself. This is more consistent with C++ conventions (e.g. standard library algorithms) where the range is always `[first, last)`. For example, for an array with `n` indices, instead of `parallelize_loop(0, n - 1, ...)` you should now write `parallelize_loop(0, n, ...)`. + * The `loop` function is now only called once per block, instead of once per index, as was the case before. This should provide a performance boost due to significantly reducing the number of function calls, and it also allows you to conserve resources by using them only once per block instead of once per index (an example can be found in the `random_matrix_generator` class in `thread_pool_test.cpp`). It also means that `loop` now takes two arguments: the first index in the block and the index after the last index in the block. Thus, `loop(start, end)` should typically involve a loop of the form `for (T i = start; i < end; i++)`. + * The first and last indices can now be of two different integer types. Previously, `parallelize_loop(0, i, ...)` did not work if `i` was not an `int`, because `0` was interpreted as `int`, and the two arguments had to be of the same type. Therefore, one had to use casting, e.g. `parallelize_loop((size_t)0, i)`, to make it work. Now this is no longer necessary; the common type is inferred automatically using `std::common_type_t`. + +### v1.9 (2021-07-29) + +* Fixed a bug in `reset()` which caused it to create the wrong number of threads. 
+ +### v1.8 (2021-07-28) + +* The version history has become too long to be included in `README.md`, so I moved it to a separate file, `CHANGELOG.md`. +* A button to open this repository directly in Visual Studio Code has been added to the badges in `README.md`. +* An internal variable named `promise` has been renamed to `task_promise` to avoid any potential errors in case the user invokes `using namespace std`. +* `submit()` now catches exceptions thrown by the submitted task and forwards them to the future. See [this issue](https://github.com/bshoshany/thread-pool/issues/14). +* Eliminated compiler warnings that appeared when using the `-Weffc++` flag in GCC. See [this pull request](https://github.com/bshoshany/thread-pool/pull/17). + +### v1.7 (2021-06-02) + +* Fixed a bug in `parallelize_loop()` which prevented it from actually running loops in parallel, see [this issue](https://github.com/bshoshany/thread-pool/issues/11). + +### v1.6 (2021-05-26) + +* Since MSVC does not interpret `and` as `&&` by default, the previous release did not compile with MSVC unless the `/permissive-` or `/Za` compiler flags were used. This has been fixed in this version, and the code now successfully compiles with GCC, Clang, and MSVC. See [this pull request](https://github.com/bshoshany/thread-pool/pull/10). + +### v1.5 (2021-05-07) + +* This library now has a DOI for citation purposes. Information on how to cite it in publications has been added to the source code and to `README.md`. +* Added GitHub badges to `README.md`. + +### v1.4 (2021-05-05) + +* Added three new public member functions to monitor the tasks submitted to the pool: + * `get_tasks_queued()` gets the number of tasks currently waiting in the queue to be executed by the threads. + * `get_tasks_running()` gets the number of tasks currently being executed by the threads. + * `get_tasks_total()` gets the total number of unfinished tasks - either still in the queue, or running in a thread. 
+ * Note that `get_tasks_running() == get_tasks_total() - get_tasks_queued()`. + * Renamed the private member variable `tasks_waiting` to `tasks_total` to make its purpose clearer. +* Added an option to temporarily pause the workers: + * When public member variable `paused` is set to `true`, the workers temporarily stop popping new tasks out of the queue, although any tasks already executed will keep running until they are done. Set to `false` again to resume popping tasks. + * While the workers are paused, `wait_for_tasks()` will wait for the running tasks instead of all tasks (otherwise it would wait forever). + * By utilizing the new pausing mechanism, `reset()` can now change the number of threads on-the-fly while there are still tasks waiting in the queue. The new thread pool will resume executing tasks from the queue once it is created. +* `parallelize_loop()` and `wait_for_tasks()` now have the same behavior as the worker function with regards to waiting for tasks to complete. If the relevant tasks are not yet complete, then before checking again, they will sleep for `sleep_duration` microseconds, unless that variable is set to zero, in which case they will call `std::this_thread::yield()`. This should improve performance and reduce CPU usage. +* Merged [this commit](https://github.com/bshoshany/thread-pool/pull/8): Fixed weird error when using MSVC and including `windows.h`. +* The `README.md` file has been reorganized and expanded. + +### v1.3 (2021-05-03) + +* Fixed [this issue](https://github.com/bshoshany/thread-pool/issues/3): Removed `std::move` from the `return` statement in `push_task()`. This previously generated a `-Wpessimizing-move` warning in Clang. The assembly code generated by the compiler seems to be the same before and after this change, presumably because the compiler eliminates the `std::move` automatically, but this change gets rid of the Clang warning. 
+* Fixed [this issue](https://github.com/bshoshany/thread-pool/issues/5): Removed a debugging message printed to `std::cout`, which was left in the code by mistake. +* Fixed [this issue](https://github.com/bshoshany/thread-pool/issues/6): `parallelize_loop()` no longer sends references for the variables `start` and `stop` when calling `push_task()`, which may lead to undefined behavior. +* A companion paper is now published at arXiv:2105.00613, including additional information such as performance tests on systems with up to 80 hardware threads. The `README.md` has been updated, and it is now roughly identical in content to the paper. + +### v1.2 (2021-04-29) + +* The worker function, which controls the execution of tasks by each thread, now sleeps by default instead of yielding. Previously, when the worker could not find any tasks in the queue, it called `std::this_thread::yield()` and then tried again. However, this caused the workers to have high CPU usage when idle, [as reported by some users](https://github.com/bshoshany/thread-pool/issues/1). Now, when the worker function cannot find a task to run, it instead sleeps for a duration given by the public member variable `sleep_duration` (in microseconds) before checking the queue again. The default value is `1000` microseconds, which I found to be optimal in terms of both CPU usage and performance, but your own optimal value may be different. +* If the constructor is called with an argument of zero for the number of threads, then the default value, `std::thread::hardware_concurrency()`, is used instead. +* Added a simple helper class, `timer`, which can be used to measure execution time for benchmarking purposes. +* Improved and expanded the documentation. + +### v1.1 (2021-04-24) + +* Cosmetic changes only. Fixed a typo in the Doxygen comments and added a link to the GitHub repository. + +### v1.0 (2021-01-15) + +* Initial release. 
diff --git a/CITATION.bib b/CITATION.bib new file mode 100644 index 0000000..51406fd --- /dev/null +++ b/CITATION.bib @@ -0,0 +1,13 @@ +@article{Shoshany2021_ThreadPool, + archiveprefix = {arXiv}, + author = {Barak Shoshany}, + doi = {10.5281/zenodo.4742687}, + eid = {arXiv:2105.00613}, + eprint = {2105.00613}, + journal = {arXiv e-prints}, + keywords = {Computer Science - Distributed, Parallel, and Cluster Computing, D.1.3, D.1.5}, + month = {May}, + primaryclass = {cs.DC}, + title = {{A C++17 Thread Pool for High-Performance Scientific Computing}}, + year = {2021} +} diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..99bb729 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,24 @@ +--- +authors: + - family-names: "Shoshany" + given-names: "Barak" + orcid: "https://orcid.org/0000-0003-2222-127X" +cff-version: "1.2.0" +date-released: "2021-05-03" +doi: "10.5281/zenodo.4742687" +license: "MIT" +message: "If you use this package in published research, please cite it as follows." +repository-code: "https://github.com/bshoshany/thread-pool" +title: "A C++17 Thread Pool for High-Performance Scientific Computing" +preferred-citation: + type: "article" + authors: + - family-names: "Shoshany" + given-names: "Barak" + orcid: "https://orcid.org/0000-0003-2222-127X" + doi: "10.5281/zenodo.4742687" + journal: "arXiv" + month: 5 + title: "A C++17 Thread Pool for High-Performance Scientific Computing" + url: "https://arxiv.org/abs/2105.00613" + year: 2021 diff --git a/LICENSE.txt b/LICENSE.txt index a8684f1..63a12c8 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 Barak Shoshany +Copyright (c) 2022 Barak Shoshany Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. diff --git a/README.md b/README.md index ed1f561..e18b3eb 100644 --- a/README.md +++ b/README.md @@ -2,29 +2,25 @@ [![arXiv:2105.00613](https://img.shields.io/badge/arXiv-2105.00613-b31b1b.svg)](https://arxiv.org/abs/2105.00613) [![License: MIT](https://img.shields.io/github/license/bshoshany/thread-pool)](https://github.com/bshoshany/thread-pool/blob/master/LICENSE.txt) ![Language: C++17](https://img.shields.io/badge/Language-C%2B%2B17-yellow) -![File size in bytes](https://img.shields.io/github/size/bshoshany/thread-pool/thread_pool.hpp) +![File size in bytes](https://img.shields.io/github/size/bshoshany/thread-pool/BS_thread_pool.hpp) ![GitHub last commit](https://img.shields.io/github/last-commit/bshoshany/thread-pool) [![GitHub repo stars](https://img.shields.io/github/stars/bshoshany/thread-pool?style=social)](https://github.com/bshoshany/thread-pool) [![Twitter @BarakShoshany](https://img.shields.io/twitter/follow/BarakShoshany?style=social)](https://twitter.com/BarakShoshany) -[![Open in Visual Studio Code](https://open.vscode.dev/badges/open-in-vscode.svg)](https://open.vscode.dev/bshoshany/thread-pool) +[![Open in Visual Studio Code](https://img.shields.io/badge/-Open%20in%20Visual%20Studio%20Code-007acc)](https://vscode.dev/github/bshoshany/thread-pool) -# A C++17 Thread Pool for High-Performance Scientific Computing +# `BS::thread_pool`: a fast, lightweight, and easy-to-use C++17 thread pool library -**Barak Shoshany**\ -Department of Physics, Brock University,\ -1812 Sir Isaac Brock Way, St. 
Catharines, Ontario, L2S 3A1, Canada\ -[bshoshany@brocku.ca](mailto:bshoshany@brocku.ca) | [https://baraksh.com/](https://baraksh.com/)\ -Companion paper: [arXiv:2105.00613](https://arxiv.org/abs/2105.00613)\ -DOI: [doi:10.5281/zenodo.4742687](https://doi.org/10.5281/zenodo.4742687) +Documentation for v3.0.0 (2022-05-30) + +By Barak Shoshany ([baraksh@gmail.com](mailto:baraksh@gmail.com)) ([https://baraksh.com/](https://baraksh.com/)) -* [Abstract](#abstract) * [Introduction](#introduction) * [Motivation](#motivation) * [Overview of features](#overview-of-features) * [Compiling and compatibility](#compiling-and-compatibility) + * [Installing using vcpkg](#installing-using-vcpkg) * [Getting started](#getting-started) * [Including the library](#including-the-library) - * [Installing using vcpkg](#installing-using-vcpkg) * [Constructors](#constructors) * [Getting and resetting the number of threads in the pool](#getting-and-resetting-the-number-of-threads-in-the-pool) * [Finding the version of the package](#finding-the-version-of-the-package) @@ -33,40 +29,37 @@ DOI: [doi:10.5281/zenodo.4742687](https://doi.org/10.5281/zenodo.4742687) * [Submitting tasks to the queue without futures](#submitting-tasks-to-the-queue-without-futures) * [Manually waiting for all tasks to complete](#manually-waiting-for-all-tasks-to-complete) * [Parallelizing loops](#parallelizing-loops) + * [Loops with return values](#loops-with-return-values) * [Helper classes](#helper-classes) + * [Handling multiple futures at once](#handling-multiple-futures-at-once) * [Synchronizing printing to an output stream](#synchronizing-printing-to-an-output-stream) * [Measuring execution time](#measuring-execution-time) * [Other features](#other-features) - * [Setting the worker function's sleep duration](#setting-the-worker-functions-sleep-duration) * [Monitoring the tasks](#monitoring-the-tasks) * [Pausing the workers](#pausing-the-workers) * [Exception handling](#exception-handling) * [Testing the 
package](#testing-the-package) * [Automated tests](#automated-tests) * [Performance tests](#performance-tests) - * [Dual Intel Xeon Gold 6148 (80 threads)](#dual-intel-xeon-gold-6148-80-threads) * [Issue and pull request policy](#issue-and-pull-request-policy) +* [Acknowledgements](#acknowledgements) * [Copyright and citing](#copyright-and-citing) -## Abstract - -We present a modern C++17-compatible thread pool implementation, built from scratch with high-performance scientific computing in mind. The thread pool is implemented as a single lightweight and self-contained class, and does not have any dependencies other than the C++17 standard library, thus allowing a great degree of portability. In particular, our implementation does not utilize OpenMP or any other high-level multithreading APIs, and thus gives the programmer precise low-level control over the details of the parallelization, which permits more robust optimizations. The thread pool was extensively tested on both AMD and Intel CPUs with up to 40 cores and 80 threads. This paper provides motivation, detailed usage instructions, and performance tests. The code is freely available in the [GitHub repository](https://github.com/bshoshany/thread-pool). This `README.md` file contains roughly the same content as the [companion paper](https://arxiv.org/abs/2105.00613). - ## Introduction ### Motivation Multithreading is essential for modern high-performance computing. Since C++11, the C++ standard library has included built-in low-level multithreading support using constructs such as `std::thread`. However, `std::thread` creates a new thread each time it is called, which can have a significant performance overhead. Furthermore, it is possible to create more threads than the hardware can handle simultaneously, potentially resulting in a substantial slowdown. 
-This library contains a thread pool class, `thread_pool`, which avoids these issues by creating a fixed pool of threads once and for all, and then reusing the same threads to perform different tasks throughout the lifetime of the pool. By default, the number of threads in the pool is equal to the maximum number of threads that the hardware can run in parallel. +The library presented here contains a thread pool class, `BS::thread_pool`, which avoids these issues by creating a fixed pool of threads once and for all, and then continuously reusing the same threads to perform different tasks throughout the lifetime of the program. By default, the number of threads in the pool is equal to the maximum number of threads that the hardware can run in parallel. -The user submits tasks to be executed into a queue. Whenever a thread becomes available, it pops a task from the queue and executes it. Each task is automatically assigned an `std::future`, which can be used to wait for the task to finish executing and/or obtain its eventual return value. +The user submits tasks to be executed into a queue. Whenever a thread becomes available, it retrieves the next task from the queue and executes it. The pool automatically produces an `std::future` for each task, which allows the user to wait for the task to finish executing and/or obtain its eventual return value, if applicable. Threads and tasks are autonomously managed by the pool in the background, without requiring any input from the user aside from submitting the desired tasks. -In addition to `std::thread`, the C++ standard library also offers the higher-level construct `std::async`, which may internally utilize a thread pool - but this is not guaranteed, and in fact, currently only the MSVC implementation of `std::async` uses a thread pool, while GCC and Clang do not. Using our custom-made thread pool class instead of `std::async` allows the user more control, transparency, and portability. 
+The design of this package was guided by four important principles. First, *compactness*: the entire library consists of just one small self-contained header file, with no other components or dependencies. Second, *portability*: the package only utilizes the C++17 standard library, without relying on any compiler extensions or 3rd-party libraries, and is therefore compatible with any modern standards-conforming C++17 compiler on any platform. Third, *ease of use*: the package is extensively documented, and programmers of any level should be able to use it right out of the box. -High-level multithreading APIs, such as OpenMP, allow simple one-line automatic parallelization of C++ code, but they do not give the user precise low-level control over the details of the parallelization. The thread pool class presented here allows the programmer to perform and manage the parallelization at the lowest level, and thus permits more robust optimizations, which can be used to achieve considerably higher performance. +The fourth and final guiding principle is *performance*: each and every line of code in this library was carefully designed with maximum performance in mind, and performance was tested and verified on a variety of compilers and platforms. Indeed, the library was originally designed for use in the author's own computationally-intensive scientific computing projects, running both on high-end desktop/laptop computers and high-performance computing nodes. -As demonstrated in the performance tests [below](#performance-tests), using our thread pool class we were able to saturate the upper bound of expected speedup for matrix multiplication and generation of random matrices. These performance tests were performed on 12-core / 24-thread and 40-core / 80-thread systems using GCC on Linux. +Other, more advanced multithreading libraries may offer more features and/or higher performance. 
However, they typically consist of a vast codebase with multiple components and dependencies, and involve complex APIs that require a substantial time investment to learn. This library is not intended to replace these more advanced libraries; instead, it was designed for users who don't require very advanced features, and prefer a simple and lightweight package that is easy to learn and use and can be readily incorporated into existing or new projects. ### Overview of features @@ -77,68 +70,61 @@ As demonstrated in the performance tests [below](#performance-tests), using our * Reusing threads avoids the overhead of creating and destroying them for individual tasks. * A task queue ensures that there are never more threads running in parallel than allowed by the hardware. * **Lightweight:** - * Only ~180 lines of code, excluding comments, blank lines, and the two optional helper classes. - * Single header file: simply `#include "thread_pool.hpp"`. + * Only ~190 lines of code, excluding comments, blank lines, and the two optional helper classes. + * Single header file: simply `#include "BS_thread_pool.hpp"` and you're all set! * Header-only: no need to install or build the library. - * Self-contained: no external requirements or dependencies. Does not require OpenMP or any other multithreading APIs. Only uses the C++ standard library, and works with any C++17-compliant compiler. + * Self-contained: no external requirements or dependencies. + * Portable: uses only the C++ standard library, and works with any C++17-compliant compiler. * **Easy to use:** * Very simple operation, using a handful of member functions. - * Every task submitted to the queue automatically generates an `std::future`, which can be used to wait for the task to finish executing and/or obtain its eventual return value. - * Optionally, tasks may also be submitted without generating a future, sacrificing convenience for greater performance. 
+ * Every task submitted to the queue using the `submit()` member function automatically generates an `std::future`, which can be used to wait for the task to finish executing and/or obtain its eventual return value. + * Optionally, tasks may also be submitted using the `push_task()` member function without generating a future, sacrificing convenience for even greater performance. * The code is thoroughly documented using Doxygen comments - not only the interface, but also the implementation, in case the user would like to make modifications. - * The included test program `thread_pool_test.cpp` can be used to perform comprehensive automated tests and benchmarks, and also serves as an extensive example of how to properly use the package. + * The included test program `BS_thread_pool_test.cpp` can be used to perform exhaustive automated tests and benchmarks, and also serves as a comprehensive example of how to properly use the package. +* **Helper classes:** + * Automatically parallelize a loop into any number of parallel tasks using the `parallelize_loop()` member function, and track its execution using the `BS::multi_future` helper class. + * Synchronize output to a stream from multiple threads in parallel using the `BS::synced_stream` helper class. + * Easily measure execution time for benchmarking purposes using the `BS::timer` helper class. * **Additional features:** - * Automatically parallelize a loop into any number of parallel tasks. - * Easily wait for all tasks in the queue to complete. - * Change the number of threads in the pool safely and on-the-fly as needed. - * Fine-tune the sleep duration of each thread's worker function for optimal performance. - * Monitor the number of queued and/or running tasks. - * Pause and resume popping new tasks out of the queue. + * Easily wait for all tasks in the queue to complete using the `wait_for_tasks()` member function. 
+ * Change the number of threads in the pool safely and on-the-fly as needed using the `reset()` member function. + * Monitor the number of queued and/or running tasks using the `get_tasks_queued()`, `get_tasks_running()`, and `get_tasks_total()` member functions. + * Freely pause and resume the pool by modifying the `paused` member variable. When paused, threads do not retrieve new tasks out of the queue. * Catch exceptions thrown by the submitted tasks. - * Synchronize output to a stream from multiple threads in parallel using the `synced_stream` helper class. - * Easily measure execution time for benchmarking purposes using the `timer` helper class. * Under continuous and active development. Bug reports and feature requests are welcome, and should be made via [GitHub issues](https://github.com/bshoshany/thread-pool/issues). ### Compiling and compatibility -This library should successfully compile on any C++17 standard-compliant compiler, on all operating systems and architectures for which such a compiler is available. Compatibility was verified with a 12-core / 24-thread AMD Ryzen 9 3900X CPU at 3.8 GHz using the following compilers and platforms: +This library should successfully compile on any C++17 standard-compliant compiler, on all operating systems and architectures for which such a compiler is available. 
Compatibility was verified with a 12-core / 24-thread AMD Ryzen 9 3900X CPU using the following compilers and platforms: -* Windows 10 build 19043.1165: - * [GCC](https://gcc.gnu.org/) v11.2.0 ([WinLibs build](https://winlibs.com/)) - * [Clang](https://clang.llvm.org/) v12.0.1 - * [MSVC](https://docs.microsoft.com/en-us/cpp/) v19.29.30133 -* Ubuntu 21.04: - * [GCC](https://gcc.gnu.org/) v11.1.0 - * [Clang](https://clang.llvm.org/) v12.0.0 +* Windows 11 build 22000.675: + * [GCC](https://gcc.gnu.org/) v12.1.0 ([WinLibs build](https://winlibs.com/)) + * [Clang](https://clang.llvm.org/) v14.0.4 + * [Intel oneAPI C++ Compiler](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/dpc-compiler.html) v2022.1.0 + * [MSVC](https://docs.microsoft.com/en-us/cpp/) v19.32.31329 +* Ubuntu 22.04 LTS: + * [GCC](https://gcc.gnu.org/) v12.0.1 + * [Clang](https://clang.llvm.org/) v14.0.0 -In addition, this library was tested on a [Compute Canada](https://www.computecanada.ca/) node equipped with two 20-core / 40-thread Intel Xeon Gold 6148 CPUs at 2.4 GHz (for a total of 40 cores and 80 threads), running CentOS Linux 7.6.1810, using the following compilers: +In addition, this library was tested on a [Compute Canada](https://www.computecanada.ca/) node equipped with two 20-core / 40-thread Intel Xeon Gold 6148 CPUs (for a total of 40 cores and 80 threads), running CentOS Linux 7.9.2009, using [GCC](https://gcc.gnu.org/) v12.1.1. -* [GCC](https://gcc.gnu.org/) v9.4.0 -* [Intel C++ Compiler (ICC)](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/dpc-compiler.html) v19.1.3.304 - -The test program `thread_pool_test.cpp` was compiled without warnings (with the warning flags `-Wall -Wpedantic -Wextra -Wconversion -Weffc++` in GCC/Clang and `/W4` in MSVC), executed, and successfully completed all [automated tests](#testing-the-package) using all of the compilers and systems mentioned above. 
+The test program `BS_thread_pool_test.cpp` was compiled without warnings (with the warning flags `-Wall -Wextra -Wconversion -Wsign-conversion -Wpedantic -Weffc++ -Wshadow` in GCC/Clang and `/W4` in MSVC), executed, and successfully completed all [automated tests](#testing-the-package) and benchmarks using all of the compilers and systems mentioned above. As this library requires C++17 features, the code must be compiled with C++17 support: -* For GCC, Clang, or ICC, use the `-std=c++17` flag. On Linux, you will also need to use the `-pthread` flag to enable the POSIX threads library. -* For MSVC, use `/std:c++17`. +* For GCC or Clang, use the `-std=c++17` flag. On Linux, you will also need to use the `-pthread` flag to enable the POSIX threads library. +* For Intel, use `-std=c++17` on Linux or `/Qstd:c++17` on Windows. +* For MSVC, use `/std:c++17`, and preferably also `/permissive-` to ensure standards conformance. For maximum performance, it is recommended to compile with all available compiler optimizations: -* For GCC, Clang, or ICC, use the `-O3` flag. +* For GCC or Clang, use the `-O3` flag. +* For Intel, use `-O3` on Linux or `/O3` on Windows. * For MSVC, use `/O2`. -## Getting started - -### Including the library - -To use the thread pool library, simply download the [latest release](https://github.com/bshoshany/thread-pool/releases) from the GitHub repository, place the single header file `thread_pool.hpp` in the desired folder, and include it in your program: - -```cpp -#include "thread_pool.hpp" -``` - -The thread pool will now be accessible via the `thread_pool` class. 
+As an example, to compile the test program `BS_thread_pool_test.cpp` with warnings and optimizations, it is recommended to use the following commands: + * On Windows with MSVC: `cl BS_thread_pool_test.cpp /std:c++17 /permissive- /O2 /W4 /EHsc /Fe:BS_thread_pool_test.exe` + * On Linux with GCC: `g++ BS_thread_pool_test.cpp -std=c++17 -O3 -Wall -Wextra -Wconversion -Wsign-conversion -Wpedantic -Weffc++ -Wshadow -pthread -o BS_thread_pool_test` ### Installing using vcpkg @@ -156,24 +142,36 @@ On Windows: .\vcpkg install bshoshany-thread-pool:x86-windows bshoshany-thread-pool:x64-windows ``` -The thread pool will then be available automatically in the build system you integrated vcpkg with (e.g. MSBuild or CMake). Simply write `#include "thread_pool.hpp"` in any project to use the thread pool, without having to copy to file into the project first. I will update the vcpkg port with each new release, so it will be updated automatically when you run `vcpkg upgrade`. +The thread pool will then be available automatically in the build system you integrated vcpkg with (e.g. MSBuild or CMake). Simply write `#include "BS_thread_pool.hpp"` in any project to use the thread pool, without having to copy the file into the project first. I will update the vcpkg port with each new release, so it will be updated automatically when you run `vcpkg upgrade`. Please see the [vcpkg repository](https://github.com/microsoft/vcpkg) for more information on how to use vcpkg. +## Getting started + +### Including the library + +If you are not using a C++ library manager (such as vcpkg), simply download the [latest release](https://github.com/bshoshany/thread-pool/releases) from the GitHub repository, place the single header file `BS_thread_pool.hpp` in the desired folder, and include it in your program: + +```cpp +#include "BS_thread_pool.hpp" +``` + +The thread pool will now be accessible via the `BS::thread_pool` class.
+ ### Constructors -The default constructor creates a thread pool with as many threads as the hardware can handle concurrently, as reported by the implementation via `std::thread::hardware_concurrency()`. With a hyperthreaded CPU, this will be twice the number of CPU cores. This is probably the constructor you want to use. For example: +The default constructor creates a thread pool with as many threads as the hardware can handle concurrently, as reported by the implementation via `std::thread::hardware_concurrency()`. This is usually determined by the number of cores in the CPU. If a core is hyperthreaded, it will count as two threads. For example: ```cpp // Constructs a thread pool with as many threads as available in the hardware. -thread_pool pool; +BS::thread_pool pool; ``` Optionally, a number of threads different from the hardware concurrency can be specified as an argument to the constructor. However, note that adding more threads than the hardware can handle will **not** improve performance, and in fact will most likely hinder it. This option exists in order to allow using **less** threads than the hardware concurrency, in cases where you wish to leave some threads available for other processes. For example: ```cpp // Constructs a thread pool with only 12 threads. -thread_pool pool(12); +BS::thread_pool pool(12); ``` If your program's main thread only submits tasks to the thread pool and waits for them to finish, and does not perform any computationally intensive tasks on its own, then it is recommended to use the default value for the number of threads. This ensures that all of the threads available in the hardware will be put to work while the main thread waits. @@ -190,18 +188,20 @@ It is generally unnecessary to change the number of threads in the pool after it ### Finding the version of the package -If desired, the version of this package may be read during compilation time from the macro `THREAD_POOL_VERSION`. 
The value will be a string containing the version number and release date. For example: +If desired, the version of this package may be read during compilation time from the macro `BS_THREAD_POOL_VERSION`. The value will be a string containing the version number and release date. For example: ```cpp -std::cout << "Thread pool library version is " << THREAD_POOL_VERSION << ".\n"; +std::cout << "Thread pool library version is " << BS_THREAD_POOL_VERSION << ".\n"; ``` Sample output: ```none -Thread pool library version is v2.0.0 (2021-08-14). +Thread pool library version is v3.0.0 (2022-05-30). ``` +This can be used, for example, to allow the same code to work with several incompatible versions of the library. + ## Submitting and waiting for tasks ### Submitting tasks to the queue with futures @@ -219,12 +219,11 @@ auto my_future = pool.submit(task, arg); auto my_future = pool.submit(task, arg1, arg2); ``` -Using `auto` for the return value of `submit()` is recommended, since it means the compiler will automatically detect which instance of the template `std::future` to use. The value of the future depends on whether the function has a return value or not: +If the submitted function has a return value of type `T`, then the future will be of type `std::future<T>`, and will be set to the return value when the function finishes its execution. If the submitted function does not have a return value, then the future will be an `std::future<void>`, which will not return any value but may still be used to wait for the function to finish. -* If the submitted function has a return value, then the future will be set to that value when the function finishes its execution. -* If the submitted function does not have a return value, then the future will be a `bool` that will be set to `true` when the function finishes its execution. +Using `auto` for the return value of `submit()` means the compiler will automatically detect which instance of the template `std::future` to use. 
However, specifying the particular type `std::future`, as in the examples below, is recommended for increased readability. -To wait until the future's value becomes available, use the member function `wait()`. To obtain the value itself, use the member function `get()`, which will also automatically wait for the future if it's not ready yet. For example: +To wait until the task finishes, use the member function `wait()` of the future. To obtain the return value, use the member function `get()`, which will also automatically wait for the task to finish if it hasn't yet. For example: ```cpp // Submit a task and get a future. @@ -235,9 +234,80 @@ do_stuff(); auto my_return_value = my_future.get(); ``` +Here are some more concrete examples. The following program will print out `42`: + +```cpp +#include "BS_thread_pool.hpp" + +int main() +{ + BS::thread_pool pool; + std::future my_future = pool.submit([] { return 42; }); + std::cout << my_future.get(); +} +``` + +Here we used a [lambda expression](https://en.cppreference.com/w/cpp/language/lambda) to quickly define the function on-the-fly. 
However, we can also use a previously-defined function: + +```cpp +#include "BS_thread_pool.hpp" + +int the_answer() +{ + return 42; +} + +int main() +{ + BS::thread_pool pool; + std::future my_future = pool.submit(the_answer); + std::cout << my_future.get(); +} +``` + +The following is an example of submitting a function with arguments: + +```cpp +#include "BS_thread_pool.hpp" + +int multiply(const int a, const int b) +{ + return a * b; +} + +int main() +{ + BS::thread_pool pool; + std::future my_future = pool.submit(multiply, 6, 7); + std::cout << my_future.get(); +} +``` + +Finally, here is an example of submitting a function with no return value and then using the future to wait for it to finish executing: + +```cpp +#include "BS_thread_pool.hpp" + +void sleep() +{ + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); +} + +int main() +{ + BS::thread_pool pool; + std::future my_future = pool.submit(sleep); + std::cout << "Waiting... "; + my_future.wait(); + std::cout << "Done."; +} +``` + +Here, the command `std::this_thread::sleep_for(std::chrono::milliseconds(1000))` instructs the thread to sleep for 1 second. + ### Submitting tasks to the queue without futures -Usually, it is best to submit a task to the queue using `submit()`. This allows you to wait for the task to finish and/or get its return value later. However, sometimes a future is not needed, for example when you just want to "set and forget" a certain task, or if the task already communicates with the main thread or with other tasks without using futures, such as via references or pointers. In such cases, you may wish to avoid the overhead involved in assigning a future to the task in order to increase performance. +Usually, it is best to submit a task to the queue using `submit()`. This allows you to wait for the task to finish and/or get its return value later. 
However, sometimes a future is not needed, for example when you just want to "set and forget" a certain task, or if the task already communicates with the main thread or with other tasks without using futures, such as via condition variables. In such cases, you may wish to avoid the overhead involved in assigning a future to the task in order to increase performance. The member function `push_task()` allows you to submit a task to the queue without generating a future for it. The task can have any number of arguments, but it cannot have a return value. For example: @@ -257,27 +327,39 @@ To wait for a **single** submitted task to complete, use `submit()` and then use Consider, for example, the following code: ```cpp -thread_pool pool; -size_t a[100]; -for (size_t i = 0; i < 100; i++) - pool.push_task([&a, i] { a[i] = i * i; }); -std::cout << a[50]; +#include "BS_thread_pool.hpp" + +int main() +{ + BS::thread_pool pool(5); + int squares[100]; + for (int i = 0; i < 100; ++i) + pool.push_task( + [&squares, i] + { + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + squares[i] = i * i; + }); + std::cout << squares[50]; +} ``` -The output will most likely be garbage, since the task that modifies `a[50]` has not yet finished executing by the time we try to access that element (in fact, that task is probably still waiting in the queue). One solution would be to use `submit()` instead of `push_task()`, but perhaps we don't want the overhead of generating 100 different futures. Instead, simply adding the line +The output will most likely be garbage, since the task that modifies `squares[50]` has not yet finished executing by the time we try to access that element - it's still waiting in the queue. One solution would be to use `submit()` instead of `push_task()`, but perhaps we don't want the overhead of generating 100 different futures. 
Instead, simply adding the line ```cpp pool.wait_for_tasks(); ``` -after the `for` loop will ensure - as efficiently as possible - that all tasks have finished running before we attempt to access any elements of the array `a`, and the code will print out the value `2500` as expected. (Note, however, that `wait_for_tasks()` will wait for **all** the tasks in the queue, including those that are unrelated to the `for` loop. Using `parallelize_loop()` would make much more sense in this particular case, as it will wait only for the tasks related to the loop.) +after the `for` loop will ensure - as efficiently as possible - that all tasks have finished running before we attempt to access any elements of the array `squares`, and the code will print out the value `2500` as expected. + +Note, however, that `wait_for_tasks()` will wait for **all** the tasks in the queue, including those that are unrelated to the `for` loop. Using [`parallelize_loop()`](#parallelizing-loops) would make much more sense in this particular case, as it will allow waiting only for the tasks related to the loop. ### Parallelizing loops Consider the following loop: ```cpp -for (T i = start; i < end; i++) +for (T i = start; i < end; ++i) do_something(i); ``` @@ -290,59 +372,135 @@ where: This loop may be automatically parallelized and submitted to the thread pool's queue using the member function `parallelize_loop()` as follows: ```cpp -auto loop = [](const T &a, const T &b) +auto loop = [](const T a, const T b) { - for (T i = a; i < b; i++) + for (T i = a; i < b; ++i) do_something(i); }; -pool.parallelize_loop(start, end, loop, n); +BS::multi_future loop_future = pool.parallelize_loop(start, end, loop, n); +loop_future.wait(); ``` -The range of indices `[start, end)` will be divided into `n` blocks of the form `[a, b)`. For example, if the range is `[0, 9)` and there are 3 blocks, then the blocks will be the ranges `[0, 3)`, `[3, 6)`, and `[6, 9)`. 
If possible, the blocks will be equal in size, otherwise the last block may be a bit longer. Then, a task will be submitted for each block, consisting of the function `loop()` with its two arguments being the start and end of the range `[a, b)` of each block. The main thread will then wait until all tasks generated by `parallelize_loop()` finish executing (and only those tasks - not any other tasks that also happen to be in the queue). +Here's how this works: -In the example above, the lambda function `loop` was defined separately for clarity. In practice, the lambda function will usually be defined within the argument itself, as in the example below. `loop` can also be an ordinary function (with no return value) instead of a lambda function, but that may be less useful, since typically one would like to capture some of the surrounding variables, as below. +* The lambda function `loop()` takes two indices, `a`, and `b`, and executes only the portion of the loop in the range `[a, b)`. + * Note that this lambda was defined here separately for clarity. In practice, the lambda function will usually be defined within the call to `parallelize_loop()` itself, as in the examples below. + * `loop()` can also be an ordinary function (with or without a return value) instead of a lambda function, but that may be less useful, since typically one would like to capture some of the surrounding variables. +* When `parallelize_loop(start, end, loop, n)` is called, it will divide the range of indices `[start, end)` into `n` blocks of the form `[a, b)`. For example, if the range is `[0, 9)` and there are 3 blocks, then the blocks will be the ranges `[0, 3)`, `[3, 6)`, and `[6, 9)`. If possible, the blocks will be equal in size, otherwise the last block may be a bit longer. +* Then, a task will be submitted for each block, consisting of the function `loop()` with its two arguments being the start and end of the range `[a, b)` of each block. 
+* Each task will have an `std::future<void>` assigned to it, and all these futures will be stored inside an object `loop_future` of the helper class template `BS::multi_future`. +* When `loop_future.wait()` is called, the main thread will wait until all tasks generated by `parallelize_loop()` finish executing, and only those tasks - not any other tasks that also happen to be in the queue. This is essentially the role of the `BS::multi_future` class: to wait for a specific group of tasks, in this case the tasks running the loop blocks. -If the fourth argument `n` is not specified, the number of blocks will be equal to the number of threads in the pool. For best performance, it is recommended to do your own benchmarks to find the optimal number of blocks for each loop (you can use the `timer` helper class - see [below](#measuring-execution-time)). Using less tasks than there are threads may be preferred if you are also running other tasks in parallel. Using more tasks than there are threads may improve performance in some cases. +If the fourth argument `n` is not specified, the number of blocks will be equal to the number of threads in the pool. For best performance, it is recommended to do your own benchmarks to find the optimal number of blocks for each loop (you can use the `BS::timer` helper class - see [below](#measuring-execution-time)). Using fewer tasks than there are threads may be preferred if you are also running other tasks in parallel. Using more tasks than there are threads may improve performance in some cases. As a simple example, the following code will calculate the squares of all integers from 0 to 99. 
Since there are 10 threads, and we did not specify a fourth argument, the loop will be divided into 10 blocks, each calculating 10 squares: ```cpp -#include "thread_pool.hpp" +#include "BS_thread_pool.hpp" int main() { - thread_pool pool(10); - uint32_t squares[100]; + BS::thread_pool pool(10); + int squares[100]; pool.parallelize_loop(0, 100, - [&squares](const uint32_t &a, const uint32_t &b) + [&squares](const int a, const int b) { - for (uint32_t i = a; i < b; i++) + for (int i = a; i < b; ++i) squares[i] = i * i; - }); - std::cout << "16^2 = " << squares[16] << '\n'; - std::cout << "32^2 = " << squares[32] << '\n'; + }) + .wait(); + std::cout << squares[50]; } ``` -The output should be: +Note that here, for simplicity, instead of creating a `BS::multi_future` and then using it to wait, we simply called the `wait()` member function directly on the temporary object returned by `parallelize_loop()`. This is a convenient shortcut when we have nothing else to do while waiting. -```none -16^2 = 256 -32^2 = 1024 +### Loops with return values + +Usually, `parallelize_loop()` should take functions with no return values. This is because the function will be executed once for each block, but the blocks are managed by the thread pool, so there's limited usability in returning one value per block. However, for the case where this is desired, such as for summation or some sorting algorithms, `parallelize_loop()` does accept functions with return values, in which case it returns a `BS::multi_future` object where `T` is the return value. 
+ +Here's an example of summing all the numbers from 1 to 100: + +```cpp +#include "BS_thread_pool.hpp" + +int main() +{ + BS::thread_pool pool; + BS::multi_future mf = pool.parallelize_loop(1, 101, + [](const int a, const int b) + { + int block_total = 0; + for (int i = a; i < b; ++i) + block_total += i; + return block_total; + }); + std::vector totals = mf.get(); + int sum = 0; + for (const int t : totals) + sum += t; + std::cout << sum; +} ``` +Note that calling `get()` on a `BS::multi_future` object returns an `std::vector` with the values obtained from each future. In this case, the values will be the partial sums from each block, so when we add them up, we will get the total sum. + ## Helper classes +### Handling multiple futures at once + +The helper class template `BS::multi_future`, already introduced in the context of `parallelize_loop()`, provides a convenient way to collect and access groups of futures. The futures are stored in a public member variable `f` of type `std::vector>`, so all standard `std::vector` operations are available for organizing the futures. Once the futures are stored, you can use `wait()` to wait for all of them at once or `get()` to get an `std::vector` with the results from all of them. Here's a simple example: + +```cpp +#include "BS_thread_pool.hpp" + +int square(const int i) +{ + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + return i * i; +}; + +int main() +{ + BS::thread_pool pool; + BS::multi_future mf1; + BS::multi_future mf2; + for (int i = 0; i < 100; ++i) + mf1.f.push_back(pool.submit(square, i)); + for (int i = 100; i < 200; ++i) + mf2.f.push_back(pool.submit(square, i)); + /// ... + /// Do some stuff while the first group of tasks executes... + /// ... + const std::vector squares1 = mf1.get(); + std::cout << "Results from the first group:" << '\n'; + for (const int s : squares1) + std::cout << s << ' '; + /// ... + /// Do other stuff while the second group of tasks executes... + /// ... 
+ const std::vector squares2 = mf2.get(); + std::cout << '\n' << "Results from the second group:" << '\n'; + for (const int s : squares2) + std::cout << s << ' '; +} +``` + +In this example, we simulate complicated tasks by having each task wait for 500ms before returning its result. We collect the futures of the tasks submitted within each loop into two separate `BS::multi_future` objects. `mf1` holds the results from the first loop, and `mf2` holds the results from the second loop. Now we can wait for and/or get the results from `mf1` whenever is convenient, and separately wait for and/or get the results from `mf2` at another time. + ### Synchronizing printing to an output stream When printing to an output stream from multiple threads in parallel, the output may become garbled. For example, consider this code: ```cpp -thread_pool pool; -for (auto i = 1; i <= 5; i++) - pool.push_task([i] { - std::cout << "Task no. " << i << " executing.\n"; - }); +#include "BS_thread_pool.hpp" + +int main() +{ + BS::thread_pool pool; + for (size_t i = 1; i <= 5; ++i) + pool.push_task([i] { std::cout << "Task no. " << i << " executing.\n"; }); +} ``` The output may look as follows: @@ -357,26 +515,29 @@ Task no. 1 executing. The reason is that, although each **individual** insertion to `std::cout` is thread-safe, there is no mechanism in place to ensure subsequent insertions from the same thread are printed contiguously. -The helper class `synced_stream` is designed to eliminate such synchronization issues. The constructor takes one optional argument, specifying the output stream to print to. If no argument is supplied, `std::cout` will be used: +The helper class `BS::synced_stream` is designed to eliminate such synchronization issues. The constructor takes one optional argument, specifying the output stream to print to. If no argument is supplied, `std::cout` will be used: ```cpp // Construct a synced stream that will print to std::cout. 
-synced_stream sync_out; +BS::synced_stream sync_out; // Construct a synced stream that will print to the output stream my_stream. -synced_stream sync_out(my_stream); +BS::synced_stream sync_out(my_stream); ``` -The member function `print()` takes an arbitrary number of arguments, which are inserted into the stream one by one, in the order they were given. `println()` does the same, but also prints a newline character `\n` at the end, for convenience. A mutex is used to synchronize this process, so that any other calls to `print()` or `println()` using the same `synced_stream` object must wait until the previous call has finished. +The member function `print()` takes an arbitrary number of arguments, which are inserted into the stream one by one, in the order they were given. `println()` does the same, but also prints a newline character `\n` at the end, for convenience. A mutex is used to synchronize this process, so that any other calls to `print()` or `println()` using the same `BS::synced_stream` object must wait until the previous call has finished. As an example, this code: ```cpp -synced_stream sync_out; -thread_pool pool; -for (auto i = 1; i <= 5; i++) - pool.push_task([i, &sync_out] { - sync_out.println("Task no. ", i, " executing."); - }); +#include "BS_thread_pool.hpp" + +int main() +{ + BS::synced_stream sync_out; + BS::thread_pool pool; + for (size_t i = 1; i <= 5; ++i) + pool.push_task([i, &sync_out] { sync_out.println("Task no. ", i, " executing."); }); +} ``` Will print out: @@ -389,21 +550,15 @@ Task no. 4 executing. Task no. 5 executing. ``` -**Warning:** Always create the `synced_stream` object **before** the `thread_pool` object, as we did in this example. When the `thread_pool` object goes out of scope, it waits for the remaining tasks to be executed. If the `synced_stream` object goes out of scope before the `thread_pool` object, then any tasks using the `synced_stream` will crash. 
Since objects are destructed in the opposite order of construction, creating the `synced_stream` object before the `thread_pool` object ensures that the `synced_stream` is always available to the tasks, even while the pool is destructing. +**Warning:** Always create the `BS::synced_stream` object **before** the `BS::thread_pool` object, as we did in this example. When the `BS::thread_pool` object goes out of scope, it waits for the remaining tasks to be executed. If the `BS::synced_stream` object goes out of scope before the `BS::thread_pool` object, then any tasks using the `BS::synced_stream` will crash. Since objects are destructed in the opposite order of construction, creating the `BS::synced_stream` object before the `BS::thread_pool` object ensures that the `BS::synced_stream` is always available to the tasks, even while the pool is destructing. ### Measuring execution time If you are using a thread pool, then your code is most likely performance-critical. Achieving maximum performance requires performing a considerable amount of benchmarking to determine the optimal settings and algorithms. Therefore, it is important to be able to measure the execution time of various computations and operations under different conditions. -For example, you may be interested in figuring out: - -* The optimal number of threads in the pool. -* The optimal number of tasks to divide a specific operation into, either using `parallelize_loop()` or manually. -* The optimal [sleep duration](#setting-the-worker-functions-sleep-duration) for the worker functions. - -The helper class `timer` provides a simple way to measure execution time. It is very straightforward to use: +The helper class `BS::timer` provides a simple way to measure execution time. It is very straightforward to use: -1. Create a new `timer` object. +1. Create a new `BS::timer` object. 2. Immediately before you execute the computation that you want to time, call the `start()` member function. 3. 
Immediately after the computation ends, call the `stop()` member function. 4. Use the member function `ms()` to obtain the elapsed time for the computation in milliseconds. @@ -411,26 +566,16 @@ The helper class `timer` provides a simple way to measure execution time. It is For example: ```cpp -timer tmr; +BS::timer tmr; tmr.start(); do_something(); tmr.stop(); std::cout << "The elapsed time was " << tmr.ms() << " ms.\n"; ``` -## Other features - -### Setting the worker function's sleep duration +A practical application of the `BS::timer` class can be found in the benchmark portion of the test program `BS_thread_pool_test.cpp`. -The **worker function** is the function that controls the execution of tasks by each thread. It loops continuously, and with each iteration of the loop, checks if there are any tasks in the queue. If it finds a task, it pops it out of the queue and executes it. If it does not find a task, it will wait for a bit, by calling `std::this_thread::sleep_for()`, and then check the queue again. The public member variable `sleep_duration` controls the duration, in microseconds, that the worker function sleeps for when it cannot find a task in the queue. - -The default value of `sleep_duration` is `1000` microseconds, or `1` millisecond. In our benchmarks, lower values resulted in high CPU usage when the workers were idle. The value of `1000` microseconds was roughly the minimum value needed to reduce the idle CPU usage to a negligible amount. - -In addition, in our benchmarks this value resulted in moderately improved performance compared to lower values, since the workers check the queue - which is a costly process - less frequently. On the other hand, increasing the value even more could potentially cause the workers to spend too much time sleeping and not pick up tasks from the queue quickly enough, so `1000` is the "sweet spot". 
- -However, please note that this value is likely unique to the particular system our benchmarks were performed on, and your own optimal value would depend on factors such as your OS and C++ implementation, the type, complexity, and average duration of the tasks submitted to the pool, and whether there are any other programs running at the same time. Therefore, it is strongly recommended to do your own benchmarks and find the value that works best for you. - -If `sleep_duration` is set to `0`, then the worker function will execute `std::this_thread::yield()` instead of sleeping if there are no tasks in the queue. This will suggest to the OS that it should put this thread on hold and allow other threads to run instead. However, this also causes the worker functions to have high CPU usage when idle. On the other hand, for some applications this setting may provide better performance than sleeping - again, do your own benchmarks and find what works best for you. +## Other features ### Monitoring the tasks @@ -438,18 +583,18 @@ Sometimes you may wish to monitor what is happening with the tasks you submitted * `get_tasks_queued()` gets the number of tasks currently waiting in the queue to be executed by the threads. * `get_tasks_running()` gets the number of tasks currently being executed by the threads. -* `get_tasks_total()` gets the total number of unfinished tasks - either still in the queue, or running in a thread. -* Note that `get_tasks_running() == get_tasks_total() - get_tasks_queued()`. +* `get_tasks_total()` gets the total number of unfinished tasks: either still in the queue, or running in a thread. +* Note that `get_tasks_total() == get_tasks_queued() + get_tasks_running()`. 
These functions are demonstrated in the following program: ```cpp -#include "thread_pool.hpp" +#include "BS_thread_pool.hpp" -synced_stream sync_out; -thread_pool pool(4); +BS::synced_stream sync_out; +BS::thread_pool pool(4); -void sleep_half_second(const size_t &i) +void sleep_half_second(const size_t i) { std::this_thread::sleep_for(std::chrono::milliseconds(500)); sync_out.println("Task ", i, " done."); @@ -457,17 +602,12 @@ void sleep_half_second(const size_t &i) void monitor_tasks() { - sync_out.println(pool.get_tasks_total(), - " tasks total, ", - pool.get_tasks_running(), - " tasks running, ", - pool.get_tasks_queued(), - " tasks queued."); + sync_out.println(pool.get_tasks_total(), " tasks total, ", pool.get_tasks_running(), " tasks running, ", pool.get_tasks_queued(), " tasks queued."); } int main() { - for (size_t i = 0; i < 12; i++) + for (size_t i = 0; i < 12; ++i) pool.push_task(sleep_half_second, i); monitor_tasks(); std::this_thread::sleep_for(std::chrono::milliseconds(750)); @@ -479,7 +619,7 @@ int main() } ``` -Assuming you have at least 4 hardware threads (so that 4 tasks can run concurrently), the output will be similar to: +Assuming you have at least 4 hardware threads (so that 4 tasks can run concurrently), the output should be similar to: ```none 12 tasks total, 0 tasks running, 12 tasks queued. @@ -502,19 +642,19 @@ Task 11 done. ### Pausing the workers -Sometimes you may wish to temporarily pause the execution of tasks, or perhaps you want to submit tasks to the queue but only start executing them at a later time. You can do this using the public member variable `paused`. +Sometimes you may wish to temporarily pause the execution of tasks, or perhaps you want to submit tasks to the queue in advance and only start executing them at a later time. You can do this using the public member variable `paused`. -When `paused` is set to `true`, the workers will temporarily stop popping new tasks out of the queue. 
However, any tasks already executed will keep running until they are done, since the thread pool has no control over the internal code of your tasks. If you need to pause a task in the middle of its execution, you must do that manually by programming your own pause mechanism into the task itself. To resume popping tasks, set `paused` back to its default value of `false`. +When `paused` is set to `true`, the workers will temporarily stop retrieving new tasks out of the queue. However, any tasks already executed will keep running until they are done, since the thread pool has no control over the internal code of your tasks. If you need to pause a task in the middle of its execution, you must do that manually by programming your own pause mechanism into the task itself. To resume retrieving tasks, set `paused` back to its default value of `false`. Here is an example: ```cpp -#include "thread_pool.hpp" +#include "BS_thread_pool.hpp" -synced_stream sync_out; -thread_pool pool(4); +BS::synced_stream sync_out; +BS::thread_pool pool(4); -void sleep_half_second(const size_t &i) +void sleep_half_second(const size_t i) { std::this_thread::sleep_for(std::chrono::milliseconds(500)); sync_out.println("Task ", i, " done."); @@ -522,7 +662,7 @@ void sleep_half_second(const size_t &i) int main() { - for (size_t i = 0; i < 8; i++) + for (size_t i = 0; i < 8; ++i) pool.push_task(sleep_half_second, i); sync_out.println("Submitted 8 tasks."); std::this_thread::sleep_for(std::chrono::milliseconds(250)); @@ -531,7 +671,7 @@ int main() std::this_thread::sleep_for(std::chrono::milliseconds(1000)); sync_out.println("Still paused..."); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - for (size_t i = 8; i < 12; i++) + for (size_t i = 8; i < 12; ++i) pool.push_task(sleep_half_second, i); sync_out.println("Submitted 4 more tasks."); sync_out.println("Still paused..."); @@ -541,7 +681,7 @@ int main() } ``` -Assuming you have at least 4 hardware threads, the output will be similar 
to: +Assuming you have at least 4 hardware threads, the output should be similar to: ```none Submitted 8 tasks. @@ -569,12 +709,12 @@ Here is what happened. We initially submitted a total of 8 tasks to the queue. S While the workers are paused, `wait_for_tasks()` will wait for the running tasks instead of all tasks (otherwise it would wait forever). This is demonstrated by the following program: ```cpp -#include "thread_pool.hpp" +#include "BS_thread_pool.hpp" -synced_stream sync_out; -thread_pool pool(4); +BS::synced_stream sync_out; +BS::thread_pool pool(4); -void sleep_half_second(const size_t &i) +void sleep_half_second(const size_t i) { std::this_thread::sleep_for(std::chrono::milliseconds(500)); sync_out.println("Task ", i, " done."); @@ -582,11 +722,11 @@ void sleep_half_second(const size_t &i) int main() { - for (size_t i = 0; i < 8; i++) + for (size_t i = 0; i < 8; ++i) pool.push_task(sleep_half_second, i); sync_out.println("Submitted 8 tasks. Waiting for them to complete."); pool.wait_for_tasks(); - for (size_t i = 8; i < 20; i++) + for (size_t i = 8; i < 20; ++i) pool.push_task(sleep_half_second, i); sync_out.println("Submitted 12 more tasks."); std::this_thread::sleep_for(std::chrono::milliseconds(250)); @@ -642,16 +782,16 @@ All tasks completed. The first `wait_for_tasks()`, which was called with `paused == false`, waited for all 8 tasks, both running and queued. The second `wait_for_tasks()`, which was called with `paused == true`, only waited for the 4 running tasks, while the other 8 tasks remained queued, and were not executed since the pool was paused. Finally, the third `wait_for_tasks()`, which was called with `paused == false`, waited for the remaining 8 tasks, both running and queued. -**Warning**: If the thread pool is destroyed while paused, any tasks still in the queue will never be executed. +**Warning**: If the thread pool is destroyed while paused, any tasks still in the queue will never be executed! 
### Exception handling `submit()` catches any exceptions thrown by the submitted task and forwards them to the corresponding future. They can then be caught when invoking the `get()` member function of the future. For example: ```cpp -#include "thread_pool.hpp" +#include "BS_thread_pool.hpp" -double inverse(const double &x) +double inverse(const double x) { if (x == 0) throw std::runtime_error("Division by zero!"); @@ -661,14 +801,14 @@ double inverse(const double &x) int main() { - thread_pool pool; + BS::thread_pool pool; auto my_future = pool.submit(inverse, 0); try { - double result = my_future.get(); + const double result = my_future.get(); std::cout << "The result is: " << result << '\n'; } - catch (const std::exception &e) + catch (const std::exception& e) { std::cout << "Caught exception: " << e.what() << '\n'; } @@ -683,11 +823,11 @@ Caught exception: Division by zero! ## Testing the package -The included file `thread_pool_test.cpp` will perform automated tests of all aspects of the package, and benchmark some multithreaded matrix operations. The output will be printed both to `std::cout` and to a file named `thread_pool_test-yyyy-mm-dd_hh.mm.ss.log` based on the current date and time. In addition, the code is thoroughly documented, and is meant to serve as an extensive example of how to properly use the package. +The included file `BS_thread_pool_test.cpp` will perform automated tests of all aspects of the package, and perform simple benchmarks. The output will be printed both to `std::cout` and to a file named `BS_thread_pool_test-yyyy-mm-dd_hh.mm.ss.log` based on the current date and time. In addition, the code is thoroughly documented, and is meant to serve as an extensive example of how to properly use the package. Please make sure to: -1. [Compile](#compiling-and-compatibility) `thread_pool_test.cpp` with optimization flags enabled (e.g. `-O3` on GCC / Clang or `/O2` on MSVC). +1. 
[Compile](#compiling-and-compatibility) `BS_thread_pool_test.cpp` with optimization flags enabled (e.g. `-O3` on GCC / Clang or `/O2` on MSVC). 2. Run the test without any other applications, especially multithreaded applications, running in parallel. If any of the tests fail, please [submit a bug report](https://github.com/bshoshany/thread-pool/issues) including the exact specifications of your system (OS, CPU, compiler, etc.) and the generated log file. @@ -698,12 +838,12 @@ A sample output of a successful run of the automated tests is as follows: ```none A C++17 Thread Pool for High-Performance Scientific Computing -(c) 2021 Barak Shoshany (baraksh@gmail.com) (http://baraksh.com) +(c) 2022 Barak Shoshany (baraksh@gmail.com) (http://baraksh.com) GitHub: https://github.com/bshoshany/thread-pool -Thread pool library version is v2.0.0 (2021-08-14). +Thread pool library version is v3.0.0 (2022-05-30). Hardware concurrency is 24. -Generating log file: thread_pool_test-2021-08-14_23.34.25.log. +Generating log file: BS_thread_pool_test-2022-05-30_22.59.30.log. Important: Please do not run any other applications, especially multithreaded applications, in parallel with this test! @@ -761,43 +901,46 @@ Checking that wait_for_tasks() works... ======================================= Checking that parallelize_loop() works: ======================================= -Verifying that a loop from -2064 to 551 with 4 tasks modifies all indices... +Verifying that a loop from -434827 to 461429 with 23 tasks modifies all indices... -> PASSED! -Verifying that a loop from -658 to -77 with 19 tasks modifies all indices... +Verifying that a loop from 255333 to -889028 with 9 tasks modifies all indices... -> PASSED! -Verifying that a loop from 1512 to -1046 with 1 task modifies all indices... +Verifying that a loop from -257322 to 550471 with 5 tasks modifies all indices... -> PASSED! -Verifying that a loop from -2334 to -1770 with 23 tasks modifies all indices... 
+Verifying that a loop from -257648 to -475958 with 23 tasks modifies all indices... -> PASSED! -Verifying that a loop from 1775 to -1242 with 13 tasks modifies all indices... +Verifying that a loop from 175412 to -544672 with 13 tasks modifies all indices... -> PASSED! -Verifying that a loop from 846 to -506 with 14 tasks modifies all indices... +Verifying that a loop from -244797 to -970178 with 11 tasks modifies all indices... -> PASSED! -Verifying that a loop from -301 to -2111 with 5 tasks modifies all indices... +Verifying that a loop from 411251 to -718341 with 15 tasks modifies all indices... -> PASSED! -Verifying that a loop from 1758 to -1602 with 11 tasks modifies all indices... +Verifying that a loop from 418787 to 978302 with 22 tasks modifies all indices... -> PASSED! -Verifying that a loop from 94 to -1103 with 24 tasks modifies all indices... +Verifying that a loop from -2412 to -310158 with 4 tasks modifies all indices... -> PASSED! -Verifying that a loop from 612 to 2026 with 13 tasks modifies all indices... +Verifying that a loop from -881862 to 137673 with 7 tasks modifies all indices... -> PASSED! - -====================================================== -Checking that different values of sleep_duration work: -====================================================== -Submitting tasks with sleep_duration = 0 microseconds... +Verifying that a loop from 539438 to -759983 with 17 tasks correctly sums all indices... +-> PASSED! +Verifying that a loop from 745706 to -519554 with 13 tasks correctly sums all indices... +-> PASSED! +Verifying that a loop from 288078 to 432534 with 12 tasks correctly sums all indices... +-> PASSED! +Verifying that a loop from -487251 to 302796 with 4 tasks correctly sums all indices... -> PASSED! -Submitting tasks with sleep_duration = 1909 microseconds... +Verifying that a loop from 408766 to 756890 with 3 tasks correctly sums all indices... -> PASSED! -Submitting tasks with sleep_duration = 469 microseconds... 
+Verifying that a loop from -976768 to -500744 with 10 tasks correctly sums all indices... -> PASSED! -Submitting tasks with sleep_duration = 964 microseconds... +Verifying that a loop from -817442 to 175967 with 6 tasks correctly sums all indices... -> PASSED! -Submitting tasks with sleep_duration = 1946 microseconds... +Verifying that a loop from -765007 to -53682 with 23 tasks correctly sums all indices... -> PASSED! -Submitting tasks with sleep_duration = 773 microseconds... +Verifying that a loop from -903190 to -361760 with 14 tasks correctly sums all indices... +-> PASSED! +Verifying that a loop from 72823 to -85485 with 15 tasks correctly sums all indices... -> PASSED! -Resetting sleep_duration to the default value (1000 microseconds). ==================================== Checking that task monitoring works: @@ -806,22 +949,22 @@ Resetting pool to 4 threads. Submitting 12 tasks. After submission, should have: 12 tasks total, 4 tasks running, 8 tasks queued... -> PASSED! -Task 1 released. -Task 3 released. Task 0 released. Task 2 released. +Task 1 released. +Task 3 released. After releasing 4 tasks, should have: 8 tasks total, 4 tasks running, 4 tasks queued... +-> PASSED! +Task 7 released. Task 5 released. Task 4 released. -Task 7 released. Task 6 released. --> PASSED! After releasing 4 more tasks, should have: 4 tasks total, 4 tasks running, 0 tasks queued... -> PASSED! -Task 11 released. +Task 10 released. Task 8 released. Task 9 released. -Task 10 released. +Task 11 released. After releasing the final 4 tasks, should have: 0 tasks total, 0 tasks running, 0 tasks queued... -> PASSED! Resetting pool to 24 threads. @@ -844,19 +987,19 @@ Task 0 done. 300ms later, should have: 8 tasks total, 4 tasks running, 4 tasks queued... -> PASSED! Pausing pool and using wait_for_tasks() to wait for the running tasks. -Task 7 done. -Task 5 done. Task 6 done. Task 4 done. +Task 5 done. +Task 7 done. 
After waiting, should have: 4 tasks total, 0 tasks running, 4 tasks queued... -> PASSED! 200ms later, should still have: 4 tasks total, 0 tasks running, 4 tasks queued... -> PASSED! Unpausing pool and using wait_for_tasks() to wait for all tasks. Task 9 done. -Task 8 done. Task 10 done. Task 11 done. +Task 8 done. After waiting, should have: 0 tasks total, 0 tasks running, 0 tasks queued... -> PASSED! Resetting pool to 24 threads. @@ -867,161 +1010,86 @@ Checking that exception handling works: -> PASSED! ============================================================ -Testing that matrix operations produce the expected results: +Testing that vector operations produce the expected results: ============================================================ -Using matrices of size 240x240 with a total of 57600 elements. -Adding two matrices (single-threaded)... -Adding two matrices (multithreaded)... -Comparing the results... +Adding two vectors with 83788 elements using 24 tasks... +-> PASSED! +Adding two vectors with 595750 elements using 3 tasks... -> PASSED! -Transposing a matrix (single-threaded)... -Transposing a matrix (multithreaded)... -Comparing the results... +Adding two vectors with 738336 elements using 20 tasks... -> PASSED! -Multiplying two matrices (single-threaded)... -Multiplying two matrices (multithreaded)... -Comparing the results... +Adding two vectors with 100123 elements using 24 tasks... +-> PASSED! +Adding two vectors with 921883 elements using 24 tasks... +-> PASSED! +Adding two vectors with 76713 elements using 22 tasks... +-> PASSED! +Adding two vectors with 891037 elements using 2 tasks... +-> PASSED! +Adding two vectors with 245369 elements using 17 tasks... +-> PASSED! +Adding two vectors with 39624 elements using 11 tasks... +-> PASSED! +Adding two vectors with 295307 elements using 10 tasks... -> PASSED! ++++++++++++++++++++++++++++++ -SUCCESS: Passed all 46 checks! +SUCCESS: Passed all 57 checks! 
++++++++++++++++++++++++++++++ ``` ### Performance tests -If all checks passed, `thread_pool_test.cpp` will perform benchmarking of multithreaded matrix operations. Here we will present the results obtained with two different systems. +If all checks passed, `BS_thread_pool_test.cpp` will perform simple benchmarks by filling a specific number of vectors of fixed size with random values. The program decides how many vectors to use by testing how many are needed to reach a target duration in the single-threaded test. This ensures that the test takes approximately the same amount of time on different systems, and is thus more consistent and portable. -The first test was performed on a high-end desktop computer equipped with a 12-core / 24-thread AMD Ryzen 9 3900X CPU at 3.8 GHz and 32 GB of DDR4 RAM at 3600 MHz, compiled using GCC v11.2.0 on Windows 10 build 19043.1165 with the `-O3` compiler flag. The thread pool used 22 out of 24 threads, leaving 2 threads free for the operating system - which in our tests increased performance, presumably since all 22 threads could be dedicated entirely to the test. The output was as follows: +Once the required number of vectors has been determined, the program will test the performance of several multi-threaded tests, dividing the total number of vectors into different numbers of tasks, compare them to the performance of the single-threaded test, and indicate the maximum speedup obtained. -```none -=================================== -Performing matrix performance test: -=================================== -Using 22 out of 24 threads. -Determining the optimal sleep duration........................ -Result: The optimal sleep duration is 300 microseconds. - -Adding two 4400x4400 matrices 20 times: -With 1 task, mean execution time was 39.3 ms with standard deviation 2.4 ms. -With 5 tasks, mean execution time was 21.2 ms with standard deviation 1.7 ms. -With 11 tasks, mean execution time was 20.4 ms with standard deviation 1.1 ms. 
-With 22 tasks, mean execution time was 18.3 ms with standard deviation 1.3 ms. -With 44 tasks, mean execution time was 17.4 ms with standard deviation 0.7 ms. -With 88 tasks, mean execution time was 18.0 ms with standard deviation 1.0 ms. -Maximum speedup obtained: 2.3x. - -Transposing one 4400x4400 matrix 20 times: -With 1 task, mean execution time was 139.8 ms with standard deviation 3.0 ms. -With 5 tasks, mean execution time was 38.2 ms with standard deviation 2.4 ms. -With 11 tasks, mean execution time was 23.3 ms with standard deviation 1.8 ms. -With 22 tasks, mean execution time was 18.9 ms with standard deviation 1.6 ms. -With 44 tasks, mean execution time was 19.5 ms with standard deviation 1.5 ms. -With 88 tasks, mean execution time was 18.1 ms with standard deviation 0.7 ms. -Maximum speedup obtained: 7.7x. - -Multiplying two 550x550 matrices 20 times: -With 1 task, mean execution time was 165.2 ms with standard deviation 2.5 ms. -With 5 tasks, mean execution time was 35.9 ms with standard deviation 1.0 ms. -With 11 tasks, mean execution time was 17.6 ms with standard deviation 0.5 ms. -With 22 tasks, mean execution time was 10.2 ms with standard deviation 0.7 ms. -With 44 tasks, mean execution time was 16.1 ms with standard deviation 1.4 ms. -With 88 tasks, mean execution time was 15.4 ms with standard deviation 0.7 ms. -Maximum speedup obtained: 16.2x. - -Generating random 4400x4400 matrix 20 times: -With 1 task, mean execution time was 244.7 ms with standard deviation 2.6 ms. -With 5 tasks, mean execution time was 51.5 ms with standard deviation 1.5 ms. -With 11 tasks, mean execution time was 25.7 ms with standard deviation 0.9 ms. -With 22 tasks, mean execution time was 19.1 ms with standard deviation 2.7 ms. -With 44 tasks, mean execution time was 17.2 ms with standard deviation 2.1 ms. -With 88 tasks, mean execution time was 15.8 ms with standard deviation 1.0 ms. -Maximum speedup obtained: 15.5x. 
- -Overall, multithreading provided speedups of up to 16.2x. +Please note that these benchmarks are only intended to demonstrate that the package can provide a significant speedup, and it is highly recommended to perform your own benchmarks with your specific system, compiler, and code. -+++++++++++++++++++++++++++++++++++++++ -Thread pool performance test completed! -+++++++++++++++++++++++++++++++++++++++ -``` - -Here are some lessons we can learn from these results: - -* For simple element-wise operations such as addition, multithreading improves performance very modestly, only by a factor of 2.3, even when utilizing 22 threads in parallel. This is because compiler optimizations already parallelize simple loops fairly well on their own. Omitting the `-O3` optimization flag, we observed a 6.8x speedup for addition. However, the user will most likely be compiling with optimizations turned on anyway. -* Transposition enjoys a moderate 7.7x speedup with multithreading. Note that transposition requires reading memory is non-sequential order, jumping between the rows of the source matrix, which is why, compared to sequential operations such as addition, it is much slower when single-threaded, and benefits more from multithreading. -* Matrix multiplication and random matrix generation, which are more complicated operations that cannot be automatically parallelized by compiler optimizations, gain the most out of multithreading - with a very significant speedup by a factor of around 16 on average. Given that the test CPU only has 12 physical cores, and hyperthreading can generally produce no more than a 30% performance improvement, a 16x speedup is about as good as can be expected. -* Using as many tasks as there are threads almost always provides the best performance. Although in some cases 44 or 88 tasks seem to provide a slightly lower mean execution time compared to 22 tasks, the difference is within less than 1 standard deviation in all cases. 
- -### Dual Intel Xeon Gold 6148 (80 threads) - -The second test was performed on a [Compute Canada](https://www.computecanada.ca/) node equipped with dual 20-core / 40-thread Intel Xeon Gold 6148 CPUs at 2.4 GHz (for a total of 40 cores and 80 threads) and 202 GB of RAM, compiled using GCC v9.4.0 on CentOS Linux 7.6.1810 with the `-O3` compiler flag. The thread pool consisted of 78 threads. The output was as follows: +Here we will present the results of the performance test running on a high-end desktop computer equipped with a 12-core / 24-thread AMD Ryzen 9 3900X CPU at 3.8 GHz and 32 GB of DDR4 RAM at 3600 MHz, compiled using [GCC](https://gcc.gnu.org/) v12.1.0 ([WinLibs build](https://winlibs.com/)) on Windows 11 build 22000.675 with the `-O3` compiler flag. The output was as follows: ```none -=================================== -Performing matrix performance test: -=================================== -Using 78 out of 80 threads. -Determining the optimal sleep duration........................ -Result: The optimal sleep duration is 1000 microseconds. - -Adding two 15600x15600 matrices 20 times: -With 1 task, mean execution time was 846.1 ms with standard deviation 40.2 ms. -With 19 tasks, mean execution time was 88.1 ms with standard deviation 8.6 ms. -With 39 tasks, mean execution time was 73.5 ms with standard deviation 4.8 ms. -With 78 tasks, mean execution time was 67.3 ms with standard deviation 2.2 ms. -With 156 tasks, mean execution time was 64.9 ms with standard deviation 2.3 ms. -With 312 tasks, mean execution time was 65.8 ms with standard deviation 1.5 ms. -Maximum speedup obtained: 13.0x. - -Transposing one 15600x15600 matrix 20 times: -With 1 task, mean execution time was 1689.4 ms with standard deviation 75.3 ms. -With 19 tasks, mean execution time was 155.3 ms with standard deviation 19.7 ms. -With 39 tasks, mean execution time was 115.0 ms with standard deviation 10.8 ms. 
-With 78 tasks, mean execution time was 99.0 ms with standard deviation 6.0 ms. -With 156 tasks, mean execution time was 96.2 ms with standard deviation 1.6 ms. -With 312 tasks, mean execution time was 97.8 ms with standard deviation 1.7 ms. -Maximum speedup obtained: 17.6x. - -Multiplying two 1950x1950 matrices 20 times: -With 1 task, mean execution time was 15415.1 ms with standard deviation 672.5 ms. -With 19 tasks, mean execution time was 1152.5 ms with standard deviation 62.8 ms. -With 39 tasks, mean execution time was 537.9 ms with standard deviation 4.1 ms. -With 78 tasks, mean execution time was 292.3 ms with standard deviation 42.5 ms. -With 156 tasks, mean execution time was 936.4 ms with standard deviation 15.8 ms. -With 312 tasks, mean execution time was 951.2 ms with standard deviation 22.3 ms. -Maximum speedup obtained: 52.7x. - -Generating random 15600x15600 matrix 20 times: -With 1 task, mean execution time was 4318.3 ms with standard deviation 6.3 ms. -With 19 tasks, mean execution time was 260.8 ms with standard deviation 15.1 ms. -With 39 tasks, mean execution time was 156.1 ms with standard deviation 1.6 ms. -With 78 tasks, mean execution time was 86.2 ms with standard deviation 1.9 ms. -With 156 tasks, mean execution time was 84.8 ms with standard deviation 0.4 ms. -With 312 tasks, mean execution time was 85.2 ms with standard deviation 1.3 ms. -Maximum speedup obtained: 51.0x. - -Overall, multithreading provided speedups of up to 52.7x. +====================== +Performing benchmarks: +====================== +Using 24 threads. +Each test will be repeated 20 times to collect reliable statistics. + +Generating 57320 random vectors with 500 elements each: +Single-threaded, mean execution time was 298.2 ms with standard deviation 1.7 ms. +With 6 tasks, mean execution time was 52.3 ms with standard deviation 1.3 ms. +With 12 tasks, mean execution time was 30.3 ms with standard deviation 0.8 ms. 
+With 24 tasks, mean execution time was 16.4 ms with standard deviation 1.2 ms. +With 48 tasks, mean execution time was 19.2 ms with standard deviation 2.6 ms. +With 96 tasks, mean execution time was 17.8 ms with standard deviation 1.2 ms. +Maximum speedup obtained by multithreading vs. single-threading: 18.2x, using 24 tasks. +++++++++++++++++++++++++++++++++++++++ Thread pool performance test completed! +++++++++++++++++++++++++++++++++++++++ ``` -The speedup of around 51.9x on average for matrix multiplication and random matrix generation again matches the estimation of a 30% improvement in performance over the 40 physical CPU cores due to hyperthreading, which indicates that we are once again saturating the maximum possible performance of our system. +This CPU has 12 physical cores, with each core providing two separate logical cores via hyperthreading, for a total of 24 threads. Without hyperthreading, we would expect a maximum theoretical speedup of 12x. With hyperthreading, one might naively expect to achieve up to a 24x speedup, but this is in fact impossible, as both logical cores share the same physical core's resources. However, generally we would expect [an estimated 30% additional speedup](https://software.intel.com/content/www/us/en/develop/articles/how-to-determine-the-effectiveness-of-hyper-threading-technology-with-an-application.html) from hyperthreading, which amounts to around 15.6x in this case. In our performance test, we see a speedup of 18.2x, saturating and even surpassing this estimated theoretical upper bound. ## Issue and pull request policy This package is under continuous and active development. If you encounter any bugs, or if you would like to request any additional features, please feel free to [open a new issue on GitHub](https://github.com/bshoshany/thread-pool/issues) and I will look into it as soon as I can. -Contributions are always welcome. 
However, I release my projects in cumulative updates after editing them locally on my system, so my policy is not to accept any pull requests. If you open a pull request, and I decide to incorporate it into the code, I will first perform some tests to ensure that the change doesn't break anything, and then merge it into the next release of the project, possibly together with some other changes, and along with a version bump and a corresponding note in `CHANGELOG.md` with a link to the pull request. +Contributions are always welcome. However, I release my projects in cumulative updates after editing and testing them locally on my system, so my policy is not to accept any pull requests. If you open a pull request, and I decide to incorporate your suggestion into the project, I will first modify your code to comply with the project's coding conventions (formatting, syntax, naming, comments, programming practices, etc.), and perform some tests to ensure that the change doesn't break anything. I will then merge it into the next release of the project, possibly together with some other changes. The new release will also include a note in `CHANGELOG.md` with a link to your pull request, and modifications to the documentation in `README.md` as needed. + +## Acknowledgements + +Many GitHub users have helped improve this project, directly or indirectly, via issues, pull requests, comments, and/or personal correspondence. Please see `CHANGELOG.md` for links to specific issues and pull requests that have been the most helpful. Thank you all for your contribution! :) ## Copyright and citing -Copyright (c) 2021 [Barak Shoshany](http://baraksh.com). Licensed under the [MIT license](LICENSE.txt). +Copyright (c) 2022 [Barak Shoshany](http://baraksh.com). Licensed under the [MIT license](LICENSE.txt). + +If you use the library in software of any kind, please provide a link to [the GitHub repository](https://github.com/bshoshany/thread-pool) in the source code and documentation. 
If you use this library in published research, please cite it as follows: -* Barak Shoshany, "A C++17 Thread Pool for High-Performance Scientific Computing", [doi:10.5281/zenodo.4742687](https://doi.org/10.5281/zenodo.4742687), [arXiv:2105.00613](https://arxiv.org/abs/2105.00613) (May 2021) +* Barak Shoshany, *"A C++17 Thread Pool for High-Performance Scientific Computing"*, [doi:10.5281/zenodo.4742687](https://doi.org/10.5281/zenodo.4742687), [arXiv:2105.00613](https://arxiv.org/abs/2105.00613) (May 2021) You can use the following BibTeX entry: @@ -1040,3 +1108,5 @@ You can use the following BibTeX entry: year = {2021} } ``` + +Please note that the [companion paper on arXiv](https://arxiv.org/abs/2105.00613) is updated infrequently. The paper is intended to facilitate discovery of the package by scientists who may find it useful for scientific computing purposes and to allow citing the package in scientific research, but most users should read the `README.md` file on [the GitHub repository](https://github.com/bshoshany/thread-pool) instead, as it is guaranteed to always be up to date. diff --git a/thread_pool.hpp b/thread_pool.hpp deleted file mode 100644 index 4460e54..0000000 --- a/thread_pool.hpp +++ /dev/null @@ -1,544 +0,0 @@ -#pragma once - -/** - * @file thread_pool.hpp - * @author Barak Shoshany (baraksh@gmail.com) (http://baraksh.com) - * @version 2.0.0 - * @date 2021-08-14 - * @copyright Copyright (c) 2021 Barak Shoshany. Licensed under the MIT license. If you use this library in published research, please cite it as follows: - * - Barak Shoshany, "A C++17 Thread Pool for High-Performance Scientific Computing", doi:10.5281/zenodo.4742687, arXiv:2105.00613 (May 2021) - * - * @brief A C++17 thread pool for high-performance scientific computing. - * @details A modern C++17-compatible thread pool implementation, built from scratch with high-performance scientific computing in mind. 
The thread pool is implemented as a single lightweight and self-contained class, and does not have any dependencies other than the C++17 standard library, thus allowing a great degree of portability. In particular, this implementation does not utilize OpenMP or any other high-level multithreading APIs, and thus gives the programmer precise low-level control over the details of the parallelization, which permits more robust optimizations. The thread pool was extensively tested on both AMD and Intel CPUs with up to 40 cores and 80 threads. Other features include automatic generation of futures and easy parallelization of loops. Two helper classes enable synchronizing printing to an output stream by different threads and measuring execution time for benchmarking purposes. Please visit the GitHub repository at https://github.com/bshoshany/thread-pool for documentation and updates, or to submit feature requests and bug reports. - */ - -#define THREAD_POOL_VERSION "v2.0.0 (2021-08-14)" - -#include // std::atomic -#include // std::chrono -#include // std::int_fast64_t, std::uint_fast32_t -#include // std::function -#include // std::future, std::promise -#include // std::cout, std::ostream -#include // std::shared_ptr, std::unique_ptr -#include // std::mutex, std::scoped_lock -#include // std::queue -#include // std::this_thread, std::thread -#include // std::common_type_t, std::decay_t, std::enable_if_t, std::is_void_v, std::invoke_result_t -#include // std::move - -// ============================================================================================= // -// Begin class thread_pool // - -/** - * @brief A C++17 thread pool class. The user submits tasks to be executed into a queue. Whenever a thread becomes available, it pops a task from the queue and executes it. Each task is automatically assigned a future, which can be used to wait for the task to finish executing and/or obtain its eventual return value. 
- */ -class thread_pool -{ - typedef std::uint_fast32_t ui32; - typedef std::uint_fast64_t ui64; - -public: - // ============================ - // Constructors and destructors - // ============================ - - /** - * @brief Construct a new thread pool. - * - * @param _thread_count The number of threads to use. The default value is the total number of hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this will be twice the number of CPU cores. If the argument is zero, the default value will be used instead. - */ - thread_pool(const ui32 &_thread_count = std::thread::hardware_concurrency()) - : thread_count(_thread_count ? _thread_count : std::thread::hardware_concurrency()), threads(new std::thread[_thread_count ? _thread_count : std::thread::hardware_concurrency()]) - { - create_threads(); - } - - /** - * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads. Note that if the variable paused is set to true, then any tasks still in the queue will never be executed. - */ - ~thread_pool() - { - wait_for_tasks(); - running = false; - destroy_threads(); - } - - // ======================= - // Public member functions - // ======================= - - /** - * @brief Get the number of tasks currently waiting in the queue to be executed by the threads. - * - * @return The number of queued tasks. - */ - ui64 get_tasks_queued() const - { - const std::scoped_lock lock(queue_mutex); - return tasks.size(); - } - - /** - * @brief Get the number of tasks currently being executed by the threads. - * - * @return The number of running tasks. - */ - ui32 get_tasks_running() const - { - return tasks_total - (ui32)get_tasks_queued(); - } - - /** - * @brief Get the total number of unfinished tasks - either still in the queue, or running in a thread. - * - * @return The total number of tasks. 
- */ - ui32 get_tasks_total() const - { - return tasks_total; - } - - /** - * @brief Get the number of threads in the pool. - * - * @return The number of threads. - */ - ui32 get_thread_count() const - { - return thread_count; - } - - /** - * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the thread pool, and waiting for all blocks to finish executing. The user supplies a loop function, which will be called once per block and should iterate over the block's range. - * - * @tparam T1 The type of the first index in the loop. Should be a signed or unsigned integer. - * @tparam T2 The type of the index after the last index in the loop. Should be a signed or unsigned integer. If T1 is not the same as T2, a common type will be automatically inferred. - * @tparam F The type of the function to loop through. - * @param first_index The first index in the loop. - * @param index_after_last The index after the last index in the loop. The loop will iterate from first_index to (index_after_last - 1) inclusive. In other words, it will be equivalent to "for (T i = first_index; i < index_after_last; i++)". Note that if first_index == index_after_last, the function will terminate without doing anything. - * @param loop The function to loop through. Will be called once per block. Should take exactly two arguments: the first index in the block and the index after the last index in the block. loop(start, end) should typically involve a loop of the form "for (T i = start; i < end; i++)". - * @param num_blocks The maximum number of blocks to split the loop into. The default is to use the number of threads in the pool. 
- */ - template - void parallelize_loop(const T1 &first_index, const T2 &index_after_last, const F &loop, ui32 num_blocks = 0) - { - typedef std::common_type_t T; - T the_first_index = (T)first_index; - T last_index = (T)index_after_last; - if (the_first_index == last_index) - return; - if (last_index < the_first_index) - { - T temp = last_index; - last_index = the_first_index; - the_first_index = temp; - } - last_index--; - if (num_blocks == 0) - num_blocks = thread_count; - ui64 total_size = (ui64)(last_index - the_first_index + 1); - ui64 block_size = (ui64)(total_size / num_blocks); - if (block_size == 0) - { - block_size = 1; - num_blocks = (ui32)total_size > 1 ? (ui32)total_size : 1; - } - std::atomic blocks_running = 0; - for (ui32 t = 0; t < num_blocks; t++) - { - T start = ((T)(t * block_size) + the_first_index); - T end = (t == num_blocks - 1) ? last_index + 1 : ((T)((t + 1) * block_size) + the_first_index); - blocks_running++; - push_task([start, end, &loop, &blocks_running] - { - loop(start, end); - blocks_running--; - }); - } - while (blocks_running != 0) - { - sleep_or_yield(); - } - } - - /** - * @brief Push a function with no arguments or return value into the task queue. - * - * @tparam F The type of the function. - * @param task The function to push. - */ - template - void push_task(const F &task) - { - tasks_total++; - { - const std::scoped_lock lock(queue_mutex); - tasks.push(std::function(task)); - } - } - - /** - * @brief Push a function with arguments, but no return value, into the task queue. - * @details The function is wrapped inside a lambda in order to hide the arguments, as the tasks in the queue must be of type std::function, so they cannot have any arguments or return value. If no arguments are provided, the other overload will be used, in order to avoid the (slight) overhead of using a lambda. - * - * @tparam F The type of the function. - * @tparam A The types of the arguments. - * @param task The function to push. 
- * @param args The arguments to pass to the function. - */ - template - void push_task(const F &task, const A &...args) - { - push_task([task, args...] - { task(args...); }); - } - - /** - * @brief Reset the number of threads in the pool. Waits for all currently running tasks to be completed, then destroys all threads in the pool and creates a new thread pool with the new number of threads. Any tasks that were waiting in the queue before the pool was reset will then be executed by the new threads. If the pool was paused before resetting it, the new pool will be paused as well. - * - * @param _thread_count The number of threads to use. The default value is the total number of hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this will be twice the number of CPU cores. If the argument is zero, the default value will be used instead. - */ - void reset(const ui32 &_thread_count = std::thread::hardware_concurrency()) - { - bool was_paused = paused; - paused = true; - wait_for_tasks(); - running = false; - destroy_threads(); - thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency(); - threads.reset(new std::thread[thread_count]); - paused = was_paused; - running = true; - create_threads(); - } - - /** - * @brief Submit a function with zero or more arguments and no return value into the task queue, and get an std::future that will be set to true upon completion of the task. - * - * @tparam F The type of the function. - * @tparam A The types of the zero or more arguments to pass to the function. - * @param task The function to submit. - * @param args The zero or more arguments to pass to the function. - * @return A future to be used later to check if the function has finished its execution. 
- */ - template , std::decay_t...>>>> - std::future submit(const F &task, const A &...args) - { - std::shared_ptr> task_promise(new std::promise); - std::future future = task_promise->get_future(); - push_task([task, args..., task_promise] - { - try - { - task(args...); - task_promise->set_value(true); - } - catch (...) - { - try - { - task_promise->set_exception(std::current_exception()); - } - catch (...) - { - } - } - }); - return future; - } - - /** - * @brief Submit a function with zero or more arguments and a return value into the task queue, and get a future for its eventual returned value. - * - * @tparam F The type of the function. - * @tparam A The types of the zero or more arguments to pass to the function. - * @tparam R The return type of the function. - * @param task The function to submit. - * @param args The zero or more arguments to pass to the function. - * @return A future to be used later to obtain the function's returned value, waiting for it to finish its execution if needed. - */ - template , std::decay_t...>, typename = std::enable_if_t>> - std::future submit(const F &task, const A &...args) - { - std::shared_ptr> task_promise(new std::promise); - std::future future = task_promise->get_future(); - push_task([task, args..., task_promise] - { - try - { - task_promise->set_value(task(args...)); - } - catch (...) - { - try - { - task_promise->set_exception(std::current_exception()); - } - catch (...) - { - } - } - }); - return future; - } - - /** - * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those that are currently running in the threads and those that are still waiting in the queue. However, if the variable paused is set to true, this function only waits for the currently running tasks (otherwise it would wait forever). To wait for a specific task, use submit() instead, and call the wait() member function of the generated future. 
- */ - void wait_for_tasks() - { - while (true) - { - if (!paused) - { - if (tasks_total == 0) - break; - } - else - { - if (get_tasks_running() == 0) - break; - } - sleep_or_yield(); - } - } - - // =========== - // Public data - // =========== - - /** - * @brief An atomic variable indicating to the workers to pause. When set to true, the workers temporarily stop popping new tasks out of the queue, although any tasks already executed will keep running until they are done. Set to false again to resume popping tasks. - */ - std::atomic paused = false; - - /** - * @brief The duration, in microseconds, that the worker function should sleep for when it cannot find any tasks in the queue. If set to 0, then instead of sleeping, the worker function will execute std::this_thread::yield() if there are no tasks in the queue. The default value is 1000. - */ - ui32 sleep_duration = 1000; - -private: - // ======================== - // Private member functions - // ======================== - - /** - * @brief Create the threads in the pool and assign a worker to each thread. - */ - void create_threads() - { - for (ui32 i = 0; i < thread_count; i++) - { - threads[i] = std::thread(&thread_pool::worker, this); - } - } - - /** - * @brief Destroy the threads in the pool by joining them. - */ - void destroy_threads() - { - for (ui32 i = 0; i < thread_count; i++) - { - threads[i].join(); - } - } - - /** - * @brief Try to pop a new task out of the queue. - * - * @param task A reference to the task. Will be populated with a function if the queue is not empty. - * @return true if a task was found, false if the queue is empty. - */ - bool pop_task(std::function &task) - { - const std::scoped_lock lock(queue_mutex); - if (tasks.empty()) - return false; - else - { - task = std::move(tasks.front()); - tasks.pop(); - return true; - } - } - - /** - * @brief Sleep for sleep_duration microseconds. If that variable is set to zero, yield instead. 
- * - */ - void sleep_or_yield() - { - if (sleep_duration) - std::this_thread::sleep_for(std::chrono::microseconds(sleep_duration)); - else - std::this_thread::yield(); - } - - /** - * @brief A worker function to be assigned to each thread in the pool. Continuously pops tasks out of the queue and executes them, as long as the atomic variable running is set to true. - */ - void worker() - { - while (running) - { - std::function task; - if (!paused && pop_task(task)) - { - task(); - tasks_total--; - } - else - { - sleep_or_yield(); - } - } - } - - // ============ - // Private data - // ============ - - /** - * @brief A mutex to synchronize access to the task queue by different threads. - */ - mutable std::mutex queue_mutex = {}; - - /** - * @brief An atomic variable indicating to the workers to keep running. When set to false, the workers permanently stop working. - */ - std::atomic running = true; - - /** - * @brief A queue of tasks to be executed by the threads. - */ - std::queue> tasks = {}; - - /** - * @brief The number of threads in the pool. - */ - ui32 thread_count; - - /** - * @brief A smart pointer to manage the memory allocated for the threads. - */ - std::unique_ptr threads; - - /** - * @brief An atomic variable to keep track of the total number of unfinished tasks - either still in the queue, or running in a thread. - */ - std::atomic tasks_total = 0; -}; - -// End class thread_pool // -// ============================================================================================= // - -// ============================================================================================= // -// Begin class synced_stream // - -/** - * @brief A helper class to synchronize printing to an output stream by different threads. - */ -class synced_stream -{ -public: - /** - * @brief Construct a new synced stream. - * - * @param _out_stream The output stream to print to. The default value is std::cout. 
- */ - synced_stream(std::ostream &_out_stream = std::cout) - : out_stream(_out_stream){}; - - /** - * @brief Print any number of items into the output stream. Ensures that no other threads print to this stream simultaneously, as long as they all exclusively use this synced_stream object to print. - * - * @tparam T The types of the items - * @param items The items to print. - */ - template - void print(const T &...items) - { - const std::scoped_lock lock(stream_mutex); - (out_stream << ... << items); - } - - /** - * @brief Print any number of items into the output stream, followed by a newline character. Ensures that no other threads print to this stream simultaneously, as long as they all exclusively use this synced_stream object to print. - * - * @tparam T The types of the items - * @param items The items to print. - */ - template - void println(const T &...items) - { - print(items..., '\n'); - } - -private: - /** - * @brief A mutex to synchronize printing. - */ - mutable std::mutex stream_mutex = {}; - - /** - * @brief The output stream to print to. - */ - std::ostream &out_stream; -}; - -// End class synced_stream // -// ============================================================================================= // - -// ============================================================================================= // -// Begin class timer // - -/** - * @brief A helper class to measure execution time for benchmarking purposes. - */ -class timer -{ - typedef std::int_fast64_t i64; - -public: - /** - * @brief Start (or restart) measuring time. - */ - void start() - { - start_time = std::chrono::steady_clock::now(); - } - - /** - * @brief Stop measuring time and store the elapsed time since start(). - */ - void stop() - { - elapsed_time = std::chrono::steady_clock::now() - start_time; - } - - /** - * @brief Get the number of milliseconds that have elapsed between start() and stop(). - * - * @return The number of milliseconds. 
- */ - i64 ms() const - { - return (std::chrono::duration_cast(elapsed_time)).count(); - } - -private: - /** - * @brief The time point when measuring started. - */ - std::chrono::time_point start_time = std::chrono::steady_clock::now(); - - /** - * @brief The duration that has elapsed between start() and stop(). - */ - std::chrono::duration elapsed_time = std::chrono::duration::zero(); -}; - -// End class timer // -// ============================================================================================= // diff --git a/thread_pool_test.cpp b/thread_pool_test.cpp deleted file mode 100644 index 95b0157..0000000 --- a/thread_pool_test.cpp +++ /dev/null @@ -1,1112 +0,0 @@ -// Get rid of annoying MSVC warning. -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#endif - -#include -#include -#include -#include -#include - -#include "thread_pool.hpp" - -// Define short names for commonly-used integer types. -typedef std::int_fast32_t i32; -typedef std::uint_fast32_t ui32; -typedef std::int_fast64_t i64; -typedef std::uint_fast64_t ui64; - -// Define two global synced_streams objects: one prints to std::cout and the other to a file. -synced_stream sync_cout(std::cout); -std::ofstream log_file; -synced_stream sync_file(log_file); - -// A global thread pool object. -thread_pool pool; - -// A global random_device object used to seed some random number generators. -std::random_device rd; - -// Global variables to measure how many checks succeeded and how many failed. -ui32 tests_succeeded = 0; -ui32 tests_failed = 0; - -/** - * @brief Print any number of items into both std::cout and the log file, syncing both independently. - * - * @tparam T The types of the items. - * @param items The items to print. - */ -template -void dual_print(const T &...items) -{ - sync_cout.print(items...); - sync_file.print(items...); -} - -/** - * @brief Print any number of items into both std::cout and the log file, followed by a newline character, syncing both independently. 
- * - * @tparam T The types of the items. - * @param items The items to print. - */ -template -void dual_println(const T &...items) -{ - dual_print(items..., '\n'); -} - -/** - * @brief Print a stylized header. - * - * @param text The text of the header. Will appear between two lines. - * @param symbol The symbol to use for the lines. Default is '='. - */ -void print_header(const std::string &text, const char &symbol = '=') -{ - dual_println(); - dual_println(std::string(text.length(), symbol)); - dual_println(text); - dual_println(std::string(text.length(), symbol)); -} - -/** - * @brief Get a string representing the current time. - * - * @return The string. - */ -std::string get_time() -{ - const std::time_t t = std::time(nullptr); - char time_string[32]; - std::strftime(time_string, sizeof(time_string), "%Y-%m-%d_%H.%M.%S", std::localtime(&t)); - return std::string(time_string); -} - -/** - * @brief Check if a condition is met, report the result, and count the number of successes and failures. - * - * @param condition The condition to check. - */ -void check(const bool condition) -{ - if (condition) - { - dual_println("-> PASSED!"); - tests_succeeded++; - } - else - { - dual_println("-> FAILED!"); - tests_failed++; - } -} - -/** - * @brief Store the ID of the current thread in memory. Waits for a short time to ensure it does not get evaluated by more than one thread. - * - * @param location A pointer to the location where the thread ID should be stored. - */ -void store_ID(std::thread::id *location) -{ - *location = std::this_thread::get_id(); - std::this_thread::sleep_for(std::chrono::milliseconds(10)); -} - -/** - * @brief Count the number of unique threads in the thread pool to ensure that the correct number of individual threads was created. Pushes a number of tasks equal to four times the thread count into the thread pool, and count the number of unique thread IDs returned by the tasks. 
- */ -ui32 count_unique_threads() -{ - std::vector thread_IDs(pool.get_thread_count() * 4); - for (std::thread::id &id : thread_IDs) - pool.push_task(store_ID, &id); - pool.wait_for_tasks(); - std::sort(thread_IDs.begin(), thread_IDs.end()); - ui32 unique_threads = (ui32)(std::unique(thread_IDs.begin(), thread_IDs.end()) - thread_IDs.begin()); - return unique_threads; -} - -/** - * @brief Check that the constructor works. - */ -void check_constructor() -{ - dual_println("Checking that the thread pool reports a number of threads equal to the hardware concurrency..."); - check(pool.get_thread_count() == std::thread::hardware_concurrency()); - dual_println("Checking that the manually counted number of unique thread IDs is equal to the reported number of threads..."); - check(pool.get_thread_count() == count_unique_threads()); -} - -/** - * @brief Check that reset() works. - */ -void check_reset() -{ - pool.reset(std::thread::hardware_concurrency() / 2); - dual_println("Checking that after reset() the thread pool reports a number of threads equal to half the hardware concurrency..."); - check(pool.get_thread_count() == std::thread::hardware_concurrency() / 2); - dual_println("Checking that after reset() the manually counted number of unique thread IDs is equal to the reported number of threads..."); - check(pool.get_thread_count() == count_unique_threads()); - pool.reset(std::thread::hardware_concurrency()); - dual_println("Checking that after a second reset() the thread pool reports a number of threads equal to the hardware concurrency..."); - check(pool.get_thread_count() == std::thread::hardware_concurrency()); - dual_println("Checking that after a second reset() the manually counted number of unique thread IDs is equal to the reported number of threads..."); - check(pool.get_thread_count() == count_unique_threads()); -} - -/** - * @brief Check that push_task() works. 
- */ -void check_push_task() -{ - dual_println("Checking that push_task() works for a function with no arguments or return value..."); - { - bool flag = false; - pool.push_task([&flag] - { flag = true; }); - pool.wait_for_tasks(); - check(flag); - } - dual_println("Checking that push_task() works for a function with one argument and no return value..."); - { - bool flag = false; - pool.push_task([](bool *flag) - { *flag = true; }, - &flag); - pool.wait_for_tasks(); - check(flag); - } - dual_println("Checking that push_task() works for a function with two arguments and no return value..."); - { - bool flag1 = false; - bool flag2 = false; - pool.push_task([](bool *flag1, bool *flag2) - { *flag1 = *flag2 = true; }, - &flag1, &flag2); - pool.wait_for_tasks(); - check(flag1 && flag2); - } -} - -/** - * @brief Check that submit() works. - */ -void check_submit() -{ - dual_println("Checking that submit() works for a function with no arguments or return value..."); - { - bool flag = false; - auto my_future = pool.submit([&flag] - { flag = true; }); - check(my_future.get() && flag); - } - dual_println("Checking that submit() works for a function with one argument and no return value..."); - { - bool flag = false; - auto my_future = pool.submit([](bool *flag) - { *flag = true; }, - &flag); - check(my_future.get() && flag); - } - dual_println("Checking that submit() works for a function with two arguments and no return value..."); - { - bool flag1 = false; - bool flag2 = false; - auto my_future = pool.submit([](bool *flag1, bool *flag2) - { *flag1 = *flag2 = true; }, - &flag1, &flag2); - check(my_future.get() && flag1 && flag2); - } - dual_println("Checking that submit() works for a function with no arguments and a return value..."); - { - bool flag = false; - auto my_future = pool.submit([&flag] - { - flag = true; - return 42; - }); - check(my_future.get() == 42 && flag); - } - dual_println("Checking that submit() works for a function with one argument and a return 
value..."); - { - bool flag = false; - auto my_future = pool.submit([](bool *flag) - { - *flag = true; - return 42; - }, - &flag); - check(my_future.get() == 42 && flag); - } - dual_println("Checking that submit() works for a function with two arguments and a return value..."); - { - bool flag1 = false; - bool flag2 = false; - auto my_future = pool.submit([](bool *flag1, bool *flag2) - { - *flag1 = *flag2 = true; - return 42; - }, - &flag1, &flag2); - check(my_future.get() == 42 && flag1 && flag2); - } -} - -/** - * @brief Check that wait_for_tasks() works. - */ -void check_wait_for_tasks() -{ - ui32 n = pool.get_thread_count() * 10; - std::vector> flags(n); - for (ui32 i = 0; i < n; i++) - pool.push_task([&flags, i] - { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - flags[i] = true; - }); - pool.wait_for_tasks(); - bool all_flags = true; - for (ui32 i = 0; i < n; i++) - all_flags = all_flags && flags[i]; - check(all_flags); -} - -/** - * @brief Check that parallelize_loop() works for a specific number of indices split over a specific number of tasks. - * - * @param start The first index in the loop. - * @param end The last index in the loop plus 1. - * @param num_tasks The number of tasks. - */ -void check_parallelize_loop(i32 start, i32 end, const ui32 &num_tasks) -{ - if (start == end) - end++; - dual_println("Verifying that a loop from ", start, " to ", end, " with ", num_tasks, num_tasks == 1 ? 
" task" : " tasks", " modifies all indices..."); - ui64 num_indices = (ui64)std::abs(end - start); - i32 offset = std::min(start, end); - std::vector> flags((ui64)num_indices); - pool.parallelize_loop( - start, end, [&flags, &offset](const i32 &start, const i32 &end) - { - for (i32 i = start; i < end; i++) - flags[(ui64)(i - offset)] = true; - }, - num_tasks); - bool all_flags = true; - for (ui64 i = 0; i < num_indices; i++) - all_flags = all_flags && flags[i]; - check(all_flags); -} - -/** - * @brief Check that parallelize_loop() works using several different random values for the range of indices and number of tasks. - */ -void check_parallelize_loop() -{ - std::mt19937_64 mt(rd()); - std::uniform_int_distribution index_dist((i32)pool.get_thread_count() * -100, (i32)pool.get_thread_count() * 100); - std::uniform_int_distribution task_dist(1, pool.get_thread_count()); - for (ui32 i = 0; i < 10; i++) - check_parallelize_loop(index_dist(mt), index_dist(mt), task_dist(mt)); -} - -/** - * @brief Check that sleep_duration works for a specific value. - * - * @param duration The value of sleep_duration. - */ -void check_sleep_duration(const ui32 &duration) -{ - dual_println("Submitting tasks with sleep_duration = ", duration, " microseconds..."); - pool.sleep_duration = duration; - ui32 n = pool.get_thread_count() * 100; - std::vector> flags(n); - for (ui32 i = 0; i < n; i++) - pool.push_task([&flags, i] - { flags[i] = true; }); - pool.wait_for_tasks(); - bool all_flags = true; - for (ui32 i = 0; i < n; i++) - all_flags = all_flags && flags[i]; - check(all_flags); -} - -/** - * @brief Check that sleep_duration works for several different random values. 
- */ -void check_sleep_duration() -{ - ui32 old_duration = pool.sleep_duration; - check_sleep_duration(0); - std::mt19937_64 mt(rd()); - std::uniform_int_distribution dist(1, 2000); - for (ui32 i = 0; i < 5; i++) - check_sleep_duration(dist(mt)); - dual_println("Resetting sleep_duration to the default value (", old_duration, " microseconds)."); - pool.sleep_duration = old_duration; -} - -/** - * @brief Check that task monitoring works. - */ -void check_task_monitoring() -{ - ui32 n = std::min(std::thread::hardware_concurrency(), 4); - dual_println("Resetting pool to ", n, " threads."); - pool.reset(n); - dual_println("Submitting ", n * 3, " tasks."); - std::vector> release(n * 3); - for (ui32 i = 0; i < n * 3; i++) - pool.push_task([&release, i] - { - while (!release[i]) - std::this_thread::yield(); - dual_println("Task ", i, " released."); - }); - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - dual_println("After submission, should have: ", n * 3, " tasks total, ", n, " tasks running, ", n * 2, " tasks queued..."); - check(pool.get_tasks_total() == n * 3 && pool.get_tasks_running() == n && pool.get_tasks_queued() == n * 2); - for (ui32 i = 0; i < n; i++) - release[i] = true; - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - dual_println("After releasing ", n, " tasks, should have: ", n * 2, " tasks total, ", n, " tasks running, ", n, " tasks queued..."); - for (ui32 i = n; i < n * 2; i++) - release[i] = true; - check(pool.get_tasks_total() == n * 2 && pool.get_tasks_running() == n && pool.get_tasks_queued() == n); - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - dual_println("After releasing ", n, " more tasks, should have: ", n, " tasks total, ", n, " tasks running, ", 0, " tasks queued..."); - check(pool.get_tasks_total() == n && pool.get_tasks_running() == n && pool.get_tasks_queued() == 0); - for (ui32 i = n * 2; i < n * 3; i++) - release[i] = true; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - 
dual_println("After releasing the final ", n, " tasks, should have: ", 0, " tasks total, ", 0, " tasks running, ", 0, " tasks queued..."); - check(pool.get_tasks_total() == 0 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == 0); - dual_println("Resetting pool to ", std::thread::hardware_concurrency(), " threads."); - pool.reset(std::thread::hardware_concurrency()); -} - -/** - * @brief Check that pausing works. - */ -void check_pausing() -{ - ui32 n = std::min(std::thread::hardware_concurrency(), 4); - dual_println("Resetting pool to ", n, " threads."); - pool.reset(n); - dual_println("Pausing pool."); - pool.paused = true; - dual_println("Submitting ", n * 3, " tasks, each one waiting for 200ms."); - for (ui32 i = 0; i < n * 3; i++) - pool.push_task([i] - { - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - dual_println("Task ", i, " done."); - }); - dual_println("Immediately after submission, should have: ", n * 3, " tasks total, ", 0, " tasks running, ", n * 3, " tasks queued..."); - check(pool.get_tasks_total() == n * 3 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n * 3); - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - dual_println("300ms later, should still have: ", n * 3, " tasks total, ", 0, " tasks running, ", n * 3, " tasks queued..."); - check(pool.get_tasks_total() == n * 3 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n * 3); - dual_println("Unpausing pool."); - pool.paused = false; - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - dual_println("300ms later, should have: ", n * 2, " tasks total, ", n, " tasks running, ", n, " tasks queued..."); - check(pool.get_tasks_total() == n * 2 && pool.get_tasks_running() == n && pool.get_tasks_queued() == n); - dual_println("Pausing pool and using wait_for_tasks() to wait for the running tasks."); - pool.paused = true; - pool.wait_for_tasks(); - dual_println("After waiting, should have: ", n, " tasks total, ", 0, " tasks 
running, ", n, " tasks queued..."); - check(pool.get_tasks_total() == n && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - dual_println("200ms later, should still have: ", n, " tasks total, ", 0, " tasks running, ", n, " tasks queued..."); - check(pool.get_tasks_total() == n && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == n); - dual_println("Unpausing pool and using wait_for_tasks() to wait for all tasks."); - pool.paused = false; - pool.wait_for_tasks(); - dual_println("After waiting, should have: ", 0, " tasks total, ", 0, " tasks running, ", 0, " tasks queued..."); - check(pool.get_tasks_total() == 0 && pool.get_tasks_running() == 0 && pool.get_tasks_queued() == 0); - dual_println("Resetting pool to ", std::thread::hardware_concurrency(), " threads."); - pool.reset(std::thread::hardware_concurrency()); -} - -/** - * @brief Check that exception handling work. - */ -void check_exceptions() -{ - bool caught = false; - auto my_future = pool.submit([] - { throw std::runtime_error("Exception thrown!"); }); - try - { - my_future.get(); - } - catch (const std::exception &e) - { - if (e.what() == std::string("Exception thrown!")) - caught = true; - } - check(caught); -} - -/** - * @brief A lightweight matrix class template for performance testing purposes. Not for general use; only contains the bare minimum functionality needed for the test. Based on https://github.com/bshoshany/multithreaded-matrix - * - * @tparam T The type to use for the matrix elements. - */ -template -class matrix -{ -public: - // ===================================== - // Constructors and assignment operators - // ===================================== - - /** - * @brief Construct an uninitialized matrix. - * - * @param _rows The number of rows. - * @param _cols The number of columns. 
- */ - matrix(const ui64 &_rows, const ui64 &_cols) - : rows(_rows), cols(_cols), smart_elements(new T[rows * cols]) - { - elements = smart_elements.get(); - } - - /** - * @brief Construct a new matrix by copying the elements of an existing matrix. - * - * @param m The matrix to be copied. - */ - matrix(const matrix &m) - : rows(m.rows), cols(m.cols), smart_elements(new T[rows * cols]) - { - elements = smart_elements.get(); - for (ui64 i = 0; i < rows * cols; i++) - elements[i] = m.elements[i]; - } - - /** - * @brief Construct a new matrix by moving the elements of an existing matrix. - * - * @param m The matrix to be moved. - */ - matrix(matrix &&m) - : rows(m.rows), cols(m.cols), smart_elements(std::move(m.smart_elements)) - { - elements = smart_elements.get(); - m.rows = 0; - m.cols = 0; - m.elements = nullptr; - } - - /** - * @brief Copy the elements of another matrix to this matrix. - * - * @param m The matrix to be copied. - * @return A reference to this matrix. - */ - matrix &operator=(const matrix &m) - { - rows = m.rows; - cols = m.cols; - smart_elements.reset(new T[rows * cols]); - elements = smart_elements.get(); - for (ui64 i = 0; i < rows * cols; i++) - elements[i] = m.elements[i]; - return *this; - } - - /** - * @brief Move the elements of another matrix to this matrix. - * - * @param m The matrix to be moved. - * @return A reference to this matrix. - */ - matrix &operator=(matrix &&m) - { - rows = m.rows; - cols = m.cols; - smart_elements = std::move(m.smart_elements); - elements = smart_elements.get(); - m.rows = 0; - m.cols = 0; - m.elements = nullptr; - return *this; - } - - // ==================== - // Overloaded operators - // ==================== - - /** - * @brief Read or modify a matrix element. - * - * @param row The row index (starting from zero). - * @param col The column index (starting from zero). - * @return A reference to the element. 
- */ - inline T &operator()(const ui64 &row, const ui64 &col) - { - return elements[(cols * row) + col]; - } - - /** - * @brief Read a matrix element. - * - * @param row The row index (starting from zero). - * @param col The column index (starting from zero). - * @return The value of the element. - */ - inline T operator()(const ui64 &row, const ui64 &col) const - { - return elements[(cols * row) + col]; - } - - /** - * @brief Read or modify an element of the underlying 1-dimensional array. - * - * @param i The element index (starting from zero). - * @return A reference to the element. - */ - inline T &operator[](const ui64 &i) - { - return elements[i]; - } - - /** - * @brief Read an element of the underlying 1-dimensional array. - * - * @param i The element index (starting from zero). - * @return The value of the element. - */ - inline T operator[](const ui64 &i) const - { - return elements[i]; - } - - /** - * @brief Compare this matrix to another matrix. - * - * @param m The matrix to compare to. - * @return Whether the matrices have the same elements. - */ - bool operator==(const matrix &m) const - { - bool compare_result = true; - for (ui64 i = 0; i < rows * cols; i++) - compare_result = compare_result && (elements[i] == m.elements[i]); - return compare_result; - } - - // ======================= - // Public member functions - // ======================= - - /** - * @brief Transpose a matrix. - * - * @param num_tasks The number of parallel tasks to use. If set to 0, no multithreading will be used. - * @return The transposed matrix. 
- */ - matrix transpose(const ui32 &num_tasks) const - { - matrix out(cols, rows); - if (num_tasks == 0) - { - for (ui64 i = 0; i < out.rows; i++) - for (ui64 j = 0; j < out.cols; j++) - out(i, j) = operator()(j, i); - } - else - { - pool.parallelize_loop( - 0, out.rows, [this, &out](const ui64 &start, const ui64 &end) - { - for (ui64 i = start; i < end; i++) - for (ui64 j = 0; j < out.cols; j++) - out(i, j) = operator()(j, i); - }, - num_tasks); - } - return out; - } - - // ================ - // Friend functions - // ================ - - /** - * @brief Add two matrices using the specified number of parallel tasks. - * - * @param a The first matrix to be added. - * @param b The second matrix to be added. - * @param num_tasks The number of parallel tasks to use. If set to 0, no multithreading will be used. - * @return The sum of the matrices. - */ - friend matrix add_matrices(const matrix &a, const matrix &b, const ui32 &num_tasks) - { - matrix c(a.rows, a.cols); - if (num_tasks == 0) - for (ui64 i = 0; i < a.rows * a.cols; i++) - c[i] = a[i] + b[i]; - else - pool.parallelize_loop( - 0, a.rows * a.cols, [&a, &b, &c](const ui64 &start, const ui64 &end) - { - for (ui64 i = start; i < end; i++) - c[i] = a[i] + b[i]; - }, - num_tasks); - return c; - } - - /** - * @brief Multiply two matrices using the specified number of parallel tasks. - * - * @param a The first matrix to be multiplied. - * @param b The second matrix to be multiplied. - * @param num_tasks The number of parallel tasks to use. If set to 0, no multithreading will be used. - * @return The product of the matrices. 
- */ - friend matrix multiply_matrices(const matrix &a, const matrix &b, const ui32 &num_tasks) - { - matrix c(a.rows, b.cols); - if (num_tasks == 0) - { - for (ui64 i = 0; i < a.rows; i++) - for (ui64 j = 0; j < b.cols; j++) - { - c(i, j) = 0; - for (ui64 k = 0; k < a.cols; k++) - c(i, j) += a(i, k) * b(k, j); - } - } - else - { - pool.parallelize_loop( - 0, a.rows, [&a, &b, &c, &a_cols = a.cols, &b_cols = b.cols](const ui64 &start, const ui64 &end) - { - for (ui64 i = start; i < end; i++) - for (ui64 j = 0; j < b_cols; j++) - { - c(i, j) = 0; - for (ui64 k = 0; k < a_cols; k++) - c(i, j) += a(i, k) * b(k, j); - } - }, - num_tasks); - } - return c; - } - -private: - // ======================== - // Private member variables - // ======================== - - /** - * @brief The number of rows. - */ - ui64 rows = 0; - - /** - * @brief The number of columns. - */ - ui64 cols = 0; - - /** - * @brief A pointer to an array storing the elements of the matrix in flattened 1-dimensional form. - */ - T *elements = nullptr; - - /** - * @brief A smart pointer to manage the memory allocated for the matrix elements. - */ - std::unique_ptr smart_elements; -}; - -/** - * @brief A class template for generating random matrices. - * - * @tparam T The type to use for the matrix elements. - * @tparam D The distribution to use, e.g. std::uniform_real_distribution. - */ -template -class random_matrix_generator -{ -public: - // ============ - // Constructors - // ============ - - /** - * @brief Construct a new random matrix generator. - * - * @tparam P The types of the parameters to pass to the constructor of the distribution. - * @param params The parameters to pass to the constructor of the distribution. The number of parameters and their types depends on the particular distribution being used. 
- */ - template - random_matrix_generator(const P &...params) : dist(params...), rd() {} - - // ======================= - // Public member functions - // ======================= - - /** - * @brief Generate a random matrix with the given number of rows and columns. - * - * @param rows The desired number of rows in the matrix. - * @param cols The desired number of columns in the matrix. - * @param num_tasks The number of parallel tasks to use. If set to 0, no multithreading will be used. - * @return The random matrix. - */ - matrix generate_matrix(const ui64 &rows, const ui64 &cols, const ui32 &num_tasks) - { - matrix m(rows, cols); - if (num_tasks == 0) - { - std::mt19937_64 mt(generate_seed()); - for (ui64 i = 0; i < rows * cols; i++) - m[i] = dist(mt); - } - else - pool.parallelize_loop( - 0, rows * cols, [this, &m](const ui64 &start, const ui64 &end) - { - std::mt19937_64 mt(generate_seed()); - for (ui64 i = start; i < end; i++) - m[i] = dist(mt); - }, - num_tasks); - return m; - } - -private: - // ======================== - // Private member functions - // ======================== - - /** - * @brief Generate a seed. The std::mt19937_64 in each block will be seeded using this function in order to avoid depleting the entropy of the random_device. - * - * @return A random unsigned 64-bit integer. - */ - ui64 generate_seed() - { - static std::mt19937_64 mt(rd()); - return mt(); - } - - // ======================== - // Private member variables - // ======================== - - /** - * @brief The distribution to use for generating random numbers. - */ - D dist; - - /** - * @brief The random device to be used for seeding the pseudo-random number generators. - */ - std::random_device rd; -}; - -/** - * @brief Check the matrix class template by comparing the results of adding, multiplying, and transposing matrices calculated in two ways: single-threaded and multithreaded. 
- */ -void check_matrix() -{ - // Initialize a random_matrix_generator object to generates matrices with integers uniformly distributed between -1000 and 1000. - random_matrix_generator> rnd(-1000, 1000); - // Define the size of the matrices to use. - const ui32 thread_count = pool.get_thread_count(); - const ui64 rows = thread_count * 10; - const ui64 cols = rows; - const ui64 total_size = rows * cols; - dual_println("Using matrices of size ", rows, "x", cols, " with a total of ", total_size, " elements."); - - matrix A = rnd.generate_matrix(rows, cols, thread_count); - matrix B = rnd.generate_matrix(rows, cols, thread_count); - - dual_println("Adding two matrices (single-threaded)..."); - matrix ApB_single = add_matrices(A, B, 0); - dual_println("Adding two matrices (multithreaded)..."); - matrix ApB_multi = add_matrices(A, B, thread_count); - dual_println("Comparing the results..."); - check(ApB_single == ApB_multi); - - dual_println("Transposing a matrix (single-threaded)..."); - matrix At_single = A.transpose(0); - dual_println("Transposing a matrix (multithreaded)..."); - matrix At_multi = A.transpose(thread_count); - dual_println("Comparing the results..."); - check(At_single == At_multi); - - dual_println("Multiplying two matrices (single-threaded)..."); - matrix AxB_single = multiply_matrices(A, B, 0); - dual_println("Multiplying two matrices (multithreaded)..."); - matrix AxB_multi = multiply_matrices(A, B, thread_count); - dual_println("Comparing the results..."); - check(AxB_single == AxB_multi); -} - -/** - * @brief Print the timing of a specific test. - * - * @param num_tasks The number of tasks. - * @param mean_sd std::pair containing the mean as the first member and standard deviation as the second member. 
- */ -void print_timing(const ui32 &num_tasks, const std::pair &mean_sd) -{ - if (num_tasks == 1) - dual_print("With 1 task"); - else - dual_print("With ", std::setw(3), num_tasks, " tasks"); - dual_println(", mean execution time was ", std::setw(6), mean_sd.first, " ms with standard deviation ", std::setw(4), mean_sd.second, " ms."); -} - -/** - * @brief Calculate and print the speedup obtained by multithreading. - * - * @param timings A vector of the timings corresponding to different numbers of tasks. - * @return The maximum speedup obtained. - */ -double print_speedup(const std::vector &timings) -{ - const auto [min_time, max_time] = std::minmax_element(std::begin(timings), std::end(timings)); - double max_speedup = *max_time / *min_time; - dual_println("Maximum speedup obtained: ", max_speedup, "x."); - return max_speedup; -} - -/** - * @brief Calculate the mean and standard deviation of a set of integers. - * - * @param timings The integers. - * @return std::pair containing the mean as the first member and standard deviation as the second member. - */ -std::pair analyze(const std::vector &timings) -{ - double mean = 0; - for (size_t i = 0; i < timings.size(); i++) - mean += (double)timings[i] / (double)timings.size(); - double variance = 0; - for (size_t i = 0; i < timings.size(); i++) - variance += ((double)timings[i] - mean) * ((double)timings[i] - mean) / (double)timings.size(); - double sd = std::sqrt(variance); - return std::pair(mean, sd); -} - -/** - * @brief Perform a performance test using some matrix operations. - */ -void check_performance() -{ - // Set the formatting of floating point numbers. - dual_print(std::fixed, std::setprecision(1)); - - // Initialize a random_matrix_generator object to generates matrices with real (floating-point) numbers uniformly distributed between -1000 and 1000. - random_matrix_generator> rnd(-1000, 1000); - - // Initialize a timer object to measure the execution time of various operations. 
- timer tmr; - - // If the CPU has more than 8 threads, we leave 2 threads for the rest of the operating system. Otherwise, performance may suffer. - const ui32 thread_count = pool.get_thread_count() <= 8 ? pool.get_thread_count() : pool.get_thread_count() - 2; - dual_println("Using ", thread_count, " out of ", pool.get_thread_count(), " threads."); - - // Define the size of the matrices to use. - const ui64 rows = thread_count * 200; - const ui64 cols = rows; - - // The number of tasks to try for each operation. - const ui32 try_tasks[] = {1, thread_count / 4, thread_count / 2, thread_count, thread_count * 2, thread_count * 4}; - - // Generate two random test matrices to be used for benchmarking addition, transposition, and random matrix generation. - matrix A = rnd.generate_matrix(rows, cols, thread_count); - matrix B = rnd.generate_matrix(rows, cols, thread_count); - - // Generate two random test matrices to be used for benchmarking multiplication. Since matrix multiplication is O(n^3), we reduce the size of the test matrices so that this operation completes within a reasonable time. - constexpr ui64 mult_factor = 8; - matrix X = rnd.generate_matrix(rows / mult_factor, cols / mult_factor, thread_count); - matrix Y = rnd.generate_matrix(cols / mult_factor, rows / mult_factor, thread_count); - - // Determine the optimal sleep duration for this system. 
- dual_print("Determining the optimal sleep duration..."); - i64 optimal_ms = 0; - ui64 optimal_sleep = 0; - for (ui64 sleep = 0; sleep <= 2000; sleep += 100) - { - dual_print("."); - pool.sleep_duration = (ui32)sleep; - tmr.start(); - matrix C = add_matrices(A, B, thread_count); - matrix D = A.transpose(thread_count); - matrix E = multiply_matrices(X, Y, thread_count); - matrix F = rnd.generate_matrix(rows, cols, thread_count); - tmr.stop(); - if (tmr.ms() < optimal_ms || optimal_ms == 0) - { - optimal_ms = tmr.ms(); - optimal_sleep = sleep; - } - } - if (optimal_sleep == 0) - dual_println("\nResult: Using std::this_thread::yield() instead of std::this_thread::sleep_for() is optimal."); - else - dual_println("\nResult: The optimal sleep duration is ", optimal_sleep, " microseconds."); - pool.sleep_duration = (ui32)optimal_sleep; - - // Vectors to store statistics. - std::vector different_n_timings; - std::vector same_n_timings; - std::vector speedups; - - // How many times to run each test. 
- constexpr ui32 repeat = 20; - - dual_println("\nAdding two ", rows, "x", cols, " matrices ", repeat, " times:"); - for (ui32 n : try_tasks) - { - for (ui32 i = 0; i < repeat; i++) - { - tmr.start(); - matrix C = add_matrices(A, B, n); - tmr.stop(); - same_n_timings.push_back(tmr.ms()); - } - auto mean_sd = analyze(same_n_timings); - print_timing(n, mean_sd); - different_n_timings.push_back(mean_sd.first); - same_n_timings.clear(); - } - speedups.push_back(print_speedup(different_n_timings)); - different_n_timings.clear(); - - dual_println("\nTransposing one ", rows, "x", cols, " matrix ", repeat, " times:"); - for (ui32 n : try_tasks) - { - for (ui32 i = 0; i < repeat; i++) - { - tmr.start(); - matrix C = A.transpose(n); - tmr.stop(); - same_n_timings.push_back(tmr.ms()); - } - auto mean_sd = analyze(same_n_timings); - print_timing(n, mean_sd); - different_n_timings.push_back(mean_sd.first); - same_n_timings.clear(); - } - speedups.push_back(print_speedup(different_n_timings)); - different_n_timings.clear(); - - dual_println("\nMultiplying two ", rows / mult_factor, "x", cols / mult_factor, " matrices ", repeat, " times:"); - for (ui32 n : try_tasks) - { - for (ui32 i = 0; i < repeat; i++) - { - tmr.start(); - matrix C = multiply_matrices(X, Y, n); - tmr.stop(); - same_n_timings.push_back(tmr.ms()); - } - auto mean_sd = analyze(same_n_timings); - print_timing(n, mean_sd); - different_n_timings.push_back(mean_sd.first); - same_n_timings.clear(); - } - speedups.push_back(print_speedup(different_n_timings)); - different_n_timings.clear(); - - dual_println("\nGenerating random ", rows, "x", cols, " matrix ", repeat, " times:"); - for (ui32 n : try_tasks) - { - for (ui32 i = 0; i < repeat; i++) - { - tmr.start(); - matrix C = rnd.generate_matrix(rows, cols, n); - tmr.stop(); - same_n_timings.push_back(tmr.ms()); - } - auto mean_sd = analyze(same_n_timings); - print_timing(n, mean_sd); - different_n_timings.push_back(mean_sd.first); - same_n_timings.clear(); - } - 
speedups.push_back(print_speedup(different_n_timings)); - - const double max_speedup = *std::max_element(std::begin(speedups), std::end(speedups)); - dual_println("\nOverall, multithreading provided speedups of up to ", max_speedup, "x."); -} - -int main() -{ - std::string log_filename = "thread_pool_test-" + get_time() + ".log"; - log_file.open(log_filename); - - dual_println("A C++17 Thread Pool for High-Performance Scientific Computing"); - dual_println("(c) 2021 Barak Shoshany (baraksh@gmail.com) (http://baraksh.com)"); - dual_println("GitHub: https://github.com/bshoshany/thread-pool\n"); - - dual_println("Thread pool library version is ", THREAD_POOL_VERSION, "."); - dual_println("Hardware concurrency is ", std::thread::hardware_concurrency(), "."); - dual_println("Generating log file: ", log_filename, ".\n"); - - dual_println("Important: Please do not run any other applications, especially multithreaded applications, in parallel with this test!"); - - print_header("Checking that the constructor works:"); - check_constructor(); - - print_header("Checking that reset() works:"); - check_reset(); - - print_header("Checking that push_task() works:"); - check_push_task(); - - print_header("Checking that submit() works:"); - check_submit(); - - print_header("Checking that wait_for_tasks() works..."); - check_wait_for_tasks(); - - print_header("Checking that parallelize_loop() works:"); - check_parallelize_loop(); - - print_header("Checking that different values of sleep_duration work:"); - check_sleep_duration(); - - print_header("Checking that task monitoring works:"); - check_task_monitoring(); - - print_header("Checking that pausing works:"); - check_pausing(); - - print_header("Checking that exception handling works:"); - check_exceptions(); - - print_header("Testing that matrix operations produce the expected results:"); - check_matrix(); - - if (tests_failed == 0) - { - print_header("SUCCESS: Passed all " + std::to_string(tests_succeeded) + " checks!", '+'); - 
print_header("Performing matrix performance test:"); - check_performance(); - print_header("Thread pool performance test completed!", '+'); - } - else - { - print_header("FAILURE: Passed " + std::to_string(tests_succeeded) + " checks, but failed " + std::to_string(tests_failed) + "!", '+'); - dual_println("\nPlease submit a bug report at https://github.com/bshoshany/thread-pool/issues including the exact specifications of your system (OS, CPU, compiler, etc.) and the generated log file."); - } - - return 0; -}