From 9f6dfcafeaea161cc8124bd9e2cc8e251c3ef0d7 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 14:04:16 -0500
Subject: [PATCH 01/25] Initial hardcoded version

---
 .../include/hpx/async_cuda/cuda_event.hpp      | 16 +++++++++++-----
 .../include/hpx/async_cuda/cuda_future.hpp     | 18 +++++++++---------
 .../async_cuda/detail/cuda_event_callback.hpp  |  2 +-
 .../async_cuda/src/cuda_event_callback.cpp     | 17 +++++++++++------
 libs/core/async_cuda/src/cuda_future.cpp       |  4 ++--
 libs/core/async_cuda/src/cuda_target.cpp       |  2 +-
 6 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index f2ae7ef59f5e..859caf245a9f 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -21,25 +21,30 @@ namespace hpx { namespace cuda { namespace experimental {
     {
         static constexpr int initial_events_in_pool = 128;
 
-        static cuda_event_pool& get_event_pool()
+        const int device_id;
+
+        static cuda_event_pool& get_event_pool(size_t device_id)
         {
-            static cuda_event_pool event_pool_;
-            return event_pool_;
+            static std::array<cuda_event_pool, 4> event_pool_{0, 1, 2, 3};
+            return event_pool_[device_id];
         }
 
         // create a bunch of events on initialization
-        cuda_event_pool()
-          : free_list_(initial_events_in_pool)
+        cuda_event_pool(int device_id)
+          : device_id(device_id), free_list_(initial_events_in_pool)
         {
+            check_cuda_error(cudaSetDevice(device_id));
             for (int i = 0; i < initial_events_in_pool; ++i)
             {
                 add_event_to_pool();
             }
+            std::cerr << "Created " << device_id << std::endl;
         }
 
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
+            check_cuda_error(cudaSetDevice(device_id));
             cudaEvent_t event;
             bool ok = true;
             while (ok)
@@ -68,6 +73,7 @@ namespace hpx { namespace cuda { namespace experimental {
     private:
         void add_event_to_pool()
         {
+            check_cuda_error(cudaSetDevice(device_id));
             cudaEvent_t event;
             // Create an cuda_event to query a CUDA/CUBLAS kernel for completion.
             // Timing is disabled for performance. [1]
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 042b8199dbeb..9084166ddc54 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -81,7 +81,7 @@ namespace hpx { namespace cuda { namespace experimental {
             future_data() {}
 
             future_data(init_no_addref no_addref, other_allocator const& alloc,
-                cudaStream_t stream)
+                cudaStream_t stream, int device)
               : lcos::detail::future_data_allocator<void, Allocator,
                     future_data>(no_addref, alloc)
             {
@@ -104,7 +104,7 @@ namespace hpx { namespace cuda { namespace experimental {
                                     status)));
                         }
                     },
-                    stream);
+                    stream, device);
             }
         };
 
@@ -128,7 +128,7 @@ namespace hpx { namespace cuda { namespace experimental {
             }
 
             future_data(init_no_addref no_addref, other_allocator const& alloc,
-                cudaStream_t stream)
+                cudaStream_t stream, int device)
               : lcos::detail::future_data_allocator<void, Allocator,
                     future_data>(no_addref, alloc)
               , rt_(hpx::get_runtime_ptr())
@@ -183,7 +183,7 @@ namespace hpx { namespace cuda { namespace experimental {
         // main API call to get a future from a stream using allocator, and the
         // specified mode
         template <typename Allocator, typename Mode>
-        hpx::future<void> get_future(Allocator const& a, cudaStream_t stream)
+        hpx::future<void> get_future(Allocator const& a, cudaStream_t stream, int device = 0)
         {
             using shared_state = future_data<Allocator, Mode>;
 
@@ -200,7 +200,7 @@ namespace hpx { namespace cuda { namespace experimental {
             unique_ptr p(traits::allocate(alloc, 1),
                 hpx::util::allocator_deleter<other_allocator>{alloc});
 
-            traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream);
+            traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream, device);
 
             return hpx::traits::future_access<future<void>>::create(
                 p.release(), false);
@@ -212,16 +212,16 @@ namespace hpx { namespace cuda { namespace experimental {
         hpx::future<void> get_future_with_callback(
             Allocator const& a, cudaStream_t stream)
         {
-            return get_future<Allocator, callback_mode>(a, stream);
+            return get_future<Allocator, callback_mode>(a, stream, 0);
         }
 
         // -------------------------------------------------------------
         // main API call to get a future from a stream using allocator
         template <typename Allocator>
         hpx::future<void> get_future_with_event(
-            Allocator const& a, cudaStream_t stream)
+            Allocator const& a, cudaStream_t stream, int device)
         {
-            return get_future<Allocator, event_mode>(a, stream);
+            return get_future<Allocator, event_mode>(a, stream, device);
         }
 
         // -------------------------------------------------------------
@@ -231,7 +231,7 @@ namespace hpx { namespace cuda { namespace experimental {
 
         // -------------------------------------------------------------
         // non allocator version of : get future with an event set
-        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t);
+        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t, int);
     }    // namespace detail
 }}}      // namespace hpx::cuda::experimental
 
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
index fde4361569d5..5c5592dbaded 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
@@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
         hpx::move_only_function<void(cudaError_t)>;
 
     HPX_CORE_EXPORT void add_event_callback(
-        event_callback_function_type&& f, cudaStream_t stream);
+        event_callback_function_type&& f, cudaStream_t stream, int device);
 
     HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool);
     HPX_CORE_EXPORT void unregister_polling(
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index 0d53bd4af3f7..56b5cd6884b7 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -50,6 +50,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
     {
         cudaEvent_t event;
         event_callback_function_type f;
+        int device;
     };
 
     using event_callback_queue_type =
@@ -104,17 +105,18 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
     }
 
     void add_event_callback(
-        event_callback_function_type&& f, cudaStream_t stream)
+        event_callback_function_type&& f, cudaStream_t stream, int device)
     {
         cudaEvent_t event;
-        if (!cuda_event_pool::get_event_pool().pop(event))
+        if (!cuda_event_pool::get_event_pool(device).pop(event))
         {
             HPX_THROW_EXCEPTION(hpx::error::invalid_status,
                 "add_event_callback", "could not get an event");
         }
+        check_cuda_error(cudaSetDevice(device));
         check_cuda_error(cudaEventRecord(event, stream));
 
-        detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)});
+        detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f), device});
     }
 
     // Background progress function for async CUDA operations. Checks for completed
@@ -155,9 +157,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                 debug::dec<3>(get_number_of_active_events()));
         }
 
-        // Grab the handle to the event pool so we can return completed events
-        cuda_event_pool& pool =
-            hpx::cuda::experimental::cuda_event_pool::get_event_pool();
 
         // Iterate over our list of events and see if any have completed
         event_callback_vector.erase(
@@ -177,6 +176,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                         "active events",
                         debug::dec<3>(get_number_of_active_events()));
                     continuation.f(status);
+                    // Grab the handle to the event pool so we can return completed events
+                    cuda_event_pool& pool = hpx::cuda::experimental::
+                        cuda_event_pool::get_event_pool(continuation.device);
                     pool.push(HPX_MOVE(continuation.event));
                     return true;
                 }),
@@ -199,6 +201,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                     "active events",
                     debug::dec<3>(get_number_of_active_events()));
                 continuation.f(status);
+                // Grab the handle to the event pool so we can return completed events
+                cuda_event_pool& pool =
+                    hpx::cuda::experimental::cuda_event_pool::get_event_pool(continuation.device);
                 pool.push(HPX_MOVE(continuation.event));
             }
         }
diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp
index 6b1dfbbaa436..fdf95bf8e96f 100644
--- a/libs/core/async_cuda/src/cuda_future.cpp
+++ b/libs/core/async_cuda/src/cuda_future.cpp
@@ -18,8 +18,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
             hpx::util::internal_allocator<>{}, stream);
     }
 
-    hpx::future<void> get_future_with_event(cudaStream_t stream)
+    hpx::future<void> get_future_with_event(cudaStream_t stream, int device)
     {
-        return get_future_with_event(hpx::util::internal_allocator<>{}, stream);
+        return get_future_with_event(hpx::util::internal_allocator<>{}, stream, device);
     }
 }}}}    // namespace hpx::cuda::experimental::detail
diff --git a/libs/core/async_cuda/src/cuda_target.cpp b/libs/core/async_cuda/src/cuda_target.cpp
index 6906308ebc62..29377b216b55 100644
--- a/libs/core/async_cuda/src/cuda_target.cpp
+++ b/libs/core/async_cuda/src/cuda_target.cpp
@@ -187,7 +187,7 @@ namespace hpx { namespace cuda { namespace experimental {
 
     hpx::future<void> target::get_future_with_event() const
     {
-        return detail::get_future_with_event(handle_.get_stream());
+        return detail::get_future_with_event(handle_.get_stream(), handle_.get_device());
     }
 
     hpx::future<void> target::get_future_with_callback() const

From e91c81a3b2b19df655729c7cb9c4788ffde27abb Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 14:57:15 -0500
Subject: [PATCH 02/25] Detect number of GPUs

---
 .../include/hpx/async_cuda/cuda_event.hpp     | 75 +++++++++++--------
 .../async_cuda/src/cuda_event_callback.cpp    | 10 +--
 2 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 859caf245a9f..a2faccd5c873 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include <deque>
+
 #include <hpx/async_cuda/cuda_exception.hpp>
 #include <hpx/async_cuda/custom_gpu_api.hpp>
 #include <hpx/concurrency/stack.hpp>
@@ -21,59 +23,71 @@ namespace hpx { namespace cuda { namespace experimental {
     {
         static constexpr int initial_events_in_pool = 128;
 
-        const int device_id;
-
-        static cuda_event_pool& get_event_pool(size_t device_id)
+        static cuda_event_pool& get_event_pool()
         {
-            static std::array<cuda_event_pool, 4> event_pool_{0, 1, 2, 3};
-            return event_pool_[device_id];
+            static cuda_event_pool event_pool_;
+            return event_pool_;
         }
 
         // create a bunch of events on initialization
-        cuda_event_pool(int device_id)
-          : device_id(device_id), free_list_(initial_events_in_pool)
+        cuda_event_pool()
+          : max_number_devices_(0)
         {
-            check_cuda_error(cudaSetDevice(device_id));
-            for (int i = 0; i < initial_events_in_pool; ++i)
-            {
-                add_event_to_pool();
+            check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
+            HPX_ASSERT_MSG(max_number_devices_ > 0,
+                "CUDA polling enabled and called, yet no CUDA device found!");
+            for (int device = 0; device < max_number_devices_; device++) {
+              check_cuda_error(cudaSetDevice(device));
+              free_lists_.emplace_back(initial_events_in_pool);
+              for (int i = 0; i < initial_events_in_pool; ++i)
+              {
+                  add_event_to_pool(device);
+              }
+              std::cerr << "Created " << device << std::endl;
             }
-            std::cerr << "Created " << device_id << std::endl;
         }
 
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
-            check_cuda_error(cudaSetDevice(device_id));
-            cudaEvent_t event;
-            bool ok = true;
-            while (ok)
-            {
-                ok = free_list_.pop(event);
-                if (ok)
-                    check_cuda_error(cudaEventDestroy(event));
+            HPX_ASSERT_MSG(free_lists_.size != max_number_devices_,
+                "Number of CUDA event pools does not match the number of devices!");
+            for (int device = 0; device < max_number_devices_; device++) {
+              check_cuda_error(cudaSetDevice(device));
+              cudaEvent_t event;
+              bool ok = true;
+              while (ok)
+              {
+                  ok = free_lists_[device].pop(event);
+                  if (ok)
+                      check_cuda_error(cudaEventDestroy(event));
+              }
             }
         }
 
-        inline bool pop(cudaEvent_t& event)
+        inline bool pop(cudaEvent_t& event, int device)
         {
+            HPX_ASSERT_MSG(device > 0 && device < max_number_devices_,
+                "Accessing CUDA event pool with invalid device ID!");
             // pop an event off the pool, if that fails, create a new one
-            while (!free_list_.pop(event))
+            while (!free_lists_[device].pop(event))
             {
-                add_event_to_pool();
+                add_event_to_pool(device);
             }
             return true;
         }
 
-        inline bool push(cudaEvent_t event)
+        inline bool push(cudaEvent_t event, int device)
         {
-            return free_list_.push(event);
+            HPX_ASSERT_MSG(device > 0 && device < max_number_devices_,
+                "Accessing CUDA event pool with invalid device ID!");
+            return free_lists_[device].push(event);
         }
 
     private:
-        void add_event_to_pool()
+        void add_event_to_pool(int device)
         {
-            check_cuda_error(cudaSetDevice(device_id));
+            check_cuda_error(cudaSetDevice(device));
             cudaEvent_t event;
             // Create an cuda_event to query a CUDA/CUBLAS kernel for completion.
             // Timing is disabled for performance. [1]
@@ -81,10 +95,11 @@ namespace hpx { namespace cuda { namespace experimental {
             // [1]: CUDA Runtime API, section 5.5 cuda_event Management
             check_cuda_error(
                 cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-            free_list_.push(event);
+            free_lists_[device].push(event);
         }
+        int max_number_devices_;
 
-        // pool is dynamically sized and can grow if needed
-        hpx::lockfree::stack<cudaEvent_t> free_list_;
+        // One pool per GPU - each pool is dynamically sized and can grow if needed
+        std::deque<hpx::lockfree::stack<cudaEvent_t>> free_lists_;
     };
 }}}    // namespace hpx::cuda::experimental
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index 56b5cd6884b7..b36b5d4940d2 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -108,7 +108,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
         event_callback_function_type&& f, cudaStream_t stream, int device)
     {
         cudaEvent_t event;
-        if (!cuda_event_pool::get_event_pool(device).pop(event))
+        if (!cuda_event_pool::get_event_pool().pop(event, device))
         {
             HPX_THROW_EXCEPTION(hpx::error::invalid_status,
                 "add_event_callback", "could not get an event");
@@ -178,8 +178,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                     continuation.f(status);
                     // Grab the handle to the event pool so we can return completed events
                     cuda_event_pool& pool = hpx::cuda::experimental::
-                        cuda_event_pool::get_event_pool(continuation.device);
-                    pool.push(HPX_MOVE(continuation.event));
+                        cuda_event_pool::get_event_pool();
+                    pool.push(HPX_MOVE(continuation.event), continuation.device);
                     return true;
                 }),
             event_callback_vector.end());
@@ -203,8 +203,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                 continuation.f(status);
                 // Grab the handle to the event pool so we can return completed events
                 cuda_event_pool& pool =
-                    hpx::cuda::experimental::cuda_event_pool::get_event_pool(continuation.device);
-                pool.push(HPX_MOVE(continuation.event));
+                    hpx::cuda::experimental::cuda_event_pool::get_event_pool();
+                pool.push(HPX_MOVE(continuation.event), continuation.device);
             }
         }
 

From 25494a207169c46292cce1fdc4c06c5732fa92e6 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 15:22:04 -0500
Subject: [PATCH 03/25] Cleanup

---
 .../async_cuda/include/hpx/async_cuda/cuda_event.hpp     | 1 -
 libs/core/async_cuda/src/cuda_event_callback.cpp         | 9 +++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index a2faccd5c873..66659371fb4f 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -43,7 +43,6 @@ namespace hpx { namespace cuda { namespace experimental {
               {
                   add_event_to_pool(device);
               }
-              std::cerr << "Created " << device << std::endl;
             }
         }
 
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index b36b5d4940d2..b1439ff338bd 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -157,6 +157,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                 debug::dec<3>(get_number_of_active_events()));
         }
 
+        // Grab the handle to the event pool so we can return completed events
+        cuda_event_pool& pool =
+            hpx::cuda::experimental::cuda_event_pool::get_event_pool();
 
         // Iterate over our list of events and see if any have completed
         event_callback_vector.erase(
@@ -176,9 +179,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                         "active events",
                         debug::dec<3>(get_number_of_active_events()));
                     continuation.f(status);
-                    // Grab the handle to the event pool so we can return completed events
-                    cuda_event_pool& pool = hpx::cuda::experimental::
-                        cuda_event_pool::get_event_pool();
                     pool.push(HPX_MOVE(continuation.event), continuation.device);
                     return true;
                 }),
@@ -201,9 +201,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                     "active events",
                     debug::dec<3>(get_number_of_active_events()));
                 continuation.f(status);
-                // Grab the handle to the event pool so we can return completed events
-                cuda_event_pool& pool =
-                    hpx::cuda::experimental::cuda_event_pool::get_event_pool();
                 pool.push(HPX_MOVE(continuation.event), continuation.device);
             }
         }

From 64ecdfc791a3dfd44a8eecfcdfb9d164a2a2f322 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 15:56:03 -0500
Subject: [PATCH 04/25] Add missing default value

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 9084166ddc54..1c4503d66b81 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -219,7 +219,7 @@ namespace hpx { namespace cuda { namespace experimental {
         // main API call to get a future from a stream using allocator
         template <typename Allocator>
         hpx::future<void> get_future_with_event(
-            Allocator const& a, cudaStream_t stream, int device)
+            Allocator const& a, cudaStream_t stream, int device = 0)
         {
             return get_future<Allocator, event_mode>(a, stream, device);
         }
@@ -231,7 +231,8 @@ namespace hpx { namespace cuda { namespace experimental {
 
         // -------------------------------------------------------------
         // non allocator version of : get future with an event set
-        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t, int);
+        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(
+            cudaStream_t stream, int device = 0);
     }    // namespace detail
 }}}      // namespace hpx::cuda::experimental
 

From 12b3e3c3ac06a9981aebb740a935e95aad555bd4 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 16:10:37 -0500
Subject: [PATCH 05/25] Fix format

---
 .../include/hpx/async_cuda/cuda_event.hpp     | 36 ++++++++++---------
 .../include/hpx/async_cuda/cuda_future.hpp    |  6 ++--
 .../async_cuda/src/cuda_event_callback.cpp    |  6 ++--
 libs/core/async_cuda/src/cuda_future.cpp      |  3 +-
 libs/core/async_cuda/src/cuda_target.cpp      |  3 +-
 5 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 66659371fb4f..cca0859f920b 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -36,13 +36,14 @@ namespace hpx { namespace cuda { namespace experimental {
             check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
             HPX_ASSERT_MSG(max_number_devices_ > 0,
                 "CUDA polling enabled and called, yet no CUDA device found!");
-            for (int device = 0; device < max_number_devices_; device++) {
-              check_cuda_error(cudaSetDevice(device));
-              free_lists_.emplace_back(initial_events_in_pool);
-              for (int i = 0; i < initial_events_in_pool; ++i)
-              {
-                  add_event_to_pool(device);
-              }
+            for (int device = 0; device < max_number_devices_; device++)
+            {
+                check_cuda_error(cudaSetDevice(device));
+                free_lists_.emplace_back(initial_events_in_pool);
+                for (int i = 0; i < initial_events_in_pool; ++i)
+                {
+                    add_event_to_pool(device);
+                }
             }
         }
 
@@ -51,16 +52,17 @@ namespace hpx { namespace cuda { namespace experimental {
         {
             HPX_ASSERT_MSG(free_lists_.size != max_number_devices_,
                 "Number of CUDA event pools does not match the number of devices!");
-            for (int device = 0; device < max_number_devices_; device++) {
-              check_cuda_error(cudaSetDevice(device));
-              cudaEvent_t event;
-              bool ok = true;
-              while (ok)
-              {
-                  ok = free_lists_[device].pop(event);
-                  if (ok)
-                      check_cuda_error(cudaEventDestroy(event));
-              }
+            for (int device = 0; device < max_number_devices_; device++)
+            {
+                check_cuda_error(cudaSetDevice(device));
+                cudaEvent_t event;
+                bool ok = true;
+                while (ok)
+                {
+                    ok = free_lists_[device].pop(event);
+                    if (ok)
+                        check_cuda_error(cudaEventDestroy(event));
+                }
             }
         }
 
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 1c4503d66b81..9633d40bf2bc 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -183,7 +183,8 @@ namespace hpx { namespace cuda { namespace experimental {
         // main API call to get a future from a stream using allocator, and the
         // specified mode
         template <typename Allocator, typename Mode>
-        hpx::future<void> get_future(Allocator const& a, cudaStream_t stream, int device = 0)
+        hpx::future<void> get_future(
+            Allocator const& a, cudaStream_t stream, int device = 0)
         {
             using shared_state = future_data<Allocator, Mode>;
 
@@ -200,7 +201,8 @@ namespace hpx { namespace cuda { namespace experimental {
             unique_ptr p(traits::allocate(alloc, 1),
                 hpx::util::allocator_deleter<other_allocator>{alloc});
 
-            traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream, device);
+            traits::construct(
+                alloc, p.get(), init_no_addref{}, alloc, stream, device);
 
             return hpx::traits::future_access<future<void>>::create(
                 p.release(), false);
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index b1439ff338bd..b472e5c55e23 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -116,7 +116,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
         check_cuda_error(cudaSetDevice(device));
         check_cuda_error(cudaEventRecord(event, stream));
 
-        detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f), device});
+        detail::add_to_event_callback_queue(
+            event_callback{event, HPX_MOVE(f), device});
     }
 
     // Background progress function for async CUDA operations. Checks for completed
@@ -179,7 +180,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                         "active events",
                         debug::dec<3>(get_number_of_active_events()));
                     continuation.f(status);
-                    pool.push(HPX_MOVE(continuation.event), continuation.device);
+                    pool.push(
+                        HPX_MOVE(continuation.event), continuation.device);
                     return true;
                 }),
             event_callback_vector.end());
diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp
index fdf95bf8e96f..c797191e1474 100644
--- a/libs/core/async_cuda/src/cuda_future.cpp
+++ b/libs/core/async_cuda/src/cuda_future.cpp
@@ -20,6 +20,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
 
     hpx::future<void> get_future_with_event(cudaStream_t stream, int device)
     {
-        return get_future_with_event(hpx::util::internal_allocator<>{}, stream, device);
+        return get_future_with_event(
+            hpx::util::internal_allocator<>{}, stream, device);
     }
 }}}}    // namespace hpx::cuda::experimental::detail
diff --git a/libs/core/async_cuda/src/cuda_target.cpp b/libs/core/async_cuda/src/cuda_target.cpp
index 29377b216b55..60ea190980f0 100644
--- a/libs/core/async_cuda/src/cuda_target.cpp
+++ b/libs/core/async_cuda/src/cuda_target.cpp
@@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental {
 
     hpx::future<void> target::get_future_with_event() const
     {
-        return detail::get_future_with_event(handle_.get_stream(), handle_.get_device());
+        return detail::get_future_with_event(
+            handle_.get_stream(), handle_.get_device());
     }
 
     hpx::future<void> target::get_future_with_callback() const

From bf85ef6cd635a5cb51e0f402050545feb46a3cfa Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 16:19:38 -0500
Subject: [PATCH 06/25] Add missing format fix

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index cca0859f920b..3f239416226a 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -51,7 +51,8 @@ namespace hpx { namespace cuda { namespace experimental {
         ~cuda_event_pool()
         {
             HPX_ASSERT_MSG(free_lists_.size != max_number_devices_,
-                "Number of CUDA event pools does not match the number of devices!");
+                "Number of CUDA event pools does not match the number of "
+                "devices!");
             for (int device = 0; device < max_number_devices_; device++)
             {
                 check_cuda_error(cudaSetDevice(device));

From 6a24f16e8acd820c2a72a8f00f052a9f9c6e9fa4 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Mon, 19 Jun 2023 18:55:41 -0500
Subject: [PATCH 07/25] Add default device arguments

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp    | 4 ++--
 .../include/hpx/async_cuda/detail/cuda_event_callback.hpp     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 3f239416226a..1a7b8256f91e 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -67,7 +67,7 @@ namespace hpx { namespace cuda { namespace experimental {
             }
         }
 
-        inline bool pop(cudaEvent_t& event, int device)
+        inline bool pop(cudaEvent_t& event, int device = 0)
         {
             HPX_ASSERT_MSG(device > 0 && device < max_number_devices_,
                 "Accessing CUDA event pool with invalid device ID!");
@@ -79,7 +79,7 @@ namespace hpx { namespace cuda { namespace experimental {
             return true;
         }
 
-        inline bool push(cudaEvent_t event, int device)
+        inline bool push(cudaEvent_t event, int device = 0)
         {
             HPX_ASSERT_MSG(device > 0 && device < max_number_devices_,
                 "Accessing CUDA event pool with invalid device ID!");
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
index 5c5592dbaded..e1e6ef49cb20 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
@@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
         hpx::move_only_function<void(cudaError_t)>;
 
     HPX_CORE_EXPORT void add_event_callback(
-        event_callback_function_type&& f, cudaStream_t stream, int device);
+        event_callback_function_type&& f, cudaStream_t stream, int device = 0);
 
     HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool);
     HPX_CORE_EXPORT void unregister_polling(

From e665a9e4a495c5c9271641c89ee0fa64edb379ed Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Tue, 20 Jun 2023 09:13:33 -0500
Subject: [PATCH 08/25] Fix asserts

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 1a7b8256f91e..ff33a067f497 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -69,7 +69,7 @@ namespace hpx { namespace cuda { namespace experimental {
 
         inline bool pop(cudaEvent_t& event, int device = 0)
         {
-            HPX_ASSERT_MSG(device > 0 && device < max_number_devices_,
+            HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
                 "Accessing CUDA event pool with invalid device ID!");
             // pop an event off the pool, if that fails, create a new one
             while (!free_lists_[device].pop(event))
@@ -81,7 +81,7 @@ namespace hpx { namespace cuda { namespace experimental {
 
         inline bool push(cudaEvent_t event, int device = 0)
         {
-            HPX_ASSERT_MSG(device > 0 && device < max_number_devices_,
+            HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
                 "Accessing CUDA event pool with invalid device ID!");
             return free_lists_[device].push(event);
         }

From 791cecd62be30d08209f6b0fd56efaa43ffa17d4 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 21 Jun 2023 13:57:21 -0500
Subject: [PATCH 09/25] Delete copy/move event pool constructors

---
 .../include/hpx/async_cuda/cuda_event.hpp     | 46 +++++++++++--------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index ff33a067f497..1783dd28721a 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -21,7 +21,7 @@ namespace hpx { namespace cuda { namespace experimental {
     // of them at startup.
     struct cuda_event_pool
     {
-        static constexpr int initial_events_in_pool = 128;
+        static constexpr std::size_t initial_events_in_pool = 128;
 
         static cuda_event_pool& get_event_pool()
         {
@@ -29,24 +29,6 @@ namespace hpx { namespace cuda { namespace experimental {
             return event_pool_;
         }
 
-        // create a bunch of events on initialization
-        cuda_event_pool()
-          : max_number_devices_(0)
-        {
-            check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
-            HPX_ASSERT_MSG(max_number_devices_ > 0,
-                "CUDA polling enabled and called, yet no CUDA device found!");
-            for (int device = 0; device < max_number_devices_; device++)
-            {
-                check_cuda_error(cudaSetDevice(device));
-                free_lists_.emplace_back(initial_events_in_pool);
-                for (int i = 0; i < initial_events_in_pool; ++i)
-                {
-                    add_event_to_pool(device);
-                }
-            }
-        }
-
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
@@ -86,7 +68,33 @@ namespace hpx { namespace cuda { namespace experimental {
             return free_lists_[device].push(event);
         }
 
+        // delete copy / move constructors
+        cuda_event_pool(cuda_event_pool&&) = delete;
+        cuda_event_pool& operator=(cuda_event_pool&&) = delete;
+        cuda_event_pool(const cuda_event_pool&) = delete;
+        cuda_event_pool& operator=(const cuda_event_pool&) = delete;
+
     private:
+        // Private singleton constructor 
+        // Creates a bunch of events on initialization
+        cuda_event_pool()
+          : max_number_devices_(0)
+        {
+            check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
+            HPX_ASSERT_MSG(max_number_devices_ > 0,
+                "CUDA polling enabled and called, yet no CUDA device found!");
+            /* free_lists_.reserve(max_number_devices_); */
+            for (int device = 0; device < max_number_devices_; device++)
+            {
+                check_cuda_error(cudaSetDevice(device));
+                free_lists_.emplace_back(initial_events_in_pool);
+                for (std::size_t i = 0; i < initial_events_in_pool; ++i)
+                {
+                    add_event_to_pool(device);
+                }
+            }
+        }
+
         void add_event_to_pool(int device)
         {
             check_cuda_error(cudaSetDevice(device));

From d93ada24a541a0de315974d97676ce0b2bd95375 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 21 Jun 2023 14:07:16 -0500
Subject: [PATCH 10/25] Fix unused parameter warning

---
 .../include/hpx/async_cuda/cuda_future.hpp    | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 9633d40bf2bc..85fff23cfdf5 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -128,7 +128,7 @@ namespace hpx { namespace cuda { namespace experimental {
             }
 
             future_data(init_no_addref no_addref, other_allocator const& alloc,
-                cudaStream_t stream, int device)
+                cudaStream_t stream)
               : lcos::detail::future_data_allocator<void, Allocator,
                     future_data>(no_addref, alloc)
               , rt_(hpx::get_runtime_ptr())
@@ -201,8 +201,19 @@ namespace hpx { namespace cuda { namespace experimental {
             unique_ptr p(traits::allocate(alloc, 1),
                 hpx::util::allocator_deleter<other_allocator>{alloc});
 
-            traits::construct(
-                alloc, p.get(), init_no_addref{}, alloc, stream, device);
+            static_assert(std::is_same_v<Mode, event_mode> ||
+                    std::is_same_v<Mode, callback_mode>,
+                "get_future mode not supported!");
+            if constexpr (std::is_same_v<Mode, event_mode>)
+            {
+                traits::construct(
+                    alloc, p.get(), init_no_addref{}, alloc, stream, device);
+            }
+            else if constexpr (std::is_same_v<Mode, callback_mode>)
+            {
+                traits::construct(
+                    alloc, p.get(), init_no_addref{}, alloc, stream);
+            }
 
             return hpx::traits::future_access<future<void>>::create(
                 p.release(), false);
@@ -214,7 +225,7 @@ namespace hpx { namespace cuda { namespace experimental {
         hpx::future<void> get_future_with_callback(
             Allocator const& a, cudaStream_t stream)
         {
-            return get_future<Allocator, callback_mode>(a, stream, 0);
+            return get_future<Allocator, callback_mode>(a, stream);
         }
 
         // -------------------------------------------------------------

From 7a5f5510abd19345440e650e4afa248979345eeb Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Thu, 22 Jun 2023 21:31:16 -0500
Subject: [PATCH 11/25] Switch to east const

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 1783dd28721a..07783c97e422 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -71,8 +71,8 @@ namespace hpx { namespace cuda { namespace experimental {
         // delete copy / move constructors
         cuda_event_pool(cuda_event_pool&&) = delete;
         cuda_event_pool& operator=(cuda_event_pool&&) = delete;
-        cuda_event_pool(const cuda_event_pool&) = delete;
-        cuda_event_pool& operator=(const cuda_event_pool&) = delete;
+        cuda_event_pool(cuda_event_pool const&) = delete;
+        cuda_event_pool& operator=(cuda_event_pool const&) = delete;
 
     private:
         // Private singleton constructor 

From b37f72a7096ca0d614a239551446453a770dd777 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Thu, 22 Jun 2023 21:31:50 -0500
Subject: [PATCH 12/25] Add -1 default parameter for device auto-detection

---
 .../include/hpx/async_cuda/cuda_future.hpp           | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 85fff23cfdf5..7b906baa42ed 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -184,7 +184,7 @@ namespace hpx { namespace cuda { namespace experimental {
         // specified mode
         template <typename Allocator, typename Mode>
         hpx::future<void> get_future(
-            Allocator const& a, cudaStream_t stream, int device = 0)
+            Allocator const& a, cudaStream_t stream, int device)
         {
             using shared_state = future_data<Allocator, Mode>;
 
@@ -225,15 +225,19 @@ namespace hpx { namespace cuda { namespace experimental {
         hpx::future<void> get_future_with_callback(
             Allocator const& a, cudaStream_t stream)
         {
-            return get_future<Allocator, callback_mode>(a, stream);
+            // device id 0 will be dropped in callback mode - can be
+            // an arbitrary number here
+            return get_future<Allocator, callback_mode>(a, stream, 0);
         }
 
         // -------------------------------------------------------------
         // main API call to get a future from a stream using allocator
         template <typename Allocator>
         hpx::future<void> get_future_with_event(
-            Allocator const& a, cudaStream_t stream, int device = 0)
+            Allocator const& a, cudaStream_t stream, int device = -1)
         {
+            if (device == -1) 
+              check_cuda_error(cudaGetDevice(&device));
             return get_future<Allocator, event_mode>(a, stream, device);
         }
 
@@ -245,7 +249,7 @@ namespace hpx { namespace cuda { namespace experimental {
         // -------------------------------------------------------------
         // non allocator version of : get future with an event set
         HPX_CORE_EXPORT hpx::future<void> get_future_with_event(
-            cudaStream_t stream, int device = 0);
+            cudaStream_t stream, int device = -1);
     }    // namespace detail
 }}}      // namespace hpx::cuda::experimental
 

From d1a81f0ea424eadd3ad732644cbe5adbbd745869 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Tue, 27 Jun 2023 11:14:46 -0500
Subject: [PATCH 13/25] Fix assert

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 07783c97e422..8b7b740a167b 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -1,6 +1,5 @@
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2020 Teodor Nikolov
-//
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -32,7 +31,7 @@ namespace hpx { namespace cuda { namespace experimental {
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
-            HPX_ASSERT_MSG(free_lists_.size != max_number_devices_,
+            HPX_ASSERT_MSG(free_lists_.size() != max_number_devices_,
                 "Number of CUDA event pools does not match the number of "
                 "devices!");
             for (int device = 0; device < max_number_devices_; device++)

From d1f59531535125d330fb0ef4e48fe279a82f9a45 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Tue, 27 Jun 2023 13:00:29 -0500
Subject: [PATCH 14/25] Restore original device after init

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 8b7b740a167b..6d249d2031e4 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -82,7 +82,8 @@ namespace hpx { namespace cuda { namespace experimental {
             check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
             HPX_ASSERT_MSG(max_number_devices_ > 0,
                 "CUDA polling enabled and called, yet no CUDA device found!");
-            /* free_lists_.reserve(max_number_devices_); */
+            int original_device = 0;
+            check_cuda_error(cudaGetDevice(&original_device));
             for (int device = 0; device < max_number_devices_; device++)
             {
                 check_cuda_error(cudaSetDevice(device));
@@ -92,6 +93,9 @@ namespace hpx { namespace cuda { namespace experimental {
                     add_event_to_pool(device);
                 }
             }
+            // Restore original device -- avoids confusion about HPX switching
+            // devices for the current thread if only one device is used.
+            check_cuda_error(cudaSetDevice(original_device));
         }
 
         void add_event_to_pool(int device)

From 9b3388e92a463efd8da772d6be036a4a4b2101b9 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Tue, 27 Jun 2023 14:54:41 -0500
Subject: [PATCH 15/25] Remove superfluous cudaSetDevice

---
 libs/core/async_cuda/src/cuda_event_callback.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index b472e5c55e23..1b0267cd068b 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -113,7 +113,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
             HPX_THROW_EXCEPTION(hpx::error::invalid_status,
                 "add_event_callback", "could not get an event");
         }
-        check_cuda_error(cudaSetDevice(device));
         check_cuda_error(cudaEventRecord(event, stream));
 
         detail::add_to_event_callback_queue(

From ee33c48def919c1cd7ffed7e9092b7ee451ccd01 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 14:18:32 -0500
Subject: [PATCH 16/25] Put event pool singleton access definition in src

---
 libs/core/async_cuda/CMakeLists.txt             |  2 +-
 .../include/hpx/async_cuda/cuda_event.hpp       | 11 +++++------
 libs/core/async_cuda/src/cuda_event.cpp         | 17 +++++++++++++++++
 3 files changed, 23 insertions(+), 7 deletions(-)
 create mode 100644 libs/core/async_cuda/src/cuda_event.cpp

diff --git a/libs/core/async_cuda/CMakeLists.txt b/libs/core/async_cuda/CMakeLists.txt
index 77782a23bc47..b2a7954877bb 100644
--- a/libs/core/async_cuda/CMakeLists.txt
+++ b/libs/core/async_cuda/CMakeLists.txt
@@ -36,7 +36,7 @@ set(async_cuda_compat_headers
 # cmake-format: on
 
 set(async_cuda_sources cuda_event_callback.cpp cuda_future.cpp cuda_target.cpp
-                       get_targets.cpp
+                       get_targets.cpp cuda_event.cpp
 )
 
 if(HPX_WITH_HIP AND TARGET roc::hipblas)
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 6d249d2031e4..c399839a7a13 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -1,5 +1,7 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2020 Teodor Nikolov
+//
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -22,16 +24,13 @@ namespace hpx { namespace cuda { namespace experimental {
     {
         static constexpr std::size_t initial_events_in_pool = 128;
 
-        static cuda_event_pool& get_event_pool()
-        {
-            static cuda_event_pool event_pool_;
-            return event_pool_;
-        }
+        HPX_CORE_EXPORT static cuda_event_pool& get_event_pool();
 
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
-            HPX_ASSERT_MSG(free_lists_.size() != max_number_devices_,
+            HPX_ASSERT_MSG(free_lists_.size() !=
+                    static_cast<std::size_t>(max_number_devices_),
                 "Number of CUDA event pools does not match the number of "
                 "devices!");
             for (int device = 0; device < max_number_devices_; device++)
diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp
new file mode 100644
index 000000000000..809cf1fa892c
--- /dev/null
+++ b/libs/core/async_cuda/src/cuda_event.cpp
@@ -0,0 +1,17 @@
+//  Copyright (c) 2023 Gregor Daiß
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <hpx/assert.hpp>
+#include <hpx/async_cuda/cuda_event.hpp>
+#include <hpx/async_cuda/custom_gpu_api.hpp>
+
+namespace hpx { namespace cuda { namespace experimental { 
+    cuda_event_pool& cuda_event_pool::get_event_pool()
+    {
+        static cuda_event_pool event_pool_;
+        return event_pool_;
+    }
+}}}    // namespace hpx::cuda::experimental

From 707c0254e57bf10b00e84739a7d80afb2db45eee Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 16:08:11 -0500
Subject: [PATCH 17/25] Add basic multi gpu polling test

---
 .../core/async_cuda/tests/unit/CMakeLists.txt |  5 +-
 .../tests/unit/cuda_multi_device_polling.cpp  | 83 +++++++++++++++++++
 2 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp

diff --git a/libs/core/async_cuda/tests/unit/CMakeLists.txt b/libs/core/async_cuda/tests/unit/CMakeLists.txt
index ce6cf5a19ee2..b14933107024 100644
--- a/libs/core/async_cuda/tests/unit/CMakeLists.txt
+++ b/libs/core/async_cuda/tests/unit/CMakeLists.txt
@@ -4,16 +4,19 @@
 # Distributed under the Boost Software License, Version 1.0. (See accompanying
 # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-set(tests cuda_future transform_stream)
+set(tests cuda_future cuda_multi_device_polling transform_stream)
 if(HPX_WITH_GPUBLAS)
   set(benchmarks ${benchmarks} cublas_matmul)
 endif()
 
 set(cublas_matmul_PARAMETERS THREADS_PER_LOCALITY 4)
 set(cuda_future_PARAMETERS THREADS_PER_LOCALITY 4)
+set(cuda_multi_device_polling_PARAMETERS THREADS_PER_LOCALITY 4)
 set(transform_stream_PARAMETERS THREADS_PER_LOCALITY 4)
 
 set(cuda_future_CUDA_SOURCE saxpy trivial_demo)
+set(cuda_multi_device_polling_CUDA_SOURCE trivial_demo)
+
 set(transform_stream_CUDA ON)
 
 foreach(test ${tests})
diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
new file mode 100644
index 000000000000..2200155055e9
--- /dev/null
+++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
@@ -0,0 +1,83 @@
+//  Copyright (c) 2023 Gregor Daiß
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <hpx/algorithm.hpp>
+#include <hpx/execution.hpp>
+#include <hpx/future.hpp>
+#include <hpx/init.hpp>
+#include <hpx/modules/async_cuda.hpp>
+#include <hpx/modules/testing.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <iostream>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+// -------------------------------------------------------------------------
+// This example is similar to the unit/cuda_future.cpp example (hence it also uses
+// the externally defined cuda_trivial_kernel. See unit/cuda_future.cpp for
+// more details regarding this.
+//
+// This example extends unit/cuda_future.cpp by testing the cuda event polling
+// on multiple devices (if available)! If the polling is not working correctly,
+// the test will time out as some of the futures are never triggered.
+
+template <typename T>
+extern void cuda_trivial_kernel(T, cudaStream_t stream);
+
+extern void launch_saxpy_kernel(
+    hpx::cuda::experimental::cuda_executor& cudaexec, unsigned int& blocks,
+    unsigned int& threads, void** args);
+
+
+// -------------------------------------------------------------------------
+int hpx_main(hpx::program_options::variables_map& vm)
+{
+    // install cuda future polling handler
+    hpx::cuda::experimental::enable_user_polling poll("default");
+
+    // Print all targets for debug purposes
+    hpx::cuda::experimental::print_local_targets();
+
+    int number_devices = 0;
+    hpx::cuda::experimental::check_cuda_error(cudaGetDeviceCount(&number_devices));
+    HPX_ASSERT(number_devices > 0);
+
+    std::vector<hpx::shared_future<void>> futs(number_devices);
+
+    for (auto device_id = 0; device_id < number_devices; device_id++) {
+      hpx::cuda::experimental::cuda_executor exec(
+          device_id, hpx::cuda::experimental::event_mode{});
+      auto fut = hpx::async(exec, cuda_trivial_kernel<double>,
+          static_cast<float>(device_id) + 1);
+      futs[device_id] = fut.then([device_id](hpx::future<void>&&) {
+          std::cout
+              << "Continuation for kernel future triggered on device executor "
+              << device_id << std::endl;
+      });
+    }
+
+    auto final_fut = hpx::when_all(futs);
+    std::cout << "All executor test kernels launched! " << std::endl;
+    final_fut.get();
+    std::cout << "All executor test kernels finished! " << std::endl;
+
+    return hpx::local::finalize();
+}
+
+// -------------------------------------------------------------------------
+int main(int argc, char** argv)
+{
+    std::cout << "[HPX Cuda multi device polling] - Starting...\n" << std::endl;
+
+    hpx::local::init_params init_args;
+
+    auto result = hpx::local::init(hpx_main, argc, argv, init_args);
+    return result || hpx::util::report_errors();
+}

From 8d7f6367ceff3e427243cbd9db2c0a7fe97d2483 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 16:51:55 -0500
Subject: [PATCH 18/25] Fix assert (again)

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index c399839a7a13..fa2c8d80b9e4 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -29,7 +29,7 @@ namespace hpx { namespace cuda { namespace experimental {
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
-            HPX_ASSERT_MSG(free_lists_.size() !=
+            HPX_ASSERT_MSG(free_lists_.size() ==
                     static_cast<std::size_t>(max_number_devices_),
                 "Number of CUDA event pools does not match the number of "
                 "devices!");

From a7e76e5602e1926b4fcada7c7fab15daf12b3062 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 17:12:27 -0500
Subject: [PATCH 19/25] Add test for default device ID

---
 .../tests/unit/cuda_multi_device_polling.cpp  | 47 ++++++++++++++-----
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
index 2200155055e9..636c610ca952 100644
--- a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
+++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
@@ -46,28 +46,49 @@ int hpx_main(hpx::program_options::variables_map& vm)
     hpx::cuda::experimental::print_local_targets();
 
     int number_devices = 0;
-    hpx::cuda::experimental::check_cuda_error(cudaGetDeviceCount(&number_devices));
+    hpx::cuda::experimental::check_cuda_error(
+        cudaGetDeviceCount(&number_devices));
     HPX_ASSERT(number_devices > 0);
 
+    // Check if the futures complete when using executors on all devices
     std::vector<hpx::shared_future<void>> futs(number_devices);
-
-    for (auto device_id = 0; device_id < number_devices; device_id++) {
-      hpx::cuda::experimental::cuda_executor exec(
-          device_id, hpx::cuda::experimental::event_mode{});
-      auto fut = hpx::async(exec, cuda_trivial_kernel<double>,
-          static_cast<float>(device_id) + 1);
-      futs[device_id] = fut.then([device_id](hpx::future<void>&&) {
-          std::cout
-              << "Continuation for kernel future triggered on device executor "
-              << device_id << std::endl;
-      });
+    for (auto device_id = 0; device_id < number_devices; device_id++)
+    {
+        hpx::cuda::experimental::cuda_executor exec(
+            device_id, hpx::cuda::experimental::event_mode{});
+        auto fut = hpx::async(exec, cuda_trivial_kernel<float>,
+            static_cast<float>(device_id) + 1);
+        futs[device_id] = fut.then([device_id](hpx::future<void>&&) {
+            std::cout << "Continuation for kernel future triggered on device "
+                         "executor "
+                      << device_id << std::endl;
+        });
     }
-
     auto final_fut = hpx::when_all(futs);
     std::cout << "All executor test kernels launched! " << std::endl;
     final_fut.get();
     std::cout << "All executor test kernels finished! " << std::endl;
 
+    // Test to see if HPX correctly picks up the current device in case
+    // get_future_with_event is not given a device_id
+    for (auto device_id = 0; device_id < number_devices; device_id++)
+    {
+        hpx::cuda::experimental::check_cuda_error(cudaSetDevice(device_id));
+        cudaStream_t device_stream;
+        hpx::cuda::experimental::check_cuda_error(
+            cudaStreamCreate(&device_stream));
+        cuda_trivial_kernel<float>(
+            number_devices + device_id + 1, device_stream);
+        auto fut = hpx::cuda::experimental::detail::get_future_with_event(
+            device_stream);
+        fut.get();
+        std::cout
+            << "get_future_with_event default ID test finished on device "
+            << device_id << std::endl;
+        hpx::cuda::experimental::check_cuda_error(
+            cudaStreamDestroy(device_stream));
+    }
+
     return hpx::local::finalize();
 }
 

From 525ad21d1681da5f84eb149b9ba00608841bef98 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 17:13:50 -0500
Subject: [PATCH 20/25] Fix some format issues / update file author lists

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp  | 4 ++--
 libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp | 5 +++--
 libs/core/async_cuda/src/cuda_event.cpp                     | 2 +-
 libs/core/async_cuda/src/cuda_event_callback.cpp            | 1 +
 libs/core/async_cuda/src/cuda_future.cpp                    | 1 +
 5 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index fa2c8d80b9e4..31c6cc768f85 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -73,8 +73,8 @@ namespace hpx { namespace cuda { namespace experimental {
         cuda_event_pool& operator=(cuda_event_pool const&) = delete;
 
     private:
-        // Private singleton constructor 
-        // Creates a bunch of events on initialization
+        // Private singleton constructor. Creates a initial_events_in_pool of
+        // events on initialization
         cuda_event_pool()
           : max_number_devices_(0)
         {
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 7b906baa42ed..62fc91c2d1d4 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -1,3 +1,4 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Thomas Heller
 //  Copyright (c) 2016 Hartmut Kaiser
@@ -236,8 +237,8 @@ namespace hpx { namespace cuda { namespace experimental {
         hpx::future<void> get_future_with_event(
             Allocator const& a, cudaStream_t stream, int device = -1)
         {
-            if (device == -1) 
-              check_cuda_error(cudaGetDevice(&device));
+            if (device == -1)
+                check_cuda_error(cudaGetDevice(&device));
             return get_future<Allocator, event_mode>(a, stream, device);
         }
 
diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp
index 809cf1fa892c..470dbf5533d4 100644
--- a/libs/core/async_cuda/src/cuda_event.cpp
+++ b/libs/core/async_cuda/src/cuda_event.cpp
@@ -8,7 +8,7 @@
 #include <hpx/async_cuda/cuda_event.hpp>
 #include <hpx/async_cuda/custom_gpu_api.hpp>
 
-namespace hpx { namespace cuda { namespace experimental { 
+namespace hpx { namespace cuda { namespace experimental {
     cuda_event_pool& cuda_event_pool::get_event_pool()
     {
         static cuda_event_pool event_pool_;
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index 1b0267cd068b..9597656ecbe6 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -1,3 +1,4 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2021 ETH Zurich
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Hartmut Kaiser
diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp
index c797191e1474..545b1dc9b77c 100644
--- a/libs/core/async_cuda/src/cuda_future.cpp
+++ b/libs/core/async_cuda/src/cuda_future.cpp
@@ -1,3 +1,4 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Hartmut Kaiser
 //  Copyright (c) 2016 Thomas Heller

From 3999ecc8013ac35e01ca000ac7d1a813e2e124e8 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 17:30:42 -0500
Subject: [PATCH 21/25] Fix test format

---
 .../async_cuda/tests/unit/cuda_multi_device_polling.cpp     | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
index 636c610ca952..24493aa58992 100644
--- a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
+++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
@@ -35,7 +35,6 @@ extern void launch_saxpy_kernel(
     hpx::cuda::experimental::cuda_executor& cudaexec, unsigned int& blocks,
     unsigned int& threads, void** args);
 
-
 // -------------------------------------------------------------------------
 int hpx_main(hpx::program_options::variables_map& vm)
 {
@@ -82,9 +81,8 @@ int hpx_main(hpx::program_options::variables_map& vm)
         auto fut = hpx::cuda::experimental::detail::get_future_with_event(
             device_stream);
         fut.get();
-        std::cout
-            << "get_future_with_event default ID test finished on device "
-            << device_id << std::endl;
+        std::cout << "get_future_with_event default ID test finished on device "
+                  << device_id << std::endl;
         hpx::cuda::experimental::check_cuda_error(
             cudaStreamDestroy(device_stream));
     }

From 46b477a47e2f621932fe6b68214ead7d5640ec1e Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 22:57:46 -0500
Subject: [PATCH 22/25] Cal setdevice when creating events on-demand

---
 .../core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 31c6cc768f85..5a13a911c857 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -54,7 +54,15 @@ namespace hpx { namespace cuda { namespace experimental {
             // pop an event off the pool, if that fails, create a new one
             while (!free_lists_[device].pop(event))
             {
+                // Set correct device in case if required
+                int original_device = -1;
+                check_cuda_error(cudaGetDevice(&original_device));
+                if (original_device != device)
+                    check_cuda_error(cudaSetDevice(device));
                 add_event_to_pool(device);
+                // reset to original device if required 
+                if (original_device != device)
+                  check_cuda_error(cudaSetDevice(original_device));
             }
             return true;
         }

From 4eb0b394e9999382fb355c055da44bc45087475e Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Wed, 28 Jun 2023 23:17:03 -0500
Subject: [PATCH 23/25] Fix format (again)

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 5a13a911c857..727379a1db09 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -60,9 +60,9 @@ namespace hpx { namespace cuda { namespace experimental {
                 if (original_device != device)
                     check_cuda_error(cudaSetDevice(device));
                 add_event_to_pool(device);
-                // reset to original device if required 
+                // reset to original device if required
                 if (original_device != device)
-                  check_cuda_error(cudaSetDevice(original_device));
+                    check_cuda_error(cudaSetDevice(original_device));
             }
             return true;
         }

From e547c24a9ba4cc95da13b40e320ce3ad81e3ee29 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Thu, 29 Jun 2023 11:33:26 -0500
Subject: [PATCH 24/25] Remove superfluous api call

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index 727379a1db09..f9c7da830cbb 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -54,11 +54,9 @@ namespace hpx { namespace cuda { namespace experimental {
             // pop an event off the pool, if that fails, create a new one
             while (!free_lists_[device].pop(event))
             {
-                // Set correct device in case if required
+                // Save current device
                 int original_device = -1;
                 check_cuda_error(cudaGetDevice(&original_device));
-                if (original_device != device)
-                    check_cuda_error(cudaSetDevice(device));
                 add_event_to_pool(device);
                 // reset to original device if required
                 if (original_device != device)

From 9f0d330d69770be4613908130dc4af94a5477779 Mon Sep 17 00:00:00 2001
From: Gregor Daiss <Gregor.Daiss+git@gmail.com>
Date: Thu, 6 Jul 2023 12:51:50 -0500
Subject: [PATCH 25/25] Fix inspect

---
 libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp    | 4 ++++
 libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp   | 2 ++
 libs/core/async_cuda/src/cuda_event.cpp                       | 2 ++
 libs/core/async_cuda/src/cuda_event_callback.cpp              | 2 ++
 libs/core/async_cuda/src/cuda_future.cpp                      | 2 ++
 libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp | 3 +++
 6 files changed, 15 insertions(+)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
index f9c7da830cbb..ce4db339969f 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -5,11 +5,15 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #pragma once
 
+#include <cstddef>
 #include <deque>
 
+#include <hpx/assert.hpp>
 #include <hpx/async_cuda/cuda_exception.hpp>
 #include <hpx/async_cuda/custom_gpu_api.hpp>
 #include <hpx/concurrency/stack.hpp>
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
index 62fc91c2d1d4..e5e617c47171 100644
--- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
+++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -6,6 +6,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #pragma once
 
diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp
index 470dbf5533d4..1425937a8f29 100644
--- a/libs/core/async_cuda/src/cuda_event.cpp
+++ b/libs/core/async_cuda/src/cuda_event.cpp
@@ -3,6 +3,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/assert.hpp>
 #include <hpx/async_cuda/cuda_event.hpp>
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
index 9597656ecbe6..90698b402a98 100644
--- a/libs/core/async_cuda/src/cuda_event_callback.cpp
+++ b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -7,6 +7,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/config.hpp>
 #include <hpx/assert.hpp>
diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp
index 545b1dc9b77c..accb87e1ff89 100644
--- a/libs/core/async_cuda/src/cuda_future.cpp
+++ b/libs/core/async_cuda/src/cuda_future.cpp
@@ -6,6 +6,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/allocator_support/internal_allocator.hpp>
 #include <hpx/assert.hpp>
diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
index 24493aa58992..ca1ee5579c7f 100644
--- a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
+++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp
@@ -3,8 +3,11 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/algorithm.hpp>
+#include <hpx/assert.hpp>
 #include <hpx/execution.hpp>
 #include <hpx/future.hpp>
 #include <hpx/init.hpp>