Add CUDA/HIP MultiGPU Event Polling #6284

Merged Jul 8, 2023 (27 commits)
Changes from 9 commits

Commits:
9f6dfca  Initial hardcoded version (G-071, Jun 19, 2023)
e91c81a  Detect number of GPUs (G-071, Jun 19, 2023)
25494a2  Cleanup (G-071, Jun 19, 2023)
64ecdfc  Add missing default value (G-071, Jun 19, 2023)
12b3e3c  Fix format (G-071, Jun 19, 2023)
bf85ef6  Add missing format fix (G-071, Jun 19, 2023)
6a24f16  Add default device arguments (G-071, Jun 19, 2023)
e665a9e  Fix asserts (G-071, Jun 20, 2023)
791cecd  Delete copy/move event pool constructors (G-071, Jun 21, 2023)
d93ada2  Fix unused parameter warning (G-071, Jun 21, 2023)
7a5f551  Switch to east const (G-071, Jun 23, 2023)
b37f72a  Add -1 default parameter for device auto-detection (G-071, Jun 23, 2023)
cf60025  Merge branch 'STEllAR-GROUP:master' into add-multigpu-polling (G-071, Jun 26, 2023)
df94966  Merge branch 'STEllAR-GROUP:master' into add-multigpu-polling (G-071, Jun 27, 2023)
d1a81f0  Fix assert (G-071, Jun 27, 2023)
d1f5953  Restore original device after init (G-071, Jun 27, 2023)
9b3388e  Remove superfluous cudaSetDevice (G-071, Jun 27, 2023)
ee33c48  Put event pool singleton access definition in src (G-071, Jun 28, 2023)
707c025  Add basic multi gpu polling test (G-071, Jun 28, 2023)
8d7f636  Fix assert (again) (G-071, Jun 28, 2023)
a7e76e5  Add test for default device ID (G-071, Jun 28, 2023)
525ad21  Fix some format issues / update file author lists (G-071, Jun 28, 2023)
3999ecc  Fix test format (G-071, Jun 28, 2023)
46b477a  Cal setdevice when creating events on-demand (G-071, Jun 29, 2023)
4eb0b39  Fix format (again) (G-071, Jun 29, 2023)
e547c24  Remove superfluous api call (G-071, Jun 29, 2023)
9f0d330  Fix inspect (G-071, Jul 6, 2023)
83 changes: 57 additions & 26 deletions libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -7,6 +7,8 @@

#pragma once

#include <cstddef>
#include <deque>

#include <hpx/async_cuda/cuda_exception.hpp>
#include <hpx/async_cuda/custom_gpu_api.hpp>
#include <hpx/concurrency/stack.hpp>
@@ -19,66 +21,95 @@ namespace hpx { namespace cuda { namespace experimental {
// of them at startup.
struct cuda_event_pool
{
static constexpr int initial_events_in_pool = 128;
static constexpr std::size_t initial_events_in_pool = 128;

static cuda_event_pool& get_event_pool()
{
static cuda_event_pool event_pool_;
return event_pool_;
}
Review comment (Member):
May I suggest that you move the (HPX_CORE_EXPORTed) implementation of this function into a source file? The rationale is that otherwise on some platforms (Mac, Windows) each executable module (shared library, executable) that will contain code calling this function will have its own instance of the event_pool_ variable.
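A minimal sketch of that split, with a hypothetical cuda_event_pool.cpp next to the header (the PR adopts this later in commit ee33c48, "Put event pool singleton access definition in src"):

    // cuda_event_pool.hpp - declare the accessor only
    struct cuda_event_pool
    {
        static HPX_CORE_EXPORT cuda_event_pool& get_event_pool();
        // ... rest of the interface as above ...
    };

    // cuda_event_pool.cpp - single definition, so every module shares one instance
    cuda_event_pool& cuda_event_pool::get_event_pool()
    {
        static cuda_event_pool event_pool_;
        return event_pool_;
    }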


// create a bunch of events on initialization
cuda_event_pool()
: free_list_(initial_events_in_pool)
{
for (int i = 0; i < initial_events_in_pool; ++i)
{
add_event_to_pool();
}
}

// on destruction, all objects in stack will be freed
~cuda_event_pool()
{
cudaEvent_t event;
bool ok = true;
while (ok)
HPX_ASSERT_MSG(free_lists_.size() == static_cast<std::size_t>(max_number_devices_),
"Number of CUDA event pools does not match the number of "
"devices!");
for (int device = 0; device < max_number_devices_; device++)
{
ok = free_list_.pop(event);
if (ok)
check_cuda_error(cudaEventDestroy(event));
check_cuda_error(cudaSetDevice(device));
cudaEvent_t event;
bool ok = true;
while (ok)
{
ok = free_lists_[device].pop(event);
if (ok)
check_cuda_error(cudaEventDestroy(event));
}
}
}

inline bool pop(cudaEvent_t& event)
inline bool pop(cudaEvent_t& event, int device = 0)
{
HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
"Accessing CUDA event pool with invalid device ID!");
// pop an event off the pool, if that fails, create a new one
while (!free_list_.pop(event))
while (!free_lists_[device].pop(event))
{
add_event_to_pool();
add_event_to_pool(device);
}
return true;
}

inline bool push(cudaEvent_t event)
inline bool push(cudaEvent_t event, int device = 0)
{
return free_list_.push(event);
HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
"Accessing CUDA event pool with invalid device ID!");
return free_lists_[device].push(event);
}

// delete copy / move constructors
cuda_event_pool(cuda_event_pool&&) = delete;
cuda_event_pool& operator=(cuda_event_pool&&) = delete;
cuda_event_pool(const cuda_event_pool&) = delete;
cuda_event_pool& operator=(const cuda_event_pool&) = delete;

private:
void add_event_to_pool()
// Private singleton constructor
// Creates a bunch of events on initialization
cuda_event_pool()
: max_number_devices_(0)
{
check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
HPX_ASSERT_MSG(max_number_devices_ > 0,
"CUDA polling enabled and called, yet no CUDA device found!");
/* free_lists_.reserve(max_number_devices_); */
for (int device = 0; device < max_number_devices_; device++)
{
check_cuda_error(cudaSetDevice(device));
free_lists_.emplace_back(initial_events_in_pool);
for (std::size_t i = 0; i < initial_events_in_pool; ++i)
{
add_event_to_pool(device);
}
}
}

void add_event_to_pool(int device)
{
check_cuda_error(cudaSetDevice(device));
cudaEvent_t event;
// Create a cuda_event to query a CUDA/CUBLAS kernel for completion.
// Timing is disabled for performance. [1]
//
// [1]: CUDA Runtime API, section 5.5 cuda_event Management
check_cuda_error(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
free_list_.push(event);
free_lists_[device].push(event);
}
int max_number_devices_;

// pool is dynamically sized and can grow if needed
hpx::lockfree::stack<cudaEvent_t> free_list_;
// One pool per GPU - each pool is dynamically sized and can grow if needed
std::deque<hpx::lockfree::stack<cudaEvent_t>> free_lists_;
};
}}} // namespace hpx::cuda::experimental
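For orientation, a rough usage sketch of the per-device pool above; the device index chosen here is arbitrary and error handling beyond check_cuda_error is omitted:

    #include <hpx/async_cuda/cuda_event.hpp>
    #include <hpx/async_cuda/cuda_exception.hpp>

    void per_device_pool_sketch()
    {
        namespace cu = hpx::cuda::experimental;
        int const device = 1;    // any valid index < max_number_devices_
        cu::check_cuda_error(cudaSetDevice(device));
        cudaStream_t stream;
        cu::check_cuda_error(cudaStreamCreate(&stream));

        cudaEvent_t event;
        auto& pool = cu::cuda_event_pool::get_event_pool();
        pool.pop(event, device);    // grows device 1's free list on demand
        cu::check_cuda_error(cudaEventRecord(event, stream));
        // ... once cudaEventQuery(event) reports cudaSuccess ...
        pool.push(event, device);    // recycle into the matching free list
    }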
21 changes: 12 additions & 9 deletions libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -81,7 +81,7 @@ namespace hpx { namespace cuda { namespace experimental {
future_data() {}

future_data(init_no_addref no_addref, other_allocator const& alloc,
cudaStream_t stream)
cudaStream_t stream, int device)
: lcos::detail::future_data_allocator<void, Allocator,
future_data>(no_addref, alloc)
{
@@ -104,7 +104,7 @@ namespace hpx { namespace cuda { namespace experimental {
status)));
}
},
stream);
stream, device);
}
};

@@ -128,7 +128,7 @@ namespace hpx { namespace cuda { namespace experimental {
}

future_data(init_no_addref no_addref, other_allocator const& alloc,
cudaStream_t stream)
cudaStream_t stream, int device)
: lcos::detail::future_data_allocator<void, Allocator,
future_data>(no_addref, alloc)
, rt_(hpx::get_runtime_ptr())
@@ -183,7 +183,8 @@ namespace hpx { namespace cuda { namespace experimental {
// main API call to get a future from a stream using allocator, and the
// specified mode
template <typename Allocator, typename Mode>
hpx::future<void> get_future(Allocator const& a, cudaStream_t stream)
hpx::future<void> get_future(
Allocator const& a, cudaStream_t stream, int device = 0)
{
using shared_state = future_data<Allocator, Mode>;

@@ -200,7 +201,8 @@ namespace hpx { namespace cuda { namespace experimental {
unique_ptr p(traits::allocate(alloc, 1),
hpx::util::allocator_deleter<other_allocator>{alloc});

traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream);
traits::construct(
alloc, p.get(), init_no_addref{}, alloc, stream, device);

return hpx::traits::future_access<future<void>>::create(
p.release(), false);
@@ -212,16 +214,16 @@ namespace hpx { namespace cuda { namespace experimental {
hpx::future<void> get_future_with_callback(
Allocator const& a, cudaStream_t stream)
{
return get_future<Allocator, callback_mode>(a, stream);
return get_future<Allocator, callback_mode>(a, stream, 0);
}

// -------------------------------------------------------------
// main API call to get a future from a stream using allocator
template <typename Allocator>
hpx::future<void> get_future_with_event(
Allocator const& a, cudaStream_t stream)
Allocator const& a, cudaStream_t stream, int device = 0)
{
return get_future<Allocator, event_mode>(a, stream);
return get_future<Allocator, event_mode>(a, stream, device);
}

// -------------------------------------------------------------
@@ -231,7 +233,8 @@ namespace hpx { namespace cuda { namespace experimental {

// -------------------------------------------------------------
// non allocator version of : get future with an event set
HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t);
HPX_CORE_EXPORT hpx::future<void> get_future_with_event(
cudaStream_t stream, int device = 0);
} // namespace detail
}}} // namespace hpx::cuda::experimental
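A hedged usage sketch of the extended detail API, assuming CUDA event polling is already enabled on the scheduler that will consume the future:

    #include <hpx/async_cuda/cuda_exception.hpp>
    #include <hpx/async_cuda/cuda_future.hpp>

    hpx::future<void> future_on_device_sketch()
    {
        namespace cu = hpx::cuda::experimental;
        int const device = 0;
        cu::check_cuda_error(cudaSetDevice(device));
        cudaStream_t stream;
        cu::check_cuda_error(cudaStreamCreate(&stream));

        // ... enqueue asynchronous work (kernels, copies) on `stream` here ...

        // The future becomes ready once the poller sees the recorded event
        // complete; the event is then returned to device 0's pool.
        return cu::detail::get_future_with_event(stream, device);
    }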

(changes to an additional header, file path not shown)
@@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
hpx::move_only_function<void(cudaError_t)>;

HPX_CORE_EXPORT void add_event_callback(
event_callback_function_type&& f, cudaStream_t stream);
event_callback_function_type&& f, cudaStream_t stream, int device = 0);

HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool);
HPX_CORE_EXPORT void unregister_polling(
14 changes: 9 additions & 5 deletions libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -50,6 +50,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
{
cudaEvent_t event;
event_callback_function_type f;
int device;
};

using event_callback_queue_type =
@@ -104,17 +105,19 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
}

void add_event_callback(
event_callback_function_type&& f, cudaStream_t stream)
event_callback_function_type&& f, cudaStream_t stream, int device)
{
cudaEvent_t event;
if (!cuda_event_pool::get_event_pool().pop(event))
if (!cuda_event_pool::get_event_pool().pop(event, device))
{
HPX_THROW_EXCEPTION(hpx::error::invalid_status,
"add_event_callback", "could not get an event");
}
check_cuda_error(cudaSetDevice(device));
check_cuda_error(cudaEventRecord(event, stream));

detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)});
detail::add_to_event_callback_queue(
event_callback{event, HPX_MOVE(f), device});
}

// Background progress function for async CUDA operations. Checks for completed
@@ -177,7 +180,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
"active events",
debug::dec<3>(get_number_of_active_events()));
continuation.f(status);
pool.push(HPX_MOVE(continuation.event));
pool.push(
HPX_MOVE(continuation.event), continuation.device);
return true;
}),
event_callback_vector.end());
@@ -199,7 +203,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
"active events",
debug::dec<3>(get_number_of_active_events()));
continuation.f(status);
pool.push(HPX_MOVE(continuation.event));
pool.push(HPX_MOVE(continuation.event), continuation.device);
}
}
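A brief sketch of the callback path with the new device argument; the stored device index is what lets the polling loop return a finished event to the right pool (the header providing detail::add_event_callback is not shown in this diff, so its include is omitted):

    #include <hpx/async_cuda/cuda_exception.hpp>

    void callback_on_device_sketch(cudaStream_t stream, int device)
    {
        namespace cu = hpx::cuda::experimental;
        // `stream` is assumed to have been created while `device` was current
        cu::detail::add_event_callback(
            [](cudaError_t status) {
                // runs from the polling loop once the recorded event is ready
                cu::check_cuda_error(status);
            },
            stream, device);
    }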

5 changes: 3 additions & 2 deletions libs/core/async_cuda/src/cuda_future.cpp
@@ -18,8 +18,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
hpx::util::internal_allocator<>{}, stream);
}

hpx::future<void> get_future_with_event(cudaStream_t stream)
hpx::future<void> get_future_with_event(cudaStream_t stream, int device)
{
return get_future_with_event(hpx::util::internal_allocator<>{}, stream);
return get_future_with_event(
hpx::util::internal_allocator<>{}, stream, device);
}
}}}} // namespace hpx::cuda::experimental::detail
3 changes: 2 additions & 1 deletion libs/core/async_cuda/src/cuda_target.cpp
@@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental {

hpx::future<void> target::get_future_with_event() const
{
return detail::get_future_with_event(handle_.get_stream());
return detail::get_future_with_event(
handle_.get_stream(), handle_.get_device());
}

hpx::future<void> target::get_future_with_callback() const
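Finally, a hypothetical end-to-end sketch at the target level, assuming a target can be constructed for a given device id (consistent with handle_.get_device() above) and that event polling is enabled; the includes for hpx::cuda::experimental::target and hpx::wait_all are omitted since their headers are not part of this diff:

    void two_gpu_sketch()
    {
        hpx::cuda::experimental::target t0(0);    // device 0
        hpx::cuda::experimental::target t1(1);    // device 1

        // Work enqueued on each target's stream completes the corresponding
        // future; each event is recorded on, and recycled to, its own device.
        hpx::future<void> f0 = t0.get_future_with_event();
        hpx::future<void> f1 = t1.get_future_with_event();
        hpx::wait_all(f0, f1);
    }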