Merge pull request #6284 from G-071/add-multigpu-polling

Add CUDA/HIP MultiGPU Event Polling
STEllAR-GROUP · Jul 8, 2023 · 75faae5 · 75faae5
2 parents 2e61670 + 9f0d330
commit 75faae5
Show file tree

Hide file tree

Showing 10 changed files with 252 additions and 50 deletions.
diff --git a/libs/core/async_cuda/CMakeLists.txt b/libs/core/async_cuda/CMakeLists.txt
@@ -36,7 +36,7 @@ set(async_cuda_compat_headers
 # cmake-format: on
 
 set(async_cuda_sources cuda_event_callback.cpp cuda_future.cpp cuda_target.cpp
-                       get_targets.cpp
+                       get_targets.cpp cuda_event.cpp
 )
 
 if(HPX_WITH_HIP AND TARGET roc::hipblas)

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -1,12 +1,19 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2020 Teodor Nikolov
 //
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #pragma once
 
+#include <cstddef>
+#include <deque>
+
+#include <hpx/assert.hpp>
 #include <hpx/async_cuda/cuda_exception.hpp>
 #include <hpx/async_cuda/custom_gpu_api.hpp>
 #include <hpx/concurrency/stack.hpp>
@@ -19,66 +26,102 @@ namespace hpx { namespace cuda { namespace experimental {
     // of them at startup.
     struct cuda_event_pool
     {
-        static constexpr int initial_events_in_pool = 128;
-
-        static cuda_event_pool& get_event_pool()
-        {
-            static cuda_event_pool event_pool_;
-            return event_pool_;
-        }
+        static constexpr std::size_t initial_events_in_pool = 128;
 
-        // create a bunch of events on initialization
-        cuda_event_pool()
-          : free_list_(initial_events_in_pool)
-        {
-            for (int i = 0; i < initial_events_in_pool; ++i)
-            {
-                add_event_to_pool();
-            }
-        }
+        HPX_CORE_EXPORT static cuda_event_pool& get_event_pool();
 
         // on destruction, all objects in stack will be freed
         ~cuda_event_pool()
         {
-            cudaEvent_t event;
-            bool ok = true;
-            while (ok)
+            HPX_ASSERT_MSG(free_lists_.size() ==
+                    static_cast<std::size_t>(max_number_devices_),
+                "Number of CUDA event pools does not match the number of "
+                "devices!");
+            for (int device = 0; device < max_number_devices_; device++)
             {
-                ok = free_list_.pop(event);
-                if (ok)
-                    check_cuda_error(cudaEventDestroy(event));
+                check_cuda_error(cudaSetDevice(device));
+                cudaEvent_t event;
+                bool ok = true;
+                while (ok)
+                {
+                    ok = free_lists_[device].pop(event);
+                    if (ok)
+                        check_cuda_error(cudaEventDestroy(event));
+                }
             }
         }
 
-        inline bool pop(cudaEvent_t& event)
+        inline bool pop(cudaEvent_t& event, int device = 0)
         {
+            HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
+                "Accessing CUDA event pool with invalid device ID!");
             // pop an event off the pool, if that fails, create a new one
-            while (!free_list_.pop(event))
+            while (!free_lists_[device].pop(event))
             {
-                add_event_to_pool();
+                // Save current device
+                int original_device = -1;
+                check_cuda_error(cudaGetDevice(&original_device));
+                add_event_to_pool(device);
+                // reset to original device if required
+                if (original_device != device)
+                    check_cuda_error(cudaSetDevice(original_device));
             }
             return true;
         }
 
-        inline bool push(cudaEvent_t event)
+        inline bool push(cudaEvent_t event, int device = 0)
         {
-            return free_list_.push(event);
+            HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
+                "Accessing CUDA event pool with invalid device ID!");
+            return free_lists_[device].push(event);
         }
 
+        // delete copy / move constructors
+        cuda_event_pool(cuda_event_pool&&) = delete;
+        cuda_event_pool& operator=(cuda_event_pool&&) = delete;
+        cuda_event_pool(cuda_event_pool const&) = delete;
+        cuda_event_pool& operator=(cuda_event_pool const&) = delete;
+
     private:
-        void add_event_to_pool()
+        // Private singleton constructor. Creates a initial_events_in_pool of
+        // events on initialization
+        cuda_event_pool()
+          : max_number_devices_(0)
+        {
+            check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
+            HPX_ASSERT_MSG(max_number_devices_ > 0,
+                "CUDA polling enabled and called, yet no CUDA device found!");
+            int original_device = 0;
+            check_cuda_error(cudaGetDevice(&original_device));
+            for (int device = 0; device < max_number_devices_; device++)
+            {
+                check_cuda_error(cudaSetDevice(device));
+                free_lists_.emplace_back(initial_events_in_pool);
+                for (std::size_t i = 0; i < initial_events_in_pool; ++i)
+                {
+                    add_event_to_pool(device);
+                }
+            }
+            // Restore original device -- avoids confusion about HPX switching
+            // devices for the current thread if only one device is used.
+            check_cuda_error(cudaSetDevice(original_device));
+        }
+
+        void add_event_to_pool(int device)
         {
+            check_cuda_error(cudaSetDevice(device));
             cudaEvent_t event;
             // Create an cuda_event to query a CUDA/CUBLAS kernel for completion.
             // Timing is disabled for performance. [1]
             //
             // [1]: CUDA Runtime API, section 5.5 cuda_event Management
             check_cuda_error(
                 cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-            free_list_.push(event);
+            free_lists_[device].push(event);
         }
+        int max_number_devices_;
 
-        // pool is dynamically sized and can grow if needed
-        hpx::lockfree::stack<cudaEvent_t> free_list_;
+        // One pool per GPU - each pool is dynamically sized and can grow if needed
+        std::deque<hpx::lockfree::stack<cudaEvent_t>> free_lists_;
     };
 }}}    // namespace hpx::cuda::experimental
diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -1,10 +1,13 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Thomas Heller
 //  Copyright (c) 2016 Hartmut Kaiser
 //
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #pragma once
 
@@ -81,7 +84,7 @@ namespace hpx { namespace cuda { namespace experimental {
             future_data() {}
 
             future_data(init_no_addref no_addref, other_allocator const& alloc,
-                cudaStream_t stream)
+                cudaStream_t stream, int device)
               : lcos::detail::future_data_allocator<void, Allocator,
                     future_data>(no_addref, alloc)
             {
@@ -104,7 +107,7 @@ namespace hpx { namespace cuda { namespace experimental {
                                     status)));
                         }
                     },
-                    stream);
+                    stream, device);
             }
         };
 
@@ -183,7 +186,8 @@ namespace hpx { namespace cuda { namespace experimental {
         // main API call to get a future from a stream using allocator, and the
         // specified mode
         template <typename Allocator, typename Mode>
-        hpx::future<void> get_future(Allocator const& a, cudaStream_t stream)
+        hpx::future<void> get_future(
+            Allocator const& a, cudaStream_t stream, int device)
         {
             using shared_state = future_data<Allocator, Mode>;
 
@@ -200,7 +204,19 @@ namespace hpx { namespace cuda { namespace experimental {
             unique_ptr p(traits::allocate(alloc, 1),
                 hpx::util::allocator_deleter<other_allocator>{alloc});
 
-            traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream);
+            static_assert(std::is_same_v<Mode, event_mode> ||
+                    std::is_same_v<Mode, callback_mode>,
+                "get_future mode not supported!");
+            if constexpr (std::is_same_v<Mode, event_mode>)
+            {
+                traits::construct(
+                    alloc, p.get(), init_no_addref{}, alloc, stream, device);
+            }
+            else if constexpr (std::is_same_v<Mode, callback_mode>)
+            {
+                traits::construct(
+                    alloc, p.get(), init_no_addref{}, alloc, stream);
+            }
 
             return hpx::traits::future_access<future<void>>::create(
                 p.release(), false);
@@ -212,16 +228,20 @@ namespace hpx { namespace cuda { namespace experimental {
         hpx::future<void> get_future_with_callback(
             Allocator const& a, cudaStream_t stream)
         {
-            return get_future<Allocator, callback_mode>(a, stream);
+            // device id 0 will be dropped in callback mode - can be
+            // an arbitrary number here
+            return get_future<Allocator, callback_mode>(a, stream, 0);
         }
 
         // -------------------------------------------------------------
         // main API call to get a future from a stream using allocator
         template <typename Allocator>
         hpx::future<void> get_future_with_event(
-            Allocator const& a, cudaStream_t stream)
+            Allocator const& a, cudaStream_t stream, int device = -1)
         {
-            return get_future<Allocator, event_mode>(a, stream);
+            if (device == -1)
+                check_cuda_error(cudaGetDevice(&device));
+            return get_future<Allocator, event_mode>(a, stream, device);
         }
 
         // -------------------------------------------------------------
@@ -231,7 +251,8 @@ namespace hpx { namespace cuda { namespace experimental {
 
         // -------------------------------------------------------------
         // non allocator version of : get future with an event set
-        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t);
+        HPX_CORE_EXPORT hpx::future<void> get_future_with_event(
+            cudaStream_t stream, int device = -1);
     }    // namespace detail
 }}}      // namespace hpx::cuda::experimental
 

diff --git a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp
@@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
         hpx::move_only_function<void(cudaError_t)>;
 
     HPX_CORE_EXPORT void add_event_callback(
-        event_callback_function_type&& f, cudaStream_t stream);
+        event_callback_function_type&& f, cudaStream_t stream, int device = 0);
 
     HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool);
     HPX_CORE_EXPORT void unregister_polling(

diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp
@@ -0,0 +1,19 @@
+//  Copyright (c) 2023 Gregor Daiß
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
+
+#include <hpx/assert.hpp>
+#include <hpx/async_cuda/cuda_event.hpp>
+#include <hpx/async_cuda/custom_gpu_api.hpp>
+
+namespace hpx { namespace cuda { namespace experimental {
+    cuda_event_pool& cuda_event_pool::get_event_pool()
+    {
+        static cuda_event_pool event_pool_;
+        return event_pool_;
+    }
+}}}    // namespace hpx::cuda::experimental
diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -1,3 +1,4 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2021 ETH Zurich
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Hartmut Kaiser
@@ -6,6 +7,8 @@
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/config.hpp>
 #include <hpx/assert.hpp>
@@ -50,6 +53,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
     {
         cudaEvent_t event;
         event_callback_function_type f;
+        int device;
     };
 
     using event_callback_queue_type =
@@ -104,17 +108,18 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
     }
 
     void add_event_callback(
-        event_callback_function_type&& f, cudaStream_t stream)
+        event_callback_function_type&& f, cudaStream_t stream, int device)
     {
         cudaEvent_t event;
-        if (!cuda_event_pool::get_event_pool().pop(event))
+        if (!cuda_event_pool::get_event_pool().pop(event, device))
         {
             HPX_THROW_EXCEPTION(hpx::error::invalid_status,
                 "add_event_callback", "could not get an event");
         }
         check_cuda_error(cudaEventRecord(event, stream));
 
-        detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)});
+        detail::add_to_event_callback_queue(
+            event_callback{event, HPX_MOVE(f), device});
     }
 
     // Background progress function for async CUDA operations. Checks for completed
@@ -177,7 +182,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                         "active events",
                         debug::dec<3>(get_number_of_active_events()));
                     continuation.f(status);
-                    pool.push(HPX_MOVE(continuation.event));
+                    pool.push(
+                        HPX_MOVE(continuation.event), continuation.device);
                     return true;
                 }),
             event_callback_vector.end());
@@ -199,7 +205,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
                     "active events",
                     debug::dec<3>(get_number_of_active_events()));
                 continuation.f(status);
-                pool.push(HPX_MOVE(continuation.event));
+                pool.push(HPX_MOVE(continuation.event), continuation.device);
             }
         }
 

diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp
@@ -1,10 +1,13 @@
+//  Copyright (c) 2023 Gregor Daiß
 //  Copyright (c) 2020 John Biddiscombe
 //  Copyright (c) 2016 Hartmut Kaiser
 //  Copyright (c) 2016 Thomas Heller
 //
 //  SPDX-License-Identifier: BSL-1.0
 //  Distributed under the Boost Software License, Version 1.0. (See accompanying
 //  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// hpxinspect:noascii
 
 #include <hpx/allocator_support/internal_allocator.hpp>
 #include <hpx/assert.hpp>
@@ -18,8 +21,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
             hpx::util::internal_allocator<>{}, stream);
     }
 
-    hpx::future<void> get_future_with_event(cudaStream_t stream)
+    hpx::future<void> get_future_with_event(cudaStream_t stream, int device)
     {
-        return get_future_with_event(hpx::util::internal_allocator<>{}, stream);
+        return get_future_with_event(
+            hpx::util::internal_allocator<>{}, stream, device);
     }
 }}}}    // namespace hpx::cuda::experimental::detail
diff --git a/libs/core/async_cuda/src/cuda_target.cpp b/libs/core/async_cuda/src/cuda_target.cpp
@@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental {
 
     hpx::future<void> target::get_future_with_event() const
     {
-        return detail::get_future_with_event(handle_.get_stream());
+        return detail::get_future_with_event(
+            handle_.get_stream(), handle_.get_device());
     }
 
     hpx::future<void> target::get_future_with_callback() const