From 9f6dfcafeaea161cc8124bd9e2cc8e251c3ef0d7 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Mon, 19 Jun 2023 14:04:16 -0500 Subject: [PATCH 01/25] Initial hardcoded version --- .../include/hpx/async_cuda/cuda_event.hpp | 16 +++++++++++----- .../include/hpx/async_cuda/cuda_future.hpp | 18 +++++++++--------- .../async_cuda/detail/cuda_event_callback.hpp | 2 +- .../async_cuda/src/cuda_event_callback.cpp | 17 +++++++++++------ libs/core/async_cuda/src/cuda_future.cpp | 4 ++-- libs/core/async_cuda/src/cuda_target.cpp | 2 +- 6 files changed, 35 insertions(+), 24 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index f2ae7ef59f5e..859caf245a9f 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -21,25 +21,30 @@ namespace hpx { namespace cuda { namespace experimental { { static constexpr int initial_events_in_pool = 128; - static cuda_event_pool& get_event_pool() + const int device_id; + + static cuda_event_pool& get_event_pool(size_t device_id) { - static cuda_event_pool event_pool_; - return event_pool_; + static std::array event_pool_{0, 1, 2, 3}; + return event_pool_[device_id]; } // create a bunch of events on initialization - cuda_event_pool() - : free_list_(initial_events_in_pool) + cuda_event_pool(int device_id) + : device_id(device_id), free_list_(initial_events_in_pool) { + check_cuda_error(cudaSetDevice(device_id)); for (int i = 0; i < initial_events_in_pool; ++i) { add_event_to_pool(); } + std::cerr << "Created " << device_id << std::endl; } // on destruction, all objects in stack will be freed ~cuda_event_pool() { + check_cuda_error(cudaSetDevice(device_id)); cudaEvent_t event; bool ok = true; while (ok) @@ -68,6 +73,7 @@ namespace hpx { namespace cuda { namespace experimental { private: void add_event_to_pool() { + check_cuda_error(cudaSetDevice(device_id)); cudaEvent_t event; // Create an cuda_event to query a CUDA/CUBLAS kernel for completion. // Timing is disabled for performance. [1] diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 042b8199dbeb..9084166ddc54 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -81,7 +81,7 @@ namespace hpx { namespace cuda { namespace experimental { future_data() {} future_data(init_no_addref no_addref, other_allocator const& alloc, - cudaStream_t stream) + cudaStream_t stream, int device) : lcos::detail::future_data_allocator(no_addref, alloc) { @@ -104,7 +104,7 @@ namespace hpx { namespace cuda { namespace experimental { status))); } }, - stream); + stream, device); } }; @@ -128,7 +128,7 @@ namespace hpx { namespace cuda { namespace experimental { } future_data(init_no_addref no_addref, other_allocator const& alloc, - cudaStream_t stream) + cudaStream_t stream, int device) : lcos::detail::future_data_allocator(no_addref, alloc) , rt_(hpx::get_runtime_ptr()) @@ -183,7 +183,7 @@ namespace hpx { namespace cuda { namespace experimental { // main API call to get a future from a stream using allocator, and the // specified mode template - hpx::future get_future(Allocator const& a, cudaStream_t stream) + hpx::future get_future(Allocator const& a, cudaStream_t stream, int device = 0) { using shared_state = future_data; @@ -200,7 +200,7 @@ namespace hpx { namespace cuda { namespace experimental { unique_ptr p(traits::allocate(alloc, 1), hpx::util::allocator_deleter{alloc}); - traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream); + traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream, device); return hpx::traits::future_access>::create( p.release(), false); @@ -212,16 +212,16 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future get_future_with_callback( Allocator const& a, cudaStream_t stream) { - return get_future(a, stream); + return get_future(a, stream, 0); } // ------------------------------------------------------------- // main API call to get a future from a stream using allocator template hpx::future get_future_with_event( - Allocator const& a, cudaStream_t stream) + Allocator const& a, cudaStream_t stream, int device) { - return get_future(a, stream); + return get_future(a, stream, device); } // ------------------------------------------------------------- @@ -231,7 +231,7 @@ namespace hpx { namespace cuda { namespace experimental { // ------------------------------------------------------------- // non allocator version of : get future with an event set - HPX_CORE_EXPORT hpx::future get_future_with_event(cudaStream_t); + HPX_CORE_EXPORT hpx::future get_future_with_event(cudaStream_t, int); } // namespace detail }}} // namespace hpx::cuda::experimental diff --git a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp index fde4361569d5..5c5592dbaded 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp @@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { hpx::move_only_function; HPX_CORE_EXPORT void add_event_callback( - event_callback_function_type&& f, cudaStream_t stream); + event_callback_function_type&& f, cudaStream_t stream, int device); HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool); HPX_CORE_EXPORT void unregister_polling( diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index 0d53bd4af3f7..56b5cd6884b7 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -50,6 +50,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { { cudaEvent_t event; event_callback_function_type f; + int device; }; using event_callback_queue_type = @@ -104,17 +105,18 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { } void add_event_callback( - event_callback_function_type&& f, cudaStream_t stream) + event_callback_function_type&& f, cudaStream_t stream, int device) { cudaEvent_t event; - if (!cuda_event_pool::get_event_pool().pop(event)) + if (!cuda_event_pool::get_event_pool(device).pop(event)) { HPX_THROW_EXCEPTION(hpx::error::invalid_status, "add_event_callback", "could not get an event"); } + check_cuda_error(cudaSetDevice(device)); check_cuda_error(cudaEventRecord(event, stream)); - detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)}); + detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f), device}); } // Background progress function for async CUDA operations. Checks for completed @@ -155,9 +157,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { debug::dec<3>(get_number_of_active_events())); } - // Grab the handle to the event pool so we can return completed events - cuda_event_pool& pool = - hpx::cuda::experimental::cuda_event_pool::get_event_pool(); // Iterate over our list of events and see if any have completed event_callback_vector.erase( @@ -177,6 +176,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { "active events", debug::dec<3>(get_number_of_active_events())); continuation.f(status); + // Grab the handle to the event pool so we can return completed events + cuda_event_pool& pool = hpx::cuda::experimental:: + cuda_event_pool::get_event_pool(continuation.device); pool.push(HPX_MOVE(continuation.event)); return true; }), @@ -199,6 +201,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { "active events", debug::dec<3>(get_number_of_active_events())); continuation.f(status); + // Grab the handle to the event pool so we can return completed events + cuda_event_pool& pool = + hpx::cuda::experimental::cuda_event_pool::get_event_pool(continuation.device); pool.push(HPX_MOVE(continuation.event)); } } diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp index 6b1dfbbaa436..fdf95bf8e96f 100644 --- a/libs/core/async_cuda/src/cuda_future.cpp +++ b/libs/core/async_cuda/src/cuda_future.cpp @@ -18,8 +18,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { hpx::util::internal_allocator<>{}, stream); } - hpx::future get_future_with_event(cudaStream_t stream) + hpx::future get_future_with_event(cudaStream_t stream, int device) { - return get_future_with_event(hpx::util::internal_allocator<>{}, stream); + return get_future_with_event(hpx::util::internal_allocator<>{}, stream, device); } }}}} // namespace hpx::cuda::experimental::detail diff --git a/libs/core/async_cuda/src/cuda_target.cpp b/libs/core/async_cuda/src/cuda_target.cpp index 6906308ebc62..29377b216b55 100644 --- a/libs/core/async_cuda/src/cuda_target.cpp +++ b/libs/core/async_cuda/src/cuda_target.cpp @@ -187,7 +187,7 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future target::get_future_with_event() const { - return detail::get_future_with_event(handle_.get_stream()); + return detail::get_future_with_event(handle_.get_stream(), handle_.get_device()); } hpx::future target::get_future_with_callback() const From e91c81a3b2b19df655729c7cb9c4788ffde27abb Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Mon, 19 Jun 2023 14:57:15 -0500 Subject: [PATCH 02/25] Detect number of GPUs --- .../include/hpx/async_cuda/cuda_event.hpp | 75 +++++++++++-------- .../async_cuda/src/cuda_event_callback.cpp | 10 +-- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 859caf245a9f..a2faccd5c873 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -7,6 +7,8 @@ #pragma once +#include + #include #include #include @@ -21,59 +23,71 @@ namespace hpx { namespace cuda { namespace experimental { { static constexpr int initial_events_in_pool = 128; - const int device_id; - - static cuda_event_pool& get_event_pool(size_t device_id) + static cuda_event_pool& get_event_pool() { - static std::array event_pool_{0, 1, 2, 3}; - return event_pool_[device_id]; + static cuda_event_pool event_pool_; + return event_pool_; } // create a bunch of events on initialization - cuda_event_pool(int device_id) - : device_id(device_id), free_list_(initial_events_in_pool) + cuda_event_pool() + : max_number_devices_(0) { - check_cuda_error(cudaSetDevice(device_id)); - for (int i = 0; i < initial_events_in_pool; ++i) - { - add_event_to_pool(); + check_cuda_error(cudaGetDeviceCount(&max_number_devices_)); + HPX_ASSERT_MSG(max_number_devices_ > 0, + "CUDA polling enabled and called, yet no CUDA device found!"); + for (int device = 0; device < max_number_devices_; device++) { + check_cuda_error(cudaSetDevice(device)); + free_lists_.emplace_back(initial_events_in_pool); + for (int i = 0; i < initial_events_in_pool; ++i) + { + add_event_to_pool(device); + } + std::cerr << "Created " << device << std::endl; } - std::cerr << "Created " << device_id << std::endl; } // on destruction, all objects in stack will be freed ~cuda_event_pool() { - check_cuda_error(cudaSetDevice(device_id)); - cudaEvent_t event; - bool ok = true; - while (ok) - { - ok = free_list_.pop(event); - if (ok) - check_cuda_error(cudaEventDestroy(event)); + HPX_ASSERT_MSG(free_lists_.size != max_number_devices_, + "Number of CUDA event pools does not match the number of devices!"); + for (int device = 0; device < max_number_devices_; device++) { + check_cuda_error(cudaSetDevice(device)); + cudaEvent_t event; + bool ok = true; + while (ok) + { + ok = free_lists_[device].pop(event); + if (ok) + check_cuda_error(cudaEventDestroy(event)); + } } } - inline bool pop(cudaEvent_t& event) + inline bool pop(cudaEvent_t& event, int device) { + HPX_ASSERT_MSG(device > 0 && device < max_number_devices_, + "Accessing CUDA event pool with invalid device ID!"); // pop an event off the pool, if that fails, create a new one - while (!free_list_.pop(event)) + while (!free_lists_[device].pop(event)) { - add_event_to_pool(); + add_event_to_pool(device); } return true; } - inline bool push(cudaEvent_t event) + inline bool push(cudaEvent_t event, int device) { - return free_list_.push(event); + HPX_ASSERT_MSG(device > 0 && device < max_number_devices_, + "Accessing CUDA event pool with invalid device ID!"); + return free_lists_[device].push(event); } private: - void add_event_to_pool() + void add_event_to_pool(int device) { - check_cuda_error(cudaSetDevice(device_id)); + check_cuda_error(cudaSetDevice(device)); cudaEvent_t event; // Create an cuda_event to query a CUDA/CUBLAS kernel for completion. // Timing is disabled for performance. [1] @@ -81,10 +95,11 @@ namespace hpx { namespace cuda { namespace experimental { // [1]: CUDA Runtime API, section 5.5 cuda_event Management check_cuda_error( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - free_list_.push(event); + free_lists_[device].push(event); } + int max_number_devices_; - // pool is dynamically sized and can grow if needed - hpx::lockfree::stack free_list_; + // One pool per GPU - each pool is dynamically sized and can grow if needed + std::deque> free_lists_; }; }}} // namespace hpx::cuda::experimental diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index 56b5cd6884b7..b36b5d4940d2 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -108,7 +108,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { event_callback_function_type&& f, cudaStream_t stream, int device) { cudaEvent_t event; - if (!cuda_event_pool::get_event_pool(device).pop(event)) + if (!cuda_event_pool::get_event_pool().pop(event, device)) { HPX_THROW_EXCEPTION(hpx::error::invalid_status, "add_event_callback", "could not get an event"); @@ -178,8 +178,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { continuation.f(status); // Grab the handle to the event pool so we can return completed events cuda_event_pool& pool = hpx::cuda::experimental:: - cuda_event_pool::get_event_pool(continuation.device); - pool.push(HPX_MOVE(continuation.event)); + cuda_event_pool::get_event_pool(); + pool.push(HPX_MOVE(continuation.event), continuation.device); return true; }), event_callback_vector.end()); @@ -203,8 +203,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { continuation.f(status); // Grab the handle to the event pool so we can return completed events cuda_event_pool& pool = - hpx::cuda::experimental::cuda_event_pool::get_event_pool(continuation.device); - pool.push(HPX_MOVE(continuation.event)); + hpx::cuda::experimental::cuda_event_pool::get_event_pool(); + pool.push(HPX_MOVE(continuation.event), continuation.device); } } From 25494a207169c46292cce1fdc4c06c5732fa92e6 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Mon, 19 Jun 2023 15:22:04 -0500 Subject: [PATCH 03/25] Cleanup --- .../async_cuda/include/hpx/async_cuda/cuda_event.hpp | 1 - libs/core/async_cuda/src/cuda_event_callback.cpp | 9 +++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index a2faccd5c873..66659371fb4f 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -43,7 +43,6 @@ namespace hpx { namespace cuda { namespace experimental { { add_event_to_pool(device); } - std::cerr << "Created " << device << std::endl; } } diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index b36b5d4940d2..b1439ff338bd 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -157,6 +157,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { debug::dec<3>(get_number_of_active_events())); } + // Grab the handle to the event pool so we can return completed events + cuda_event_pool& pool = + hpx::cuda::experimental::cuda_event_pool::get_event_pool(); // Iterate over our list of events and see if any have completed event_callback_vector.erase( @@ -176,9 +179,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { "active events", debug::dec<3>(get_number_of_active_events())); continuation.f(status); - // Grab the handle to the event pool so we can return completed events - cuda_event_pool& pool = hpx::cuda::experimental:: - cuda_event_pool::get_event_pool(); pool.push(HPX_MOVE(continuation.event), continuation.device); return true; }), @@ -201,9 +201,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { "active events", debug::dec<3>(get_number_of_active_events())); continuation.f(status); - // Grab the handle to the event pool so we can return completed events - cuda_event_pool& pool = - hpx::cuda::experimental::cuda_event_pool::get_event_pool(); pool.push(HPX_MOVE(continuation.event), continuation.device); } } From 64ecdfc791a3dfd44a8eecfcdfb9d164a2a2f322 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Mon, 19 Jun 2023 15:56:03 -0500 Subject: [PATCH 04/25] Add missing default value --- libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 9084166ddc54..1c4503d66b81 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -219,7 +219,7 @@ namespace hpx { namespace cuda { namespace experimental { // main API call to get a future from a stream using allocator template hpx::future get_future_with_event( - Allocator const& a, cudaStream_t stream, int device) + Allocator const& a, cudaStream_t stream, int device = 0) { return get_future(a, stream, device); } @@ -231,7 +231,8 @@ namespace hpx { namespace cuda { namespace experimental { // ------------------------------------------------------------- // non allocator version of : get future with an event set - HPX_CORE_EXPORT hpx::future get_future_with_event(cudaStream_t, int); + HPX_CORE_EXPORT hpx::future get_future_with_event( + cudaStream_t stream, int device = 0); } // namespace detail }}} // namespace hpx::cuda::experimental From 12b3e3c3ac06a9981aebb740a935e95aad555bd4 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Mon, 19 Jun 2023 16:10:37 -0500 Subject: [PATCH 05/25] Fix format --- .../include/hpx/async_cuda/cuda_event.hpp | 36 ++++++++++--------- .../include/hpx/async_cuda/cuda_future.hpp | 6 ++-- .../async_cuda/src/cuda_event_callback.cpp | 6 ++-- libs/core/async_cuda/src/cuda_future.cpp | 3 +- libs/core/async_cuda/src/cuda_target.cpp | 3 +- 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 66659371fb4f..cca0859f920b 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -36,13 +36,14 @@ namespace hpx { namespace cuda { namespace experimental { check_cuda_error(cudaGetDeviceCount(&max_number_devices_)); HPX_ASSERT_MSG(max_number_devices_ > 0, "CUDA polling enabled and called, yet no CUDA device found!"); - for (int device = 0; device < max_number_devices_; device++) { - check_cuda_error(cudaSetDevice(device)); - free_lists_.emplace_back(initial_events_in_pool); - for (int i = 0; i < initial_events_in_pool; ++i) - { - add_event_to_pool(device); - } + for (int device = 0; device < max_number_devices_; device++) + { + check_cuda_error(cudaSetDevice(device)); + free_lists_.emplace_back(initial_events_in_pool); + for (int i = 0; i < initial_events_in_pool; ++i) + { + add_event_to_pool(device); + } } } @@ -51,16 +52,17 @@ namespace hpx { namespace cuda { namespace experimental { { HPX_ASSERT_MSG(free_lists_.size != max_number_devices_, "Number of CUDA event pools does not match the number of devices!"); - for (int device = 0; device < max_number_devices_; device++) { - check_cuda_error(cudaSetDevice(device)); - cudaEvent_t event; - bool ok = true; - while (ok) - { - ok = free_lists_[device].pop(event); - if (ok) - check_cuda_error(cudaEventDestroy(event)); - } + for (int device = 0; device < max_number_devices_; device++) + { + check_cuda_error(cudaSetDevice(device)); + cudaEvent_t event; + bool ok = true; + while (ok) + { + ok = free_lists_[device].pop(event); + if (ok) + check_cuda_error(cudaEventDestroy(event)); + } } } diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 1c4503d66b81..9633d40bf2bc 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -183,7 +183,8 @@ namespace hpx { namespace cuda { namespace experimental { // main API call to get a future from a stream using allocator, and the // specified mode template - hpx::future get_future(Allocator const& a, cudaStream_t stream, int device = 0) + hpx::future get_future( + Allocator const& a, cudaStream_t stream, int device = 0) { using shared_state = future_data; @@ -200,7 +201,8 @@ namespace hpx { namespace cuda { namespace experimental { unique_ptr p(traits::allocate(alloc, 1), hpx::util::allocator_deleter{alloc}); - traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream, device); + traits::construct( + alloc, p.get(), init_no_addref{}, alloc, stream, device); return hpx::traits::future_access>::create( p.release(), false); diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index b1439ff338bd..b472e5c55e23 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -116,7 +116,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { check_cuda_error(cudaSetDevice(device)); check_cuda_error(cudaEventRecord(event, stream)); - detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f), device}); + detail::add_to_event_callback_queue( + event_callback{event, HPX_MOVE(f), device}); } // Background progress function for async CUDA operations. Checks for completed @@ -179,7 +180,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { "active events", debug::dec<3>(get_number_of_active_events())); continuation.f(status); - pool.push(HPX_MOVE(continuation.event), continuation.device); + pool.push( + HPX_MOVE(continuation.event), continuation.device); return true; }), event_callback_vector.end()); diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp index fdf95bf8e96f..c797191e1474 100644 --- a/libs/core/async_cuda/src/cuda_future.cpp +++ b/libs/core/async_cuda/src/cuda_future.cpp @@ -20,6 +20,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { hpx::future get_future_with_event(cudaStream_t stream, int device) { - return get_future_with_event(hpx::util::internal_allocator<>{}, stream, device); + return get_future_with_event( + hpx::util::internal_allocator<>{}, stream, device); } }}}} // namespace hpx::cuda::experimental::detail diff --git a/libs/core/async_cuda/src/cuda_target.cpp b/libs/core/async_cuda/src/cuda_target.cpp index 29377b216b55..60ea190980f0 100644 --- a/libs/core/async_cuda/src/cuda_target.cpp +++ b/libs/core/async_cuda/src/cuda_target.cpp @@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future target::get_future_with_event() const { - return detail::get_future_with_event(handle_.get_stream(), handle_.get_device()); + return detail::get_future_with_event( + handle_.get_stream(), handle_.get_device()); } hpx::future target::get_future_with_callback() const From bf85ef6cd635a5cb51e0f402050545feb46a3cfa Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Mon, 19 Jun 2023 16:19:38 -0500 Subject: [PATCH 06/25] Add missing format fix --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index cca0859f920b..3f239416226a 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -51,7 +51,8 @@ namespace hpx { namespace cuda { namespace experimental { ~cuda_event_pool() { HPX_ASSERT_MSG(free_lists_.size != max_number_devices_, - "Number of CUDA event pools does not match the number of devices!"); + "Number of CUDA event pools does not match the number of " + "devices!"); for (int device = 0; device < max_number_devices_; device++) { check_cuda_error(cudaSetDevice(device)); From 6a24f16e8acd820c2a72a8f00f052a9f9c6e9fa4 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Mon, 19 Jun 2023 18:55:41 -0500 Subject: [PATCH 07/25] Add default device arguments --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++-- .../include/hpx/async_cuda/detail/cuda_event_callback.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 3f239416226a..1a7b8256f91e 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -67,7 +67,7 @@ namespace hpx { namespace cuda { namespace experimental { } } - inline bool pop(cudaEvent_t& event, int device) + inline bool pop(cudaEvent_t& event, int device = 0) { HPX_ASSERT_MSG(device > 0 && device < max_number_devices_, "Accessing CUDA event pool with invalid device ID!"); @@ -79,7 +79,7 @@ namespace hpx { namespace cuda { namespace experimental { return true; } - inline bool push(cudaEvent_t event, int device) + inline bool push(cudaEvent_t event, int device = 0) { HPX_ASSERT_MSG(device > 0 && device < max_number_devices_, "Accessing CUDA event pool with invalid device ID!"); diff --git a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp index 5c5592dbaded..e1e6ef49cb20 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp @@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { hpx::move_only_function; HPX_CORE_EXPORT void add_event_callback( - event_callback_function_type&& f, cudaStream_t stream, int device); + event_callback_function_type&& f, cudaStream_t stream, int device = 0); HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool); HPX_CORE_EXPORT void unregister_polling( From e665a9e4a495c5c9271641c89ee0fa64edb379ed Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Tue, 20 Jun 2023 09:13:33 -0500 Subject: [PATCH 08/25] Fix asserts --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 1a7b8256f91e..ff33a067f497 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -69,7 +69,7 @@ namespace hpx { namespace cuda { namespace experimental { inline bool pop(cudaEvent_t& event, int device = 0) { - HPX_ASSERT_MSG(device > 0 && device < max_number_devices_, + HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_, "Accessing CUDA event pool with invalid device ID!"); // pop an event off the pool, if that fails, create a new one while (!free_lists_[device].pop(event)) @@ -81,7 +81,7 @@ namespace hpx { namespace cuda { namespace experimental { inline bool push(cudaEvent_t event, int device = 0) { - HPX_ASSERT_MSG(device > 0 && device < max_number_devices_, + HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_, "Accessing CUDA event pool with invalid device ID!"); return free_lists_[device].push(event); } From 791cecd62be30d08209f6b0fd56efaa43ffa17d4 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 21 Jun 2023 13:57:21 -0500 Subject: [PATCH 09/25] Delete copy/move event pool constructors --- .../include/hpx/async_cuda/cuda_event.hpp | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index ff33a067f497..1783dd28721a 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -21,7 +21,7 @@ namespace hpx { namespace cuda { namespace experimental { // of them at startup. struct cuda_event_pool { - static constexpr int initial_events_in_pool = 128; + static constexpr std::size_t initial_events_in_pool = 128; static cuda_event_pool& get_event_pool() { @@ -29,24 +29,6 @@ namespace hpx { namespace cuda { namespace experimental { return event_pool_; } - // create a bunch of events on initialization - cuda_event_pool() - : max_number_devices_(0) - { - check_cuda_error(cudaGetDeviceCount(&max_number_devices_)); - HPX_ASSERT_MSG(max_number_devices_ > 0, - "CUDA polling enabled and called, yet no CUDA device found!"); - for (int device = 0; device < max_number_devices_; device++) - { - check_cuda_error(cudaSetDevice(device)); - free_lists_.emplace_back(initial_events_in_pool); - for (int i = 0; i < initial_events_in_pool; ++i) - { - add_event_to_pool(device); - } - } - } - // on destruction, all objects in stack will be freed ~cuda_event_pool() { @@ -86,7 +68,33 @@ namespace hpx { namespace cuda { namespace experimental { return free_lists_[device].push(event); } + // delete copy / move constructors + cuda_event_pool(cuda_event_pool&&) = delete; + cuda_event_pool& operator=(cuda_event_pool&&) = delete; + cuda_event_pool(const cuda_event_pool&) = delete; + cuda_event_pool& operator=(const cuda_event_pool&) = delete; + private: + // Private singleton constructor + // Creates a bunch of events on initialization + cuda_event_pool() + : max_number_devices_(0) + { + check_cuda_error(cudaGetDeviceCount(&max_number_devices_)); + HPX_ASSERT_MSG(max_number_devices_ > 0, + "CUDA polling enabled and called, yet no CUDA device found!"); + /* free_lists_.reserve(max_number_devices_); */ + for (int device = 0; device < max_number_devices_; device++) + { + check_cuda_error(cudaSetDevice(device)); + free_lists_.emplace_back(initial_events_in_pool); + for (std::size_t i = 0; i < initial_events_in_pool; ++i) + { + add_event_to_pool(device); + } + } + } + void add_event_to_pool(int device) { check_cuda_error(cudaSetDevice(device)); From d93ada24a541a0de315974d97676ce0b2bd95375 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 21 Jun 2023 14:07:16 -0500 Subject: [PATCH 10/25] Fix unused parameter warning --- .../include/hpx/async_cuda/cuda_future.hpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 9633d40bf2bc..85fff23cfdf5 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -128,7 +128,7 @@ namespace hpx { namespace cuda { namespace experimental { } future_data(init_no_addref no_addref, other_allocator const& alloc, - cudaStream_t stream, int device) + cudaStream_t stream) : lcos::detail::future_data_allocator(no_addref, alloc) , rt_(hpx::get_runtime_ptr()) @@ -201,8 +201,19 @@ namespace hpx { namespace cuda { namespace experimental { unique_ptr p(traits::allocate(alloc, 1), hpx::util::allocator_deleter{alloc}); - traits::construct( - alloc, p.get(), init_no_addref{}, alloc, stream, device); + static_assert(std::is_same_v || + std::is_same_v, + "get_future mode not supported!"); + if constexpr (std::is_same_v) + { + traits::construct( + alloc, p.get(), init_no_addref{}, alloc, stream, device); + } + else if constexpr (std::is_same_v) + { + traits::construct( + alloc, p.get(), init_no_addref{}, alloc, stream); + } return hpx::traits::future_access>::create( p.release(), false); @@ -214,7 +225,7 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future get_future_with_callback( Allocator const& a, cudaStream_t stream) { - return get_future(a, stream, 0); + return get_future(a, stream); } // ------------------------------------------------------------- From 7a5f5510abd19345440e650e4afa248979345eeb Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Thu, 22 Jun 2023 21:31:16 -0500 Subject: [PATCH 11/25] Switch to east const --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 1783dd28721a..07783c97e422 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -71,8 +71,8 @@ namespace hpx { namespace cuda { namespace experimental { // delete copy / move constructors cuda_event_pool(cuda_event_pool&&) = delete; cuda_event_pool& operator=(cuda_event_pool&&) = delete; - cuda_event_pool(const cuda_event_pool&) = delete; - cuda_event_pool& operator=(const cuda_event_pool&) = delete; + cuda_event_pool(cuda_event_pool const&) = delete; + cuda_event_pool& operator=(cuda_event_pool const&) = delete; private: // Private singleton constructor From b37f72a7096ca0d614a239551446453a770dd777 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Thu, 22 Jun 2023 21:31:50 -0500 Subject: [PATCH 12/25] Add -1 default parameter for device auto-detection --- .../include/hpx/async_cuda/cuda_future.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 85fff23cfdf5..7b906baa42ed 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -184,7 +184,7 @@ namespace hpx { namespace cuda { namespace experimental { // specified mode template hpx::future get_future( - Allocator const& a, cudaStream_t stream, int device = 0) + Allocator const& a, cudaStream_t stream, int device) { using shared_state = future_data; @@ -225,15 +225,19 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future get_future_with_callback( Allocator const& a, cudaStream_t stream) { - return get_future(a, stream); + // device id 0 will be dropped in callback mode - can be + // an arbitrary number here + return get_future(a, stream, 0); } // ------------------------------------------------------------- // main API call to get a future from a stream using allocator template hpx::future get_future_with_event( - Allocator const& a, cudaStream_t stream, int device = 0) + Allocator const& a, cudaStream_t stream, int device = -1) { + if (device == -1) + check_cuda_error(cudaGetDevice(&device)); return get_future(a, stream, device); } @@ -245,7 +249,7 @@ namespace hpx { namespace cuda { namespace experimental { // ------------------------------------------------------------- // non allocator version of : get future with an event set HPX_CORE_EXPORT hpx::future get_future_with_event( - cudaStream_t stream, int device = 0); + cudaStream_t stream, int device = -1); } // namespace detail }}} // namespace hpx::cuda::experimental From d1a81f0ea424eadd3ad732644cbe5adbbd745869 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Tue, 27 Jun 2023 11:14:46 -0500 Subject: [PATCH 13/25] Fix assert --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 07783c97e422..8b7b740a167b 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -1,6 +1,5 @@ // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2020 Teodor Nikolov -// // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -32,7 +31,7 @@ namespace hpx { namespace cuda { namespace experimental { // on destruction, all objects in stack will be freed ~cuda_event_pool() { - HPX_ASSERT_MSG(free_lists_.size != max_number_devices_, + HPX_ASSERT_MSG(free_lists_.size() != max_number_devices_, "Number of CUDA event pools does not match the number of " "devices!"); for (int device = 0; device < max_number_devices_; device++) From d1f59531535125d330fb0ef4e48fe279a82f9a45 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Tue, 27 Jun 2023 13:00:29 -0500 Subject: [PATCH 14/25] Restore original device after init --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 8b7b740a167b..6d249d2031e4 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -82,7 +82,8 @@ namespace hpx { namespace cuda { namespace experimental { check_cuda_error(cudaGetDeviceCount(&max_number_devices_)); HPX_ASSERT_MSG(max_number_devices_ > 0, "CUDA polling enabled and called, yet no CUDA device found!"); - /* free_lists_.reserve(max_number_devices_); */ + int original_device = 0; + check_cuda_error(cudaGetDevice(&original_device)); for (int device = 0; device < max_number_devices_; device++) { check_cuda_error(cudaSetDevice(device)); @@ -92,6 +93,9 @@ namespace hpx { namespace cuda { namespace experimental { add_event_to_pool(device); } } + // Restore original device -- avoids confusion about HPX switching + // devices for the current thread if only one device is used. + check_cuda_error(cudaSetDevice(original_device)); } void add_event_to_pool(int device) From 9b3388e92a463efd8da772d6be036a4a4b2101b9 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Tue, 27 Jun 2023 14:54:41 -0500 Subject: [PATCH 15/25] Remove superfluous cudaSetDevice --- libs/core/async_cuda/src/cuda_event_callback.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index b472e5c55e23..1b0267cd068b 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -113,7 +113,6 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { HPX_THROW_EXCEPTION(hpx::error::invalid_status, "add_event_callback", "could not get an event"); } - check_cuda_error(cudaSetDevice(device)); check_cuda_error(cudaEventRecord(event, stream)); detail::add_to_event_callback_queue( From ee33c48def919c1cd7ffed7e9092b7ee451ccd01 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 14:18:32 -0500 Subject: [PATCH 16/25] Put event pool singleton access definition in src --- libs/core/async_cuda/CMakeLists.txt | 2 +- .../include/hpx/async_cuda/cuda_event.hpp | 11 +++++------ libs/core/async_cuda/src/cuda_event.cpp | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 7 deletions(-) create mode 100644 libs/core/async_cuda/src/cuda_event.cpp diff --git a/libs/core/async_cuda/CMakeLists.txt b/libs/core/async_cuda/CMakeLists.txt index 77782a23bc47..b2a7954877bb 100644 --- a/libs/core/async_cuda/CMakeLists.txt +++ b/libs/core/async_cuda/CMakeLists.txt @@ -36,7 +36,7 @@ set(async_cuda_compat_headers # cmake-format: on set(async_cuda_sources cuda_event_callback.cpp cuda_future.cpp cuda_target.cpp - get_targets.cpp + get_targets.cpp cuda_event.cpp ) if(HPX_WITH_HIP AND TARGET roc::hipblas) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 6d249d2031e4..c399839a7a13 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -1,5 +1,7 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2020 Teodor Nikolov +// // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -22,16 +24,13 @@ namespace hpx { namespace cuda { namespace experimental { { static constexpr std::size_t initial_events_in_pool = 128; - static cuda_event_pool& get_event_pool() - { - static cuda_event_pool event_pool_; - return event_pool_; - } + HPX_CORE_EXPORT static cuda_event_pool& get_event_pool(); // on destruction, all objects in stack will be freed ~cuda_event_pool() { - HPX_ASSERT_MSG(free_lists_.size() != max_number_devices_, + HPX_ASSERT_MSG(free_lists_.size() != + static_cast(max_number_devices_), "Number of CUDA event pools does not match the number of " "devices!"); for (int device = 0; device < max_number_devices_; device++) diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp new file mode 100644 index 000000000000..809cf1fa892c --- /dev/null +++ b/libs/core/async_cuda/src/cuda_event.cpp @@ -0,0 +1,17 @@ +// Copyright (c) 2023 Gregor Daiß +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include + +namespace hpx { namespace cuda { namespace experimental { + cuda_event_pool& cuda_event_pool::get_event_pool() + { + static cuda_event_pool event_pool_; + return event_pool_; + } +}}} // namespace hpx::cuda::experimental From 707c0254e57bf10b00e84739a7d80afb2db45eee Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 16:08:11 -0500 Subject: [PATCH 17/25] Add basic multi gpu polling test --- .../core/async_cuda/tests/unit/CMakeLists.txt | 5 +- .../tests/unit/cuda_multi_device_polling.cpp | 83 +++++++++++++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp diff --git a/libs/core/async_cuda/tests/unit/CMakeLists.txt b/libs/core/async_cuda/tests/unit/CMakeLists.txt index ce6cf5a19ee2..b14933107024 100644 --- a/libs/core/async_cuda/tests/unit/CMakeLists.txt +++ b/libs/core/async_cuda/tests/unit/CMakeLists.txt @@ -4,16 +4,19 @@ # Distributed under the Boost Software License, Version 1.0. (See accompanying # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -set(tests cuda_future transform_stream) +set(tests cuda_future cuda_multi_device_polling transform_stream) if(HPX_WITH_GPUBLAS) set(benchmarks ${benchmarks} cublas_matmul) endif() set(cublas_matmul_PARAMETERS THREADS_PER_LOCALITY 4) set(cuda_future_PARAMETERS THREADS_PER_LOCALITY 4) +set(cuda_multi_device_polling_PARAMETERS THREADS_PER_LOCALITY 4) set(transform_stream_PARAMETERS THREADS_PER_LOCALITY 4) set(cuda_future_CUDA_SOURCE saxpy trivial_demo) +set(cuda_multi_device_polling_CUDA_SOURCE trivial_demo) + set(transform_stream_CUDA ON) foreach(test ${tests}) diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp new file mode 100644 index 000000000000..2200155055e9 --- /dev/null +++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp @@ -0,0 +1,83 @@ +// Copyright (c) 2023 Gregor Daiß +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// ------------------------------------------------------------------------- +// This example is similar to the unit/cuda_future.cpp example (hence it also uses +// the externally defined cuda_trivial_kernel. See unit/cuda_future.cpp for +// more details regarding this. +// +// This example extends unit/cuda_future.cpp by testing the cuda event polling +// on multiple devices (if available)! If the polling is not working correctly, +// the test will time out as some of the futures are never triggered. + +template +extern void cuda_trivial_kernel(T, cudaStream_t stream); + +extern void launch_saxpy_kernel( + hpx::cuda::experimental::cuda_executor& cudaexec, unsigned int& blocks, + unsigned int& threads, void** args); + + +// ------------------------------------------------------------------------- +int hpx_main(hpx::program_options::variables_map& vm) +{ + // install cuda future polling handler + hpx::cuda::experimental::enable_user_polling poll("default"); + + // Print all targets for debug purposes + hpx::cuda::experimental::print_local_targets(); + + int number_devices = 0; + hpx::cuda::experimental::check_cuda_error(cudaGetDeviceCount(&number_devices)); + HPX_ASSERT(number_devices > 0); + + std::vector> futs(number_devices); + + for (auto device_id = 0; device_id < number_devices; device_id++) { + hpx::cuda::experimental::cuda_executor exec( + device_id, hpx::cuda::experimental::event_mode{}); + auto fut = hpx::async(exec, cuda_trivial_kernel, + static_cast(device_id) + 1); + futs[device_id] = fut.then([device_id](hpx::future&&) { + std::cout + << "Continuation for kernel future triggered on device executor " + << device_id << std::endl; + }); + } + + auto final_fut = hpx::when_all(futs); + std::cout << "All executor test kernels launched! " << std::endl; + final_fut.get(); + std::cout << "All executor test kernels finished! " << std::endl; + + return hpx::local::finalize(); +} + +// ------------------------------------------------------------------------- +int main(int argc, char** argv) +{ + std::cout << "[HPX Cuda multi device polling] - Starting...\n" << std::endl; + + hpx::local::init_params init_args; + + auto result = hpx::local::init(hpx_main, argc, argv, init_args); + return result || hpx::util::report_errors(); +} From 8d7f6367ceff3e427243cbd9db2c0a7fe97d2483 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 16:51:55 -0500 Subject: [PATCH 18/25] Fix assert (again) --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index c399839a7a13..fa2c8d80b9e4 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -29,7 +29,7 @@ namespace hpx { namespace cuda { namespace experimental { // on destruction, all objects in stack will be freed ~cuda_event_pool() { - HPX_ASSERT_MSG(free_lists_.size() != + HPX_ASSERT_MSG(free_lists_.size() == static_cast(max_number_devices_), "Number of CUDA event pools does not match the number of " "devices!"); From a7e76e5602e1926b4fcada7c7fab15daf12b3062 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 17:12:27 -0500 Subject: [PATCH 19/25] Add test for default device ID --- .../tests/unit/cuda_multi_device_polling.cpp | 47 ++++++++++++++----- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp index 2200155055e9..636c610ca952 100644 --- a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp +++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp @@ -46,28 +46,49 @@ int hpx_main(hpx::program_options::variables_map& vm) hpx::cuda::experimental::print_local_targets(); int number_devices = 0; - hpx::cuda::experimental::check_cuda_error(cudaGetDeviceCount(&number_devices)); + hpx::cuda::experimental::check_cuda_error( + cudaGetDeviceCount(&number_devices)); HPX_ASSERT(number_devices > 0); + // Check if the futures complete when using executors on all devices std::vector> futs(number_devices); - - for (auto device_id = 0; device_id < number_devices; device_id++) { - hpx::cuda::experimental::cuda_executor exec( - device_id, hpx::cuda::experimental::event_mode{}); - auto fut = hpx::async(exec, cuda_trivial_kernel, - static_cast(device_id) + 1); - futs[device_id] = fut.then([device_id](hpx::future&&) { - std::cout - << "Continuation for kernel future triggered on device executor " - << device_id << std::endl; - }); + for (auto device_id = 0; device_id < number_devices; device_id++) + { + hpx::cuda::experimental::cuda_executor exec( + device_id, hpx::cuda::experimental::event_mode{}); + auto fut = hpx::async(exec, cuda_trivial_kernel, + static_cast(device_id) + 1); + futs[device_id] = fut.then([device_id](hpx::future&&) { + std::cout << "Continuation for kernel future triggered on device " + "executor " + << device_id << std::endl; + }); } - auto final_fut = hpx::when_all(futs); std::cout << "All executor test kernels launched! " << std::endl; final_fut.get(); std::cout << "All executor test kernels finished! " << std::endl; + // Test to see if HPX correctly picks up the current device in case + // get_future_with_event is not given a device_id + for (auto device_id = 0; device_id < number_devices; device_id++) + { + hpx::cuda::experimental::check_cuda_error(cudaSetDevice(device_id)); + cudaStream_t device_stream; + hpx::cuda::experimental::check_cuda_error( + cudaStreamCreate(&device_stream)); + cuda_trivial_kernel( + number_devices + device_id + 1, device_stream); + auto fut = hpx::cuda::experimental::detail::get_future_with_event( + device_stream); + fut.get(); + std::cout + << "get_future_with_event default ID test finished on device " + << device_id << std::endl; + hpx::cuda::experimental::check_cuda_error( + cudaStreamDestroy(device_stream)); + } + return hpx::local::finalize(); } From 525ad21d1681da5f84eb149b9ba00608841bef98 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 17:13:50 -0500 Subject: [PATCH 20/25] Fix some format issues / update file author lists --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++-- libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp | 5 +++-- libs/core/async_cuda/src/cuda_event.cpp | 2 +- libs/core/async_cuda/src/cuda_event_callback.cpp | 1 + libs/core/async_cuda/src/cuda_future.cpp | 1 + 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index fa2c8d80b9e4..31c6cc768f85 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -73,8 +73,8 @@ namespace hpx { namespace cuda { namespace experimental { cuda_event_pool& operator=(cuda_event_pool const&) = delete; private: - // Private singleton constructor - // Creates a bunch of events on initialization + // Private singleton constructor. Creates a initial_events_in_pool of + // events on initialization cuda_event_pool() : max_number_devices_(0) { diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 7b906baa42ed..62fc91c2d1d4 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -1,3 +1,4 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2016 Thomas Heller // Copyright (c) 2016 Hartmut Kaiser @@ -236,8 +237,8 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future get_future_with_event( Allocator const& a, cudaStream_t stream, int device = -1) { - if (device == -1) - check_cuda_error(cudaGetDevice(&device)); + if (device == -1) + check_cuda_error(cudaGetDevice(&device)); return get_future(a, stream, device); } diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp index 809cf1fa892c..470dbf5533d4 100644 --- a/libs/core/async_cuda/src/cuda_event.cpp +++ b/libs/core/async_cuda/src/cuda_event.cpp @@ -8,7 +8,7 @@ #include #include -namespace hpx { namespace cuda { namespace experimental { +namespace hpx { namespace cuda { namespace experimental { cuda_event_pool& cuda_event_pool::get_event_pool() { static cuda_event_pool event_pool_; diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index 1b0267cd068b..9597656ecbe6 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -1,3 +1,4 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2021 ETH Zurich // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2016 Hartmut Kaiser diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp index c797191e1474..545b1dc9b77c 100644 --- a/libs/core/async_cuda/src/cuda_future.cpp +++ b/libs/core/async_cuda/src/cuda_future.cpp @@ -1,3 +1,4 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2016 Hartmut Kaiser // Copyright (c) 2016 Thomas Heller From 3999ecc8013ac35e01ca000ac7d1a813e2e124e8 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 17:30:42 -0500 Subject: [PATCH 21/25] Fix test format --- .../async_cuda/tests/unit/cuda_multi_device_polling.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp index 636c610ca952..24493aa58992 100644 --- a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp +++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp @@ -35,7 +35,6 @@ extern void launch_saxpy_kernel( hpx::cuda::experimental::cuda_executor& cudaexec, unsigned int& blocks, unsigned int& threads, void** args); - // ------------------------------------------------------------------------- int hpx_main(hpx::program_options::variables_map& vm) { @@ -82,9 +81,8 @@ int hpx_main(hpx::program_options::variables_map& vm) auto fut = hpx::cuda::experimental::detail::get_future_with_event( device_stream); fut.get(); - std::cout - << "get_future_with_event default ID test finished on device " - << device_id << std::endl; + std::cout << "get_future_with_event default ID test finished on device " + << device_id << std::endl; hpx::cuda::experimental::check_cuda_error( cudaStreamDestroy(device_stream)); } From 46b477a47e2f621932fe6b68214ead7d5640ec1e Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 22:57:46 -0500 Subject: [PATCH 22/25] Cal setdevice when creating events on-demand --- .../core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 31c6cc768f85..5a13a911c857 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -54,7 +54,15 @@ namespace hpx { namespace cuda { namespace experimental { // pop an event off the pool, if that fails, create a new one while (!free_lists_[device].pop(event)) { + // Set correct device in case if required + int original_device = -1; + check_cuda_error(cudaGetDevice(&original_device)); + if (original_device != device) + check_cuda_error(cudaSetDevice(device)); add_event_to_pool(device); + // reset to original device if required + if (original_device != device) + check_cuda_error(cudaSetDevice(original_device)); } return true; } From 4eb0b394e9999382fb355c055da44bc45087475e Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Wed, 28 Jun 2023 23:17:03 -0500 Subject: [PATCH 23/25] Fix format (again) --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 5a13a911c857..727379a1db09 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -60,9 +60,9 @@ namespace hpx { namespace cuda { namespace experimental { if (original_device != device) check_cuda_error(cudaSetDevice(device)); add_event_to_pool(device); - // reset to original device if required + // reset to original device if required if (original_device != device) - check_cuda_error(cudaSetDevice(original_device)); + check_cuda_error(cudaSetDevice(original_device)); } return true; } From e547c24a9ba4cc95da13b40e320ce3ad81e3ee29 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Thu, 29 Jun 2023 11:33:26 -0500 Subject: [PATCH 24/25] Remove superfluous api call --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index 727379a1db09..f9c7da830cbb 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -54,11 +54,9 @@ namespace hpx { namespace cuda { namespace experimental { // pop an event off the pool, if that fails, create a new one while (!free_lists_[device].pop(event)) { - // Set correct device in case if required + // Save current device int original_device = -1; check_cuda_error(cudaGetDevice(&original_device)); - if (original_device != device) - check_cuda_error(cudaSetDevice(device)); add_event_to_pool(device); // reset to original device if required if (original_device != device) From 9f0d330d69770be4613908130dc4af94a5477779 Mon Sep 17 00:00:00 2001 From: Gregor Daiss Date: Thu, 6 Jul 2023 12:51:50 -0500 Subject: [PATCH 25/25] Fix inspect --- libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp | 4 ++++ libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp | 2 ++ libs/core/async_cuda/src/cuda_event.cpp | 2 ++ libs/core/async_cuda/src/cuda_event_callback.cpp | 2 ++ libs/core/async_cuda/src/cuda_future.cpp | 2 ++ libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp | 3 +++ 6 files changed, 15 insertions(+) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index f9c7da830cbb..ce4db339969f 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -5,11 +5,15 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #pragma once +#include #include +#include #include #include #include diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 62fc91c2d1d4..e5e617c47171 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -6,6 +6,8 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #pragma once diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp index 470dbf5533d4..1425937a8f29 100644 --- a/libs/core/async_cuda/src/cuda_event.cpp +++ b/libs/core/async_cuda/src/cuda_event.cpp @@ -3,6 +3,8 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #include #include diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index 9597656ecbe6..90698b402a98 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -7,6 +7,8 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #include #include diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp index 545b1dc9b77c..accb87e1ff89 100644 --- a/libs/core/async_cuda/src/cuda_future.cpp +++ b/libs/core/async_cuda/src/cuda_future.cpp @@ -6,6 +6,8 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #include #include diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp index 24493aa58992..ca1ee5579c7f 100644 --- a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp +++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp @@ -3,8 +3,11 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #include +#include #include #include #include