diff --git a/libs/core/async_cuda/CMakeLists.txt b/libs/core/async_cuda/CMakeLists.txt index 77782a23bc47..b2a7954877bb 100644 --- a/libs/core/async_cuda/CMakeLists.txt +++ b/libs/core/async_cuda/CMakeLists.txt @@ -36,7 +36,7 @@ set(async_cuda_compat_headers # cmake-format: on set(async_cuda_sources cuda_event_callback.cpp cuda_future.cpp cuda_target.cpp - get_targets.cpp + get_targets.cpp cuda_event.cpp ) if(HPX_WITH_HIP AND TARGET roc::hipblas) diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp index f2ae7ef59f5e..ce4db339969f 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp @@ -1,12 +1,19 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2020 Teodor Nikolov // // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #pragma once +#include +#include + +#include #include #include #include @@ -19,55 +26,90 @@ namespace hpx { namespace cuda { namespace experimental { // of them at startup. struct cuda_event_pool { - static constexpr int initial_events_in_pool = 128; - - static cuda_event_pool& get_event_pool() - { - static cuda_event_pool event_pool_; - return event_pool_; - } + static constexpr std::size_t initial_events_in_pool = 128; - // create a bunch of events on initialization - cuda_event_pool() - : free_list_(initial_events_in_pool) - { - for (int i = 0; i < initial_events_in_pool; ++i) - { - add_event_to_pool(); - } - } + HPX_CORE_EXPORT static cuda_event_pool& get_event_pool(); // on destruction, all objects in stack will be freed ~cuda_event_pool() { - cudaEvent_t event; - bool ok = true; - while (ok) + HPX_ASSERT_MSG(free_lists_.size() == + static_cast(max_number_devices_), + "Number of CUDA event pools does not match the number of " + "devices!"); + for (int device = 0; device < max_number_devices_; device++) { - ok = free_list_.pop(event); - if (ok) - check_cuda_error(cudaEventDestroy(event)); + check_cuda_error(cudaSetDevice(device)); + cudaEvent_t event; + bool ok = true; + while (ok) + { + ok = free_lists_[device].pop(event); + if (ok) + check_cuda_error(cudaEventDestroy(event)); + } } } - inline bool pop(cudaEvent_t& event) + inline bool pop(cudaEvent_t& event, int device = 0) { + HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_, + "Accessing CUDA event pool with invalid device ID!"); // pop an event off the pool, if that fails, create a new one - while (!free_list_.pop(event)) + while (!free_lists_[device].pop(event)) { - add_event_to_pool(); + // Save current device + int original_device = -1; + check_cuda_error(cudaGetDevice(&original_device)); + add_event_to_pool(device); + // reset to original device if required + if (original_device != device) + check_cuda_error(cudaSetDevice(original_device)); } return true; } - inline bool push(cudaEvent_t event) + inline bool push(cudaEvent_t event, int device = 0) { - return free_list_.push(event); + HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_, + "Accessing CUDA event pool with invalid device ID!"); + return free_lists_[device].push(event); } + // delete copy / move constructors + cuda_event_pool(cuda_event_pool&&) = delete; + cuda_event_pool& operator=(cuda_event_pool&&) = delete; + cuda_event_pool(cuda_event_pool const&) = delete; + cuda_event_pool& operator=(cuda_event_pool const&) = delete; + private: - void add_event_to_pool() + // Private singleton constructor. Creates a initial_events_in_pool of + // events on initialization + cuda_event_pool() + : max_number_devices_(0) + { + check_cuda_error(cudaGetDeviceCount(&max_number_devices_)); + HPX_ASSERT_MSG(max_number_devices_ > 0, + "CUDA polling enabled and called, yet no CUDA device found!"); + int original_device = 0; + check_cuda_error(cudaGetDevice(&original_device)); + for (int device = 0; device < max_number_devices_; device++) + { + check_cuda_error(cudaSetDevice(device)); + free_lists_.emplace_back(initial_events_in_pool); + for (std::size_t i = 0; i < initial_events_in_pool; ++i) + { + add_event_to_pool(device); + } + } + // Restore original device -- avoids confusion about HPX switching + // devices for the current thread if only one device is used. + check_cuda_error(cudaSetDevice(original_device)); + } + + void add_event_to_pool(int device) { + check_cuda_error(cudaSetDevice(device)); cudaEvent_t event; // Create an cuda_event to query a CUDA/CUBLAS kernel for completion. // Timing is disabled for performance. [1] @@ -75,10 +117,11 @@ namespace hpx { namespace cuda { namespace experimental { // [1]: CUDA Runtime API, section 5.5 cuda_event Management check_cuda_error( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - free_list_.push(event); + free_lists_[device].push(event); } + int max_number_devices_; - // pool is dynamically sized and can grow if needed - hpx::lockfree::stack free_list_; + // One pool per GPU - each pool is dynamically sized and can grow if needed + std::deque> free_lists_; }; }}} // namespace hpx::cuda::experimental diff --git a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp index 042b8199dbeb..e5e617c47171 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp @@ -1,3 +1,4 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2016 Thomas Heller // Copyright (c) 2016 Hartmut Kaiser @@ -5,6 +6,8 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #pragma once @@ -81,7 +84,7 @@ namespace hpx { namespace cuda { namespace experimental { future_data() {} future_data(init_no_addref no_addref, other_allocator const& alloc, - cudaStream_t stream) + cudaStream_t stream, int device) : lcos::detail::future_data_allocator(no_addref, alloc) { @@ -104,7 +107,7 @@ namespace hpx { namespace cuda { namespace experimental { status))); } }, - stream); + stream, device); } }; @@ -183,7 +186,8 @@ namespace hpx { namespace cuda { namespace experimental { // main API call to get a future from a stream using allocator, and the // specified mode template - hpx::future get_future(Allocator const& a, cudaStream_t stream) + hpx::future get_future( + Allocator const& a, cudaStream_t stream, int device) { using shared_state = future_data; @@ -200,7 +204,19 @@ namespace hpx { namespace cuda { namespace experimental { unique_ptr p(traits::allocate(alloc, 1), hpx::util::allocator_deleter{alloc}); - traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream); + static_assert(std::is_same_v || + std::is_same_v, + "get_future mode not supported!"); + if constexpr (std::is_same_v) + { + traits::construct( + alloc, p.get(), init_no_addref{}, alloc, stream, device); + } + else if constexpr (std::is_same_v) + { + traits::construct( + alloc, p.get(), init_no_addref{}, alloc, stream); + } return hpx::traits::future_access>::create( p.release(), false); @@ -212,16 +228,20 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future get_future_with_callback( Allocator const& a, cudaStream_t stream) { - return get_future(a, stream); + // device id 0 will be dropped in callback mode - can be + // an arbitrary number here + return get_future(a, stream, 0); } // ------------------------------------------------------------- // main API call to get a future from a stream using allocator template hpx::future get_future_with_event( - Allocator const& a, cudaStream_t stream) + Allocator const& a, cudaStream_t stream, int device = -1) { - return get_future(a, stream); + if (device == -1) + check_cuda_error(cudaGetDevice(&device)); + return get_future(a, stream, device); } // ------------------------------------------------------------- @@ -231,7 +251,8 @@ namespace hpx { namespace cuda { namespace experimental { // ------------------------------------------------------------- // non allocator version of : get future with an event set - HPX_CORE_EXPORT hpx::future get_future_with_event(cudaStream_t); + HPX_CORE_EXPORT hpx::future get_future_with_event( + cudaStream_t stream, int device = -1); } // namespace detail }}} // namespace hpx::cuda::experimental diff --git a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp index fde4361569d5..e1e6ef49cb20 100644 --- a/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp +++ b/libs/core/async_cuda/include/hpx/async_cuda/detail/cuda_event_callback.hpp @@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { hpx::move_only_function; HPX_CORE_EXPORT void add_event_callback( - event_callback_function_type&& f, cudaStream_t stream); + event_callback_function_type&& f, cudaStream_t stream, int device = 0); HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool); HPX_CORE_EXPORT void unregister_polling( diff --git a/libs/core/async_cuda/src/cuda_event.cpp b/libs/core/async_cuda/src/cuda_event.cpp new file mode 100644 index 000000000000..1425937a8f29 --- /dev/null +++ b/libs/core/async_cuda/src/cuda_event.cpp @@ -0,0 +1,19 @@ +// Copyright (c) 2023 Gregor Daiß +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii + +#include +#include +#include + +namespace hpx { namespace cuda { namespace experimental { + cuda_event_pool& cuda_event_pool::get_event_pool() + { + static cuda_event_pool event_pool_; + return event_pool_; + } +}}} // namespace hpx::cuda::experimental diff --git a/libs/core/async_cuda/src/cuda_event_callback.cpp b/libs/core/async_cuda/src/cuda_event_callback.cpp index 0d53bd4af3f7..90698b402a98 100644 --- a/libs/core/async_cuda/src/cuda_event_callback.cpp +++ b/libs/core/async_cuda/src/cuda_event_callback.cpp @@ -1,3 +1,4 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2021 ETH Zurich // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2016 Hartmut Kaiser @@ -6,6 +7,8 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #include #include @@ -50,6 +53,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { { cudaEvent_t event; event_callback_function_type f; + int device; }; using event_callback_queue_type = @@ -104,17 +108,18 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { } void add_event_callback( - event_callback_function_type&& f, cudaStream_t stream) + event_callback_function_type&& f, cudaStream_t stream, int device) { cudaEvent_t event; - if (!cuda_event_pool::get_event_pool().pop(event)) + if (!cuda_event_pool::get_event_pool().pop(event, device)) { HPX_THROW_EXCEPTION(hpx::error::invalid_status, "add_event_callback", "could not get an event"); } check_cuda_error(cudaEventRecord(event, stream)); - detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)}); + detail::add_to_event_callback_queue( + event_callback{event, HPX_MOVE(f), device}); } // Background progress function for async CUDA operations. Checks for completed @@ -177,7 +182,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { "active events", debug::dec<3>(get_number_of_active_events())); continuation.f(status); - pool.push(HPX_MOVE(continuation.event)); + pool.push( + HPX_MOVE(continuation.event), continuation.device); return true; }), event_callback_vector.end()); @@ -199,7 +205,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { "active events", debug::dec<3>(get_number_of_active_events())); continuation.f(status); - pool.push(HPX_MOVE(continuation.event)); + pool.push(HPX_MOVE(continuation.event), continuation.device); } } diff --git a/libs/core/async_cuda/src/cuda_future.cpp b/libs/core/async_cuda/src/cuda_future.cpp index 6b1dfbbaa436..accb87e1ff89 100644 --- a/libs/core/async_cuda/src/cuda_future.cpp +++ b/libs/core/async_cuda/src/cuda_future.cpp @@ -1,3 +1,4 @@ +// Copyright (c) 2023 Gregor Daiß // Copyright (c) 2020 John Biddiscombe // Copyright (c) 2016 Hartmut Kaiser // Copyright (c) 2016 Thomas Heller @@ -5,6 +6,8 @@ // SPDX-License-Identifier: BSL-1.0 // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii #include #include @@ -18,8 +21,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail { hpx::util::internal_allocator<>{}, stream); } - hpx::future get_future_with_event(cudaStream_t stream) + hpx::future get_future_with_event(cudaStream_t stream, int device) { - return get_future_with_event(hpx::util::internal_allocator<>{}, stream); + return get_future_with_event( + hpx::util::internal_allocator<>{}, stream, device); } }}}} // namespace hpx::cuda::experimental::detail diff --git a/libs/core/async_cuda/src/cuda_target.cpp b/libs/core/async_cuda/src/cuda_target.cpp index 6906308ebc62..60ea190980f0 100644 --- a/libs/core/async_cuda/src/cuda_target.cpp +++ b/libs/core/async_cuda/src/cuda_target.cpp @@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental { hpx::future target::get_future_with_event() const { - return detail::get_future_with_event(handle_.get_stream()); + return detail::get_future_with_event( + handle_.get_stream(), handle_.get_device()); } hpx::future target::get_future_with_callback() const diff --git a/libs/core/async_cuda/tests/unit/CMakeLists.txt b/libs/core/async_cuda/tests/unit/CMakeLists.txt index ce6cf5a19ee2..b14933107024 100644 --- a/libs/core/async_cuda/tests/unit/CMakeLists.txt +++ b/libs/core/async_cuda/tests/unit/CMakeLists.txt @@ -4,16 +4,19 @@ # Distributed under the Boost Software License, Version 1.0. (See accompanying # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) -set(tests cuda_future transform_stream) +set(tests cuda_future cuda_multi_device_polling transform_stream) if(HPX_WITH_GPUBLAS) set(benchmarks ${benchmarks} cublas_matmul) endif() set(cublas_matmul_PARAMETERS THREADS_PER_LOCALITY 4) set(cuda_future_PARAMETERS THREADS_PER_LOCALITY 4) +set(cuda_multi_device_polling_PARAMETERS THREADS_PER_LOCALITY 4) set(transform_stream_PARAMETERS THREADS_PER_LOCALITY 4) set(cuda_future_CUDA_SOURCE saxpy trivial_demo) +set(cuda_multi_device_polling_CUDA_SOURCE trivial_demo) + set(transform_stream_CUDA ON) foreach(test ${tests}) diff --git a/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp new file mode 100644 index 000000000000..ca1ee5579c7f --- /dev/null +++ b/libs/core/async_cuda/tests/unit/cuda_multi_device_polling.cpp @@ -0,0 +1,105 @@ +// Copyright (c) 2023 Gregor Daiß +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// hpxinspect:noascii + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// ------------------------------------------------------------------------- +// This example is similar to the unit/cuda_future.cpp example (hence it also uses +// the externally defined cuda_trivial_kernel. See unit/cuda_future.cpp for +// more details regarding this. +// +// This example extends unit/cuda_future.cpp by testing the cuda event polling +// on multiple devices (if available)! If the polling is not working correctly, +// the test will time out as some of the futures are never triggered. + +template +extern void cuda_trivial_kernel(T, cudaStream_t stream); + +extern void launch_saxpy_kernel( + hpx::cuda::experimental::cuda_executor& cudaexec, unsigned int& blocks, + unsigned int& threads, void** args); + +// ------------------------------------------------------------------------- +int hpx_main(hpx::program_options::variables_map& vm) +{ + // install cuda future polling handler + hpx::cuda::experimental::enable_user_polling poll("default"); + + // Print all targets for debug purposes + hpx::cuda::experimental::print_local_targets(); + + int number_devices = 0; + hpx::cuda::experimental::check_cuda_error( + cudaGetDeviceCount(&number_devices)); + HPX_ASSERT(number_devices > 0); + + // Check if the futures complete when using executors on all devices + std::vector> futs(number_devices); + for (auto device_id = 0; device_id < number_devices; device_id++) + { + hpx::cuda::experimental::cuda_executor exec( + device_id, hpx::cuda::experimental::event_mode{}); + auto fut = hpx::async(exec, cuda_trivial_kernel, + static_cast(device_id) + 1); + futs[device_id] = fut.then([device_id](hpx::future&&) { + std::cout << "Continuation for kernel future triggered on device " + "executor " + << device_id << std::endl; + }); + } + auto final_fut = hpx::when_all(futs); + std::cout << "All executor test kernels launched! " << std::endl; + final_fut.get(); + std::cout << "All executor test kernels finished! " << std::endl; + + // Test to see if HPX correctly picks up the current device in case + // get_future_with_event is not given a device_id + for (auto device_id = 0; device_id < number_devices; device_id++) + { + hpx::cuda::experimental::check_cuda_error(cudaSetDevice(device_id)); + cudaStream_t device_stream; + hpx::cuda::experimental::check_cuda_error( + cudaStreamCreate(&device_stream)); + cuda_trivial_kernel( + number_devices + device_id + 1, device_stream); + auto fut = hpx::cuda::experimental::detail::get_future_with_event( + device_stream); + fut.get(); + std::cout << "get_future_with_event default ID test finished on device " + << device_id << std::endl; + hpx::cuda::experimental::check_cuda_error( + cudaStreamDestroy(device_stream)); + } + + return hpx::local::finalize(); +} + +// ------------------------------------------------------------------------- +int main(int argc, char** argv) +{ + std::cout << "[HPX Cuda multi device polling] - Starting...\n" << std::endl; + + hpx::local::init_params init_args; + + auto result = hpx::local::init(hpx_main, argc, argv, init_args); + return result || hpx::util::report_errors(); +}