Add CUDA/HIP MultiGPU Event Polling #6284

Merged Jul 8, 2023 (27 commits)
Changes from 9 commits

Commits:
9f6dfca  Initial hardcoded version (G-071, Jun 19, 2023)
e91c81a  Detect number of GPUs (G-071, Jun 19, 2023)
25494a2  Cleanup (G-071, Jun 19, 2023)
64ecdfc  Add missing default value (G-071, Jun 19, 2023)
12b3e3c  Fix format (G-071, Jun 19, 2023)
bf85ef6  Add missing format fix (G-071, Jun 19, 2023)
6a24f16  Add default device arguments (G-071, Jun 19, 2023)
e665a9e  Fix asserts (G-071, Jun 20, 2023)
791cecd  Delete copy/move event pool constructors (G-071, Jun 21, 2023)
d93ada2  Fix unused parameter warning (G-071, Jun 21, 2023)
7a5f551  Switch to east const (G-071, Jun 23, 2023)
b37f72a  Add -1 default parameter for device auto-detection (G-071, Jun 23, 2023)
cf60025  Merge branch 'STEllAR-GROUP:master' into add-multigpu-polling (G-071, Jun 26, 2023)
df94966  Merge branch 'STEllAR-GROUP:master' into add-multigpu-polling (G-071, Jun 27, 2023)
d1a81f0  Fix assert (G-071, Jun 27, 2023)
d1f5953  Restore original device after init (G-071, Jun 27, 2023)
9b3388e  Remove superfluous cudaSetDevice (G-071, Jun 27, 2023)
ee33c48  Put event pool singleton access definition in src (G-071, Jun 28, 2023)
707c025  Add basic multi gpu polling test (G-071, Jun 28, 2023)
8d7f636  Fix assert (again) (G-071, Jun 28, 2023)
a7e76e5  Add test for default device ID (G-071, Jun 28, 2023)
525ad21  Fix some format issues / update file author lists (G-071, Jun 28, 2023)
3999ecc  Fix test format (G-071, Jun 28, 2023)
46b477a  Cal setdevice when creating events on-demand (G-071, Jun 29, 2023)
4eb0b39  Fix format (again) (G-071, Jun 29, 2023)
e547c24  Remove superfluous api call (G-071, Jun 29, 2023)
9f0d330  Fix inspect (G-071, Jul 6, 2023)
83 changes: 57 additions & 26 deletions libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
@@ -7,6 +7,8 @@

#pragma once

#include <cstddef>
#include <deque>

#include <hpx/async_cuda/cuda_exception.hpp>
#include <hpx/async_cuda/custom_gpu_api.hpp>
#include <hpx/concurrency/stack.hpp>
@@ -19,66 +21,95 @@ namespace hpx { namespace cuda { namespace experimental {
// of them at startup.
struct cuda_event_pool
{
static constexpr int initial_events_in_pool = 128;
static constexpr std::size_t initial_events_in_pool = 128;

static cuda_event_pool& get_event_pool()
{
static cuda_event_pool event_pool_;
return event_pool_;
}
Review comment (Member):
May I suggest that you move the (HPX_CORE_EXPORTed) implementation of this function into a source file? The rationale is that otherwise on some platforms (Mac, Windows) each executable module (shared library, executable) that will contain code calling this function will have its own instance of the event_pool_ variable.
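A minimal sketch of that split, with a hypothetical cuda_event_pool.cpp next to the header (the PR adopts this later in commit ee33c48, "Put event pool singleton access definition in src"):

    // cuda_event_pool.hpp - declare the accessor only
    struct cuda_event_pool
    {
        static HPX_CORE_EXPORT cuda_event_pool& get_event_pool();
        // ... rest of the interface as above ...
    };

    // cuda_event_pool.cpp - single definition, so every module shares one instance
    cuda_event_pool& cuda_event_pool::get_event_pool()
    {
        static cuda_event_pool event_pool_;
        return event_pool_;
    }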


// create a bunch of events on initialization
cuda_event_pool()
: free_list_(initial_events_in_pool)
{
for (int i = 0; i < initial_events_in_pool; ++i)
{
add_event_to_pool();
}
}

// on destruction, all objects in stack will be freed
~cuda_event_pool()
{
cudaEvent_t event;
bool ok = true;
while (ok)
HPX_ASSERT_MSG(free_lists_.size() == static_cast<std::size_t>(max_number_devices_),
"Number of CUDA event pools does not match the number of "
"devices!");
for (int device = 0; device < max_number_devices_; device++)
{
ok = free_list_.pop(event);
if (ok)
check_cuda_error(cudaEventDestroy(event));
check_cuda_error(cudaSetDevice(device));
cudaEvent_t event;
bool ok = true;
while (ok)
{
ok = free_lists_[device].pop(event);
if (ok)
check_cuda_error(cudaEventDestroy(event));
}
}
}

inline bool pop(cudaEvent_t& event)
inline bool pop(cudaEvent_t& event, int device = 0)
{
HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
"Accessing CUDA event pool with invalid device ID!");
// pop an event off the pool, if that fails, create a new one
while (!free_list_.pop(event))
while (!free_lists_[device].pop(event))
{
add_event_to_pool();
add_event_to_pool(device);
}
return true;
}

inline bool push(cudaEvent_t event)
inline bool push(cudaEvent_t event, int device = 0)
{
return free_list_.push(event);
HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
"Accessing CUDA event pool with invalid device ID!");
return free_lists_[device].push(event);
}

// delete copy / move constructors
cuda_event_pool(cuda_event_pool&&) = delete;
cuda_event_pool& operator=(cuda_event_pool&&) = delete;
cuda_event_pool(const cuda_event_pool&) = delete;
cuda_event_pool& operator=(const cuda_event_pool&) = delete;

private:
void add_event_to_pool()
// Private singleton constructor
// Creates a bunch of events on initialization
cuda_event_pool()
: max_number_devices_(0)
{
check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
HPX_ASSERT_MSG(max_number_devices_ > 0,
"CUDA polling enabled and called, yet no CUDA device found!");
/* free_lists_.reserve(max_number_devices_); */
for (int device = 0; device < max_number_devices_; device++)
{
check_cuda_error(cudaSetDevice(device));
free_lists_.emplace_back(initial_events_in_pool);
for (std::size_t i = 0; i < initial_events_in_pool; ++i)
{
add_event_to_pool(device);
}
}
}

void add_event_to_pool(int device)
{
check_cuda_error(cudaSetDevice(device));
cudaEvent_t event;
// Create a cuda_event to query a CUDA/CUBLAS kernel for completion.
// Timing is disabled for performance. [1]
//
// [1]: CUDA Runtime API, section 5.5 cuda_event Management
check_cuda_error(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
free_list_.push(event);
free_lists_[device].push(event);
}
int max_number_devices_;

// pool is dynamically sized and can grow if needed
hpx::lockfree::stack<cudaEvent_t> free_list_;
// One pool per GPU - each pool is dynamically sized and can grow if needed
std::deque<hpx::lockfree::stack<cudaEvent_t>> free_lists_;
};
}}} // namespace hpx::cuda::experimental
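For orientation, a rough usage sketch of the per-device pool above; the device index chosen here is arbitrary and error handling beyond check_cuda_error is omitted:

    #include <hpx/async_cuda/cuda_event.hpp>
    #include <hpx/async_cuda/cuda_exception.hpp>

    void per_device_pool_sketch()
    {
        namespace cu = hpx::cuda::experimental;
        int const device = 1;    // any valid index < max_number_devices_
        cu::check_cuda_error(cudaSetDevice(device));
        cudaStream_t stream;
        cu::check_cuda_error(cudaStreamCreate(&stream));

        cudaEvent_t event;
        auto& pool = cu::cuda_event_pool::get_event_pool();
        pool.pop(event, device);    // grows device 1's free list on demand
        cu::check_cuda_error(cudaEventRecord(event, stream));
        // ... once cudaEventQuery(event) reports cudaSuccess ...
        pool.push(event, device);    // recycle into the matching free list
    }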
21 changes: 12 additions & 9 deletions libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
@@ -81,7 +81,7 @@ namespace hpx { namespace cuda { namespace experimental {
future_data() {}

future_data(init_no_addref no_addref, other_allocator const& alloc,
cudaStream_t stream)
cudaStream_t stream, int device)
: lcos::detail::future_data_allocator<void, Allocator,
future_data>(no_addref, alloc)
{
@@ -104,7 +104,7 @@ namespace hpx { namespace cuda { namespace experimental {
status)));
}
},
stream);
stream, device);
}
};

@@ -128,7 +128,7 @@ namespace hpx { namespace cuda { namespace experimental {
}

future_data(init_no_addref no_addref, other_allocator const& alloc,
cudaStream_t stream)
cudaStream_t stream, int device)
: lcos::detail::future_data_allocator<void, Allocator,
future_data>(no_addref, alloc)
, rt_(hpx::get_runtime_ptr())
@@ -183,7 +183,8 @@ namespace hpx { namespace cuda { namespace experimental {
// main API call to get a future from a stream using allocator, and the
// specified mode
template <typename Allocator, typename Mode>
hpx::future<void> get_future(Allocator const& a, cudaStream_t stream)
hpx::future<void> get_future(
Allocator const& a, cudaStream_t stream, int device = 0)
{
using shared_state = future_data<Allocator, Mode>;

@@ -200,7 +201,8 @@ namespace hpx { namespace cuda { namespace experimental {
unique_ptr p(traits::allocate(alloc, 1),
hpx::util::allocator_deleter<other_allocator>{alloc});

traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream);
traits::construct(
alloc, p.get(), init_no_addref{}, alloc, stream, device);

return hpx::traits::future_access<future<void>>::create(
p.release(), false);
@@ -212,16 +214,16 @@ namespace hpx { namespace cuda { namespace experimental {
hpx::future<void> get_future_with_callback(
Allocator const& a, cudaStream_t stream)
{
return get_future<Allocator, callback_mode>(a, stream);
return get_future<Allocator, callback_mode>(a, stream, 0);
}

// -------------------------------------------------------------
// main API call to get a future from a stream using allocator
template <typename Allocator>
hpx::future<void> get_future_with_event(
Allocator const& a, cudaStream_t stream)
Allocator const& a, cudaStream_t stream, int device = 0)
{
return get_future<Allocator, event_mode>(a, stream);
return get_future<Allocator, event_mode>(a, stream, device);
}

// -------------------------------------------------------------
@@ -231,7 +233,8 @@ namespace hpx { namespace cuda { namespace experimental {

// -------------------------------------------------------------
// non allocator version of : get future with an event set
HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t);
HPX_CORE_EXPORT hpx::future<void> get_future_with_event(
cudaStream_t stream, int device = 0);
} // namespace detail
}}} // namespace hpx::cuda::experimental
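A hedged usage sketch of the extended detail API, assuming CUDA event polling is already enabled on the scheduler that will consume the future:

    #include <hpx/async_cuda/cuda_exception.hpp>
    #include <hpx/async_cuda/cuda_future.hpp>

    hpx::future<void> future_on_device_sketch()
    {
        namespace cu = hpx::cuda::experimental;
        int const device = 0;
        cu::check_cuda_error(cudaSetDevice(device));
        cudaStream_t stream;
        cu::check_cuda_error(cudaStreamCreate(&stream));

        // ... enqueue asynchronous work (kernels, copies) on `stream` here ...

        // The future becomes ready once the poller sees the recorded event
        // complete; the event is then returned to device 0's pool.
        return cu::detail::get_future_with_event(stream, device);
    }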

(changes to an additional header, file path not shown)
@@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
hpx::move_only_function<void(cudaError_t)>;

HPX_CORE_EXPORT void add_event_callback(
event_callback_function_type&& f, cudaStream_t stream);
event_callback_function_type&& f, cudaStream_t stream, int device = 0);

HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool);
HPX_CORE_EXPORT void unregister_polling(
14 changes: 9 additions & 5 deletions libs/core/async_cuda/src/cuda_event_callback.cpp
@@ -50,6 +50,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
{
cudaEvent_t event;
event_callback_function_type f;
int device;
};

using event_callback_queue_type =
@@ -104,17 +105,19 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
}

void add_event_callback(
event_callback_function_type&& f, cudaStream_t stream)
event_callback_function_type&& f, cudaStream_t stream, int device)
{
cudaEvent_t event;
if (!cuda_event_pool::get_event_pool().pop(event))
if (!cuda_event_pool::get_event_pool().pop(event, device))
{
HPX_THROW_EXCEPTION(hpx::error::invalid_status,
"add_event_callback", "could not get an event");
}
check_cuda_error(cudaSetDevice(device));
check_cuda_error(cudaEventRecord(event, stream));

detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)});
detail::add_to_event_callback_queue(
event_callback{event, HPX_MOVE(f), device});
}

// Background progress function for async CUDA operations. Checks for completed
@@ -177,7 +180,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
"active events",
debug::dec<3>(get_number_of_active_events()));
continuation.f(status);
pool.push(HPX_MOVE(continuation.event));
pool.push(
HPX_MOVE(continuation.event), continuation.device);
return true;
}),
event_callback_vector.end());
@@ -199,7 +203,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
"active events",
debug::dec<3>(get_number_of_active_events()));
continuation.f(status);
pool.push(HPX_MOVE(continuation.event));
pool.push(HPX_MOVE(continuation.event), continuation.device);
}
}
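A brief sketch of the callback path with the new device argument; the stored device index is what lets the polling loop return a finished event to the right pool (the header providing detail::add_event_callback is not shown in this diff, so its include is omitted):

    #include <hpx/async_cuda/cuda_exception.hpp>

    void callback_on_device_sketch(cudaStream_t stream, int device)
    {
        namespace cu = hpx::cuda::experimental;
        // `stream` is assumed to have been created while `device` was current
        cu::detail::add_event_callback(
            [](cudaError_t status) {
                // runs from the polling loop once the recorded event is ready
                cu::check_cuda_error(status);
            },
            stream, device);
    }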

5 changes: 3 additions & 2 deletions libs/core/async_cuda/src/cuda_future.cpp
@@ -18,8 +18,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
hpx::util::internal_allocator<>{}, stream);
}

hpx::future<void> get_future_with_event(cudaStream_t stream)
hpx::future<void> get_future_with_event(cudaStream_t stream, int device)
{
return get_future_with_event(hpx::util::internal_allocator<>{}, stream);
return get_future_with_event(
hpx::util::internal_allocator<>{}, stream, device);
}
}}}} // namespace hpx::cuda::experimental::detail
3 changes: 2 additions & 1 deletion libs/core/async_cuda/src/cuda_target.cpp
@@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental {

hpx::future<void> target::get_future_with_event() const
{
return detail::get_future_with_event(handle_.get_stream());
return detail::get_future_with_event(
handle_.get_stream(), handle_.get_device());
}

hpx::future<void> target::get_future_with_callback() const
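Finally, a hypothetical end-to-end sketch at the target level, assuming a target can be constructed for a given device id (consistent with handle_.get_device() above) and that event polling is enabled; the includes for hpx::cuda::experimental::target and hpx::wait_all are omitted since their headers are not part of this diff:

    void two_gpu_sketch()
    {
        hpx::cuda::experimental::target t0(0);    // device 0
        hpx::cuda::experimental::target t1(1);    // device 1

        // Work enqueued on each target's stream completes the corresponding
        // future; each event is recorded on, and recycled to, its own device.
        hpx::future<void> f0 = t0.get_future_with_event();
        hpx::future<void> f1 = t1.get_future_with_event();
        hpx::wait_all(f0, f1);
    }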