Skip to content

Commit

Permalink
Merge pull request #6284 from G-071/add-multigpu-polling
Browse files Browse the repository at this point in the history
Add CUDA/HIP MultiGPU Event Polling
  • Loading branch information
hkaiser authored Jul 8, 2023
2 parents 2e61670 + 9f0d330 commit 75faae5
Show file tree
Hide file tree
Showing 10 changed files with 252 additions and 50 deletions.
2 changes: 1 addition & 1 deletion libs/core/async_cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ set(async_cuda_compat_headers
# cmake-format: on

set(async_cuda_sources cuda_event_callback.cpp cuda_future.cpp cuda_target.cpp
get_targets.cpp
get_targets.cpp cuda_event.cpp
)

if(HPX_WITH_HIP AND TARGET roc::hipblas)
Expand Down
105 changes: 74 additions & 31 deletions libs/core/async_cuda/include/hpx/async_cuda/cuda_event.hpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
// Copyright (c) 2023 Gregor Daiß
// Copyright (c) 2020 John Biddiscombe
// Copyright (c) 2020 Teodor Nikolov
//
// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// hpxinspect:noascii

#pragma once

#include <cstddef>
#include <deque>

#include <hpx/assert.hpp>
#include <hpx/async_cuda/cuda_exception.hpp>
#include <hpx/async_cuda/custom_gpu_api.hpp>
#include <hpx/concurrency/stack.hpp>
Expand All @@ -19,66 +26,102 @@ namespace hpx { namespace cuda { namespace experimental {
// of them at startup.
struct cuda_event_pool
{
static constexpr int initial_events_in_pool = 128;

static cuda_event_pool& get_event_pool()
{
static cuda_event_pool event_pool_;
return event_pool_;
}
static constexpr std::size_t initial_events_in_pool = 128;

// create a bunch of events on initialization
cuda_event_pool()
: free_list_(initial_events_in_pool)
{
for (int i = 0; i < initial_events_in_pool; ++i)
{
add_event_to_pool();
}
}
HPX_CORE_EXPORT static cuda_event_pool& get_event_pool();

// on destruction, all objects in stack will be freed
~cuda_event_pool()
{
cudaEvent_t event;
bool ok = true;
while (ok)
HPX_ASSERT_MSG(free_lists_.size() ==
static_cast<std::size_t>(max_number_devices_),
"Number of CUDA event pools does not match the number of "
"devices!");
for (int device = 0; device < max_number_devices_; device++)
{
ok = free_list_.pop(event);
if (ok)
check_cuda_error(cudaEventDestroy(event));
check_cuda_error(cudaSetDevice(device));
cudaEvent_t event;
bool ok = true;
while (ok)
{
ok = free_lists_[device].pop(event);
if (ok)
check_cuda_error(cudaEventDestroy(event));
}
}
}

inline bool pop(cudaEvent_t& event)
inline bool pop(cudaEvent_t& event, int device = 0)
{
HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
"Accessing CUDA event pool with invalid device ID!");
// pop an event off the pool, if that fails, create a new one
while (!free_list_.pop(event))
while (!free_lists_[device].pop(event))
{
add_event_to_pool();
// Save current device
int original_device = -1;
check_cuda_error(cudaGetDevice(&original_device));
add_event_to_pool(device);
// reset to original device if required
if (original_device != device)
check_cuda_error(cudaSetDevice(original_device));
}
return true;
}

inline bool push(cudaEvent_t event)
inline bool push(cudaEvent_t event, int device = 0)
{
return free_list_.push(event);
HPX_ASSERT_MSG(device >= 0 && device < max_number_devices_,
"Accessing CUDA event pool with invalid device ID!");
return free_lists_[device].push(event);
}

// delete copy / move constructors
cuda_event_pool(cuda_event_pool&&) = delete;
cuda_event_pool& operator=(cuda_event_pool&&) = delete;
cuda_event_pool(cuda_event_pool const&) = delete;
cuda_event_pool& operator=(cuda_event_pool const&) = delete;

private:
void add_event_to_pool()
// Private singleton constructor. Creates a initial_events_in_pool of
// events on initialization
cuda_event_pool()
: max_number_devices_(0)
{
check_cuda_error(cudaGetDeviceCount(&max_number_devices_));
HPX_ASSERT_MSG(max_number_devices_ > 0,
"CUDA polling enabled and called, yet no CUDA device found!");
int original_device = 0;
check_cuda_error(cudaGetDevice(&original_device));
for (int device = 0; device < max_number_devices_; device++)
{
check_cuda_error(cudaSetDevice(device));
free_lists_.emplace_back(initial_events_in_pool);
for (std::size_t i = 0; i < initial_events_in_pool; ++i)
{
add_event_to_pool(device);
}
}
// Restore original device -- avoids confusion about HPX switching
// devices for the current thread if only one device is used.
check_cuda_error(cudaSetDevice(original_device));
}

void add_event_to_pool(int device)
{
check_cuda_error(cudaSetDevice(device));
cudaEvent_t event;
// Create an cuda_event to query a CUDA/CUBLAS kernel for completion.
// Timing is disabled for performance. [1]
//
// [1]: CUDA Runtime API, section 5.5 cuda_event Management
check_cuda_error(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
free_list_.push(event);
free_lists_[device].push(event);
}
int max_number_devices_;

// pool is dynamically sized and can grow if needed
hpx::lockfree::stack<cudaEvent_t> free_list_;
// One pool per GPU - each pool is dynamically sized and can grow if needed
std::deque<hpx::lockfree::stack<cudaEvent_t>> free_lists_;
};
}}} // namespace hpx::cuda::experimental
37 changes: 29 additions & 8 deletions libs/core/async_cuda/include/hpx/async_cuda/cuda_future.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
// Copyright (c) 2023 Gregor Daiß
// Copyright (c) 2020 John Biddiscombe
// Copyright (c) 2016 Thomas Heller
// Copyright (c) 2016 Hartmut Kaiser
//
// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// hpxinspect:noascii

#pragma once

Expand Down Expand Up @@ -81,7 +84,7 @@ namespace hpx { namespace cuda { namespace experimental {
future_data() {}

future_data(init_no_addref no_addref, other_allocator const& alloc,
cudaStream_t stream)
cudaStream_t stream, int device)
: lcos::detail::future_data_allocator<void, Allocator,
future_data>(no_addref, alloc)
{
Expand All @@ -104,7 +107,7 @@ namespace hpx { namespace cuda { namespace experimental {
status)));
}
},
stream);
stream, device);
}
};

Expand Down Expand Up @@ -183,7 +186,8 @@ namespace hpx { namespace cuda { namespace experimental {
// main API call to get a future from a stream using allocator, and the
// specified mode
template <typename Allocator, typename Mode>
hpx::future<void> get_future(Allocator const& a, cudaStream_t stream)
hpx::future<void> get_future(
Allocator const& a, cudaStream_t stream, int device)
{
using shared_state = future_data<Allocator, Mode>;

Expand All @@ -200,7 +204,19 @@ namespace hpx { namespace cuda { namespace experimental {
unique_ptr p(traits::allocate(alloc, 1),
hpx::util::allocator_deleter<other_allocator>{alloc});

traits::construct(alloc, p.get(), init_no_addref{}, alloc, stream);
static_assert(std::is_same_v<Mode, event_mode> ||
std::is_same_v<Mode, callback_mode>,
"get_future mode not supported!");
if constexpr (std::is_same_v<Mode, event_mode>)
{
traits::construct(
alloc, p.get(), init_no_addref{}, alloc, stream, device);
}
else if constexpr (std::is_same_v<Mode, callback_mode>)
{
traits::construct(
alloc, p.get(), init_no_addref{}, alloc, stream);
}

return hpx::traits::future_access<future<void>>::create(
p.release(), false);
Expand All @@ -212,16 +228,20 @@ namespace hpx { namespace cuda { namespace experimental {
hpx::future<void> get_future_with_callback(
Allocator const& a, cudaStream_t stream)
{
return get_future<Allocator, callback_mode>(a, stream);
// device id 0 will be dropped in callback mode - can be
// an arbitrary number here
return get_future<Allocator, callback_mode>(a, stream, 0);
}

// -------------------------------------------------------------
// main API call to get a future from a stream using allocator
template <typename Allocator>
hpx::future<void> get_future_with_event(
Allocator const& a, cudaStream_t stream)
Allocator const& a, cudaStream_t stream, int device = -1)
{
return get_future<Allocator, event_mode>(a, stream);
if (device == -1)
check_cuda_error(cudaGetDevice(&device));
return get_future<Allocator, event_mode>(a, stream, device);
}

// -------------------------------------------------------------
Expand All @@ -231,7 +251,8 @@ namespace hpx { namespace cuda { namespace experimental {

// -------------------------------------------------------------
// non allocator version of : get future with an event set
HPX_CORE_EXPORT hpx::future<void> get_future_with_event(cudaStream_t);
HPX_CORE_EXPORT hpx::future<void> get_future_with_event(
cudaStream_t stream, int device = -1);
} // namespace detail
}}} // namespace hpx::cuda::experimental

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
hpx::move_only_function<void(cudaError_t)>;

HPX_CORE_EXPORT void add_event_callback(
event_callback_function_type&& f, cudaStream_t stream);
event_callback_function_type&& f, cudaStream_t stream, int device = 0);

HPX_CORE_EXPORT void register_polling(hpx::threads::thread_pool_base& pool);
HPX_CORE_EXPORT void unregister_polling(
Expand Down
19 changes: 19 additions & 0 deletions libs/core/async_cuda/src/cuda_event.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) 2023 Gregor Daiß
//
// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// hpxinspect:noascii

#include <hpx/assert.hpp>
#include <hpx/async_cuda/cuda_event.hpp>
#include <hpx/async_cuda/custom_gpu_api.hpp>

namespace hpx { namespace cuda { namespace experimental {
cuda_event_pool& cuda_event_pool::get_event_pool()
{
static cuda_event_pool event_pool_;
return event_pool_;
}
}}} // namespace hpx::cuda::experimental
16 changes: 11 additions & 5 deletions libs/core/async_cuda/src/cuda_event_callback.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// Copyright (c) 2023 Gregor Daiß
// Copyright (c) 2021 ETH Zurich
// Copyright (c) 2020 John Biddiscombe
// Copyright (c) 2016 Hartmut Kaiser
Expand All @@ -6,6 +7,8 @@
// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// hpxinspect:noascii

#include <hpx/config.hpp>
#include <hpx/assert.hpp>
Expand Down Expand Up @@ -50,6 +53,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
{
cudaEvent_t event;
event_callback_function_type f;
int device;
};

using event_callback_queue_type =
Expand Down Expand Up @@ -104,17 +108,18 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
}

void add_event_callback(
event_callback_function_type&& f, cudaStream_t stream)
event_callback_function_type&& f, cudaStream_t stream, int device)
{
cudaEvent_t event;
if (!cuda_event_pool::get_event_pool().pop(event))
if (!cuda_event_pool::get_event_pool().pop(event, device))
{
HPX_THROW_EXCEPTION(hpx::error::invalid_status,
"add_event_callback", "could not get an event");
}
check_cuda_error(cudaEventRecord(event, stream));

detail::add_to_event_callback_queue(event_callback{event, HPX_MOVE(f)});
detail::add_to_event_callback_queue(
event_callback{event, HPX_MOVE(f), device});
}

// Background progress function for async CUDA operations. Checks for completed
Expand Down Expand Up @@ -177,7 +182,8 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
"active events",
debug::dec<3>(get_number_of_active_events()));
continuation.f(status);
pool.push(HPX_MOVE(continuation.event));
pool.push(
HPX_MOVE(continuation.event), continuation.device);
return true;
}),
event_callback_vector.end());
Expand All @@ -199,7 +205,7 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
"active events",
debug::dec<3>(get_number_of_active_events()));
continuation.f(status);
pool.push(HPX_MOVE(continuation.event));
pool.push(HPX_MOVE(continuation.event), continuation.device);
}
}

Expand Down
8 changes: 6 additions & 2 deletions libs/core/async_cuda/src/cuda_future.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
// Copyright (c) 2023 Gregor Daiß
// Copyright (c) 2020 John Biddiscombe
// Copyright (c) 2016 Hartmut Kaiser
// Copyright (c) 2016 Thomas Heller
//
// SPDX-License-Identifier: BSL-1.0
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// hpxinspect:noascii

#include <hpx/allocator_support/internal_allocator.hpp>
#include <hpx/assert.hpp>
Expand All @@ -18,8 +21,9 @@ namespace hpx { namespace cuda { namespace experimental { namespace detail {
hpx::util::internal_allocator<>{}, stream);
}

hpx::future<void> get_future_with_event(cudaStream_t stream)
hpx::future<void> get_future_with_event(cudaStream_t stream, int device)
{
return get_future_with_event(hpx::util::internal_allocator<>{}, stream);
return get_future_with_event(
hpx::util::internal_allocator<>{}, stream, device);
}
}}}} // namespace hpx::cuda::experimental::detail
3 changes: 2 additions & 1 deletion libs/core/async_cuda/src/cuda_target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,8 @@ namespace hpx { namespace cuda { namespace experimental {

hpx::future<void> target::get_future_with_event() const
{
return detail::get_future_with_event(handle_.get_stream());
return detail::get_future_with_event(
handle_.get_stream(), handle_.get_device());
}

hpx::future<void> target::get_future_with_callback() const
Expand Down
Loading

0 comments on commit 75faae5

Please sign in to comment.