From 2b48e220d954c31fa6b0a226821b4da9b6988e66 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sun, 11 Feb 2024 23:26:47 +0200 Subject: [PATCH] Regards #291: Added support for untyped, owned, memory regions: * Added a `unique_region` class - non-templated * `unique_region`'s "decay" into `cuda::memory::region_t`'s - so there's no need to reimplement half the world for them * Added a missing variant of `cuda::memory::copy()` * Added an example program based on vectorAdd, which uses unique_regions (I would have replaced vectorAdd with it - had it not required two lines of source more than before...) * Removed some commented-out unique-pointer-related code --- examples/CMakeLists.txt | 1 + examples/by_api_module/unified_addressing.cpp | 10 +- .../vectorAdd_unique_regions.cu | 82 ++++++ src/cuda/api.hpp | 2 + src/cuda/api/memory.hpp | 10 + .../api/multi_wrapper_impls/unique_region.hpp | 99 +++++++ src/cuda/api/types.hpp | 48 ++-- src/cuda/api/unique_region.hpp | 249 ++++++++++++++++++ 8 files changed, 481 insertions(+), 20 deletions(-) create mode 100644 examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu create mode 100644 src/cuda/api/multi_wrapper_impls/unique_region.hpp create mode 100644 src/cuda/api/unique_region.hpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d4b9cb84..701fdaa4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -56,6 +56,7 @@ link_libraries(cuda-api-wrappers::runtime-and-driver) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin") add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu) +add_executable(vectorAdd_unique_regions modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu) add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu) add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu) add_executable(vectorAdd_nvrtc 
modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp) diff --git a/examples/by_api_module/unified_addressing.cpp b/examples/by_api_module/unified_addressing.cpp index 1cc545c0..ffd7d67a 100644 --- a/examples/by_api_module/unified_addressing.cpp +++ b/examples/by_api_module/unified_addressing.cpp @@ -28,13 +28,13 @@ void pointer_properties(const cuda::device_t& device) cuda::context::create(device), cuda::context::create(device) }; - cuda::memory::device::unique_ptr regions[2] = { - cuda::memory::make_unique(contexts[0], fixed_size), - cuda::memory::make_unique(contexts[1], fixed_size) + cuda::memory::device::unique_region regions[2] = { + cuda::memory::make_unique_region(contexts[0], fixed_size), + cuda::memory::make_unique_region(contexts[1], fixed_size) }; void* raw_pointers[2] = { - regions[0].get(), - regions[1].get() + regions[0].data(), + regions[1].data() }; cuda::memory::pointer_t pointers[2] = { cuda::memory::pointer::wrap(raw_pointers[0]), diff --git a/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu b/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu new file mode 100644 index 00000000..5bcc5971 --- /dev/null +++ b/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu @@ -0,0 +1,82 @@ +/** + * Derived from the nVIDIA CUDA 8.0 samples by + * + * Eyal Rozenberg + * + * The derivation is specifically permitted in the nVIDIA CUDA Samples EULA + * and the deriver is the owner of this code according to the EULA. + * + * Use this reasonably. If you want to discuss licensing formalities, please + * contact the author. 
+ */ + +#include <cuda/api.hpp> + +#include <algorithm> +#include <iostream> +#include <memory> + +__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) { C[i] = A[i] + B[i]; } +} + +int main() +{ + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << "\n"; + exit(EXIT_FAILURE); + } + + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]\n"; + + // If we could rely on C++14, we would use std::make_unique + auto h_A = std::unique_ptr<float[]>(new float[numElements]); + auto h_B = std::unique_ptr<float[]>(new float[numElements]); + auto h_C = std::unique_ptr<float[]>(new float[numElements]); + + auto generator = []() { return rand() / (float) RAND_MAX; }; + std::generate(h_A.get(), h_A.get() + numElements, generator); + std::generate(h_B.get(), h_B.get() + numElements, generator); + + auto device = cuda::device::current::get(); + auto d_A = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto d_B = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto d_C = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto sp_A = d_A.as_span<float>(); + auto sp_B = d_B.as_span<float>(); + auto sp_C = d_C.as_span<float>(); + + cuda::memory::copy(sp_A, h_A.get()); + cuda::memory::copy(sp_B, h_B.get()); + + auto launch_config = cuda::launch_config_builder() + .overall_size(numElements) + .block_size(256) + .build(); + + std::cout + << "CUDA kernel launch with " << launch_config.dimensions.grid.x + << " blocks of " << launch_config.dimensions.block.x << " threads each\n"; + + cuda::launch( + vectorAdd, launch_config, + sp_A.data(), sp_B.data(), sp_C.data(), numElements + ); + + cuda::memory::copy(h_C.get(), d_C); + + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) { + if (fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) { + std::cerr << "Result verification failed at element " << i << "\n"; + exit(EXIT_FAILURE); + } + } + + std::cout 
<< "Test PASSED\n"; + std::cout << "SUCCESS\n"; +} + diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index 1f556219..b46c327e 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -31,6 +31,7 @@ #include "api/memory_pool.hpp" #endif #include "api/unique_ptr.hpp" +#include "api/unique_region.hpp" #include "api/link_options.hpp" #include "api/device.hpp" @@ -67,6 +68,7 @@ #include "api/multi_wrapper_impls/context.hpp" #include "api/multi_wrapper_impls/stream.hpp" #include "api/multi_wrapper_impls/memory.hpp" +#include "api/multi_wrapper_impls/unique_region.hpp" #include "api/multi_wrapper_impls/virtual_memory.hpp" #include "api/multi_wrapper_impls/kernel.hpp" #include "api/multi_wrapper_impls/kernel_launch.hpp" diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 16436c8f..bb56ea12 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -503,6 +503,16 @@ inline void copy(region_t destination, void* source, size_t num_bytes) return copy(destination.start(), source, num_bytes); } +inline void copy(void* destination, const_region_t source, size_t num_bytes) +{ +#ifndef NDEBUG + if (source.size() < num_bytes) { + throw ::std::logic_error("Number of bytes to copy exceeds source size"); + } +#endif + return copy(destination, source.start(), num_bytes); +} + inline void copy(region_t destination, void* source) { return copy(destination, source, destination.size()); diff --git a/src/cuda/api/multi_wrapper_impls/unique_region.hpp b/src/cuda/api/multi_wrapper_impls/unique_region.hpp new file mode 100644 index 00000000..3ce64635 --- /dev/null +++ b/src/cuda/api/multi_wrapper_impls/unique_region.hpp @@ -0,0 +1,99 @@ +/** + * @file + * + * @brief Implementations of @ref `make_unique_region()` functions + */ +#pragma once +#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ +#define MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ + +#include "../unique_region.hpp" +#include "../types.hpp" + +namespace cuda { + +namespace memory { + +namespace device { + +inline 
unique_region make_unique_region(const context_t& context, cuda::size_t num_elements) +{ + return detail_::make_unique_region(context.handle(), num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param device on which to construct the array of elements + * @param num_elements the number of elements to allocate + * @return an ::std::unique_ptr pointing to the constructed T array + */ +inline unique_region make_unique_region(const device_t& device, size_t num_elements) +{ + auto pc = device.primary_context(); + CAW_SET_SCOPE_CONTEXT(pc.handle()); + return make_unique_region(pc, num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory on the current device. + * + * @note The allocation will be made in the device's primary context - + * which will be created if it has not yet been. + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param num_elements the number of elements to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array + */ +inline unique_region make_unique_region(size_t num_elements) +{ + auto current_device_id = cuda::device::current::detail_::get_id(); + auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); + return make_unique_region(pc, num_elements); +} + +} // namespace device + +namespace managed { + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + CAW_SET_SCOPE_CONTEXT(context.handle()); + return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; +} + +inline unique_region make_unique_region( + const device_t& device, + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + auto pc = device.primary_context(); + return 
make_unique_region(pc, num_bytes, initial_visibility); +} + +inline unique_region make_unique_region( + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + auto current_device_id = cuda::device::current::detail_::get_id(); + auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); + return make_unique_region(pc, num_bytes, initial_visibility); +} + +} // namespace managed + +} // namespace memory + +} // namespace cuda + +#endif // MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ + diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp index 3c96ba07..07dcfe7d 100644 --- a/src/cuda/api/types.hpp +++ b/src/cuda/api/types.hpp @@ -637,14 +637,18 @@ namespace detail_ { // Note: T should be either void or void const, nothing else template class base_region_t { +public: + using pointer = T*; + using const_pointer = const T*; + using size_type = size_t; + private: - T* start_ = nullptr; - size_t size_in_bytes_ = 0; + pointer start_ = nullptr; + size_type size_in_bytes_ = 0; - using char_type = typename ::std::conditional<::std::is_const::value, const char *, char *>::type; public: base_region_t() noexcept = default; - base_region_t(T* start, size_t size_in_bytes) noexcept + base_region_t(pointer start, size_t size_in_bytes) noexcept : start_(start), size_in_bytes_(size_in_bytes) {} base_region_t(device::address_t start, size_t size_in_bytes) noexcept : start_(as_pointer(start)), size_in_bytes_(size_in_bytes) {} @@ -657,7 +661,7 @@ class base_region_t { } template - explicit operator span() const + span as_span() const NOEXCEPT_IF_NDEBUG { static_assert( ::std::is_const::value or not ::std::is_const::type>::value, @@ -675,13 +679,16 @@ class base_region_t { } - T*& start() noexcept { return start_; } - size_t& size() noexcept { return size_in_bytes_; } + template + operator span() const NOEXCEPT_IF_NDEBUG { return as_span(); } + + pointer& start() noexcept { return start_; } + size_type& size() noexcept { return size_in_bytes_; } - size_t 
size() const noexcept { return size_in_bytes_; } - T* start() const noexcept { return start_; } - T* data() const noexcept { return start(); } - T* get() const noexcept { return start(); } + size_type size() const noexcept { return size_in_bytes_; } + pointer start() const noexcept { return start_; } + pointer data() const noexcept { return start(); } + pointer get() const noexcept { return start(); } device::address_t device_address() const noexcept { @@ -702,7 +709,10 @@ class base_region_t { throw ::std::invalid_argument("subregion exceeds original region bounds"); } #endif - return { static_cast(start_) + offset_in_bytes, size_in_bytes }; + using char_type = typename ::std::conditional<::std::is_const::value, const char *, char *>::type; + return { + static_cast(start_) + offset_in_bytes, size_in_bytes + }; } }; @@ -724,7 +734,11 @@ bool operator!=(const base_region_t& lhs, const base_region_t& rhs) struct region_t : public detail_::base_region_t { using base_region_t::base_region_t; - region_t subregion(size_t offset_in_bytes, size_t size_in_bytes) const + using base_region_t::pointer; + using base_region_t::const_pointer; + using base_region_t::size_type; + + region_t subregion(size_type offset_in_bytes, size_type size_in_bytes) const { auto parent_class_subregion = base_region_t::subregion(offset_in_bytes, size_in_bytes); return { parent_class_subregion.data(), parent_class_subregion.size() }; @@ -733,8 +747,12 @@ struct region_t : public detail_::base_region_t { struct const_region_t : public detail_::base_region_t { using base_region_t::base_region_t; - const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} - const_region_t subregion(size_t offset_in_bytes, size_t size_in_bytes) const + using base_region_t::pointer; + using base_region_t::const_pointer; + using base_region_t::size_type; + + const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} + const_region_t subregion(size_type offset_in_bytes, size_type 
size_in_bytes) const { auto parent_class_subregion = base_region_t::subregion(offset_in_bytes, size_in_bytes); return { parent_class_subregion.data(), parent_class_subregion.size() }; diff --git a/src/cuda/api/unique_region.hpp b/src/cuda/api/unique_region.hpp new file mode 100644 index 00000000..98f21b7f --- /dev/null +++ b/src/cuda/api/unique_region.hpp @@ -0,0 +1,249 @@ +/** + * @file + * + * @brief A smart pointer for CUDA device- and host-side memory, similar + * to the standard library's ::std::unique_ptr. + * + * @note Unique pointers, like any (wrapped) memory allocations, do _not_ extend the lifetime of + * contexts (primary or otherwise). In particular, they do not increase primary context refcounts. + * + */ +#ifndef CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_ +#define CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_ + +#include "memory.hpp" +#include + +namespace cuda { +namespace memory { + +/** + * A class for holding a @ref `region_t` of memory owned "uniquely" by + * its creator - similar to how `::std::unique_ptr` holds a uniquely- + * owned pointer. + * + * @note The class is not templated on the element type - since that + * is quite immaterial to its management (as well as its copying etc.) + * + * @tparam Deleter Similar to @ref ::std::unique_ptr's Deleter parameter; + * it needs to be default-constructible and have an operator(). + * + * @todo : Should we really expose the region parent class? We could, + * instead, have a `get()` or `region()` method which provides lvalue-ref + * access to it. + * + * @todo: Should this be called a `unique_region`, a-la-`unique_ptr`? Or + * perhaps `unique_ptr` is a misnomer, and should have been called `owned_ptr` + * (as opposed to `shared_ptr`), and regardless, this should be called an + * `owned_region`? 
+ */ +template <typename Deleter> +class unique_region : public region_t { +public: // types + using parent = region_t; + using region_t::pointer; + using region_t::const_pointer; + using region_t::size_type; + using deleter_type = Deleter; + // and _no_ element_type! + +public: + + /// Default constructor, creates an empty unique_region which owns nothing + constexpr unique_region() noexcept = default; + + /// Act like the default constructor for nullptr_t's + constexpr unique_region(::std::nullptr_t) noexcept : unique_region() { } + + /// Take ownership of an existing region + explicit unique_region(region_t region) noexcept : region_t{region} { } + + // Note: No constructor which also takes a deleter. We do not hold a deleter + // member - unlike unique_ptr's. If we wanted a general-purpose unique region + // that's not just GPU allocation-oriented, we might have had one of those. + + /// Move constructor. + unique_region(unique_region&& other) noexcept : unique_region(other.release()) { } + // Disable copy construction + unique_region(const unique_region&) = delete; + + // Note: No conversion from "another type" like with ::std::unique_pointer, since + // this class is not variant with the element type; and there's not much sense in + // supporting conversion of memory between different deleters (/ allocators). + + ~unique_region() noexcept + { + if (data() != nullptr) { + deleter_type{}(data()); + } + start() = nullptr; +#ifndef NDEBUG + size() = 0; +#endif + } + + /// No copy-assignment - that would break our ownership guarantee + unique_region& operator=(const unique_region&) = delete; + + /// A Move-assignment operator, which takes ownership of the other region + unique_region& operator=(unique_region&& other) noexcept + { + reset(other.release()); + return *this; + } + + // No "assignment from another type", as per the note above. + + /// Reset the %unique_region to empty, invoking the deleter if necessary. 
+ unique_region& + operator=(::std::nullptr_t) noexcept + { + reset(); + return *this; + } + + /// No plain dereferencing - as there is no guarantee that any object has been + /// initialized at those locations, nor do we know its type + + /// TODO: Should we support arrow-dereferencing? + + operator const_region_t() const noexcept { return *this; } + + /// Return the stored pointer. + region_t get() const noexcept { return *this; } + + /// Return a deleter of the fixed type (it can't be a reference - + /// we don't keep a deleter object) + deleter_type get_deleter() const noexcept { return Deleter{}; } + + /// Return @c true if the stored pointer is not null. + explicit operator bool() const noexcept { return data() != nullptr; } + + // Modifiers. + + /// Release ownership of any stored pointer. + region_t release() noexcept + { + // TODO: Shouldn't I use move construction for release? + region_t released { *this }; + start() = nullptr; +#ifndef NDEBUG + size() = 0; +#endif + return released; + } + + /** @brief Replace the stored pointer. + * + * @param ptr The new pointer to store. + * + * The deleter will be invoked if a pointer is already owned. + */ + void reset(region_t region = region_t{}) + { + ::std::swap(*this, region); + if (region.start() != nullptr) { + get_deleter()(region); + } + } + + /// Exchange the pointer and deleter with another object. + void swap(unique_region& other) noexcept + { + ::std::swap(*this, other); + } +}; // class unique_region + +namespace device { + +using unique_region = memory::unique_region; + +namespace detail_ { + +inline unique_region make_unique_region(const context::handle_t context_handle, size_t num_bytes) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return unique_region{ allocate_in_current_context(num_bytes) }; +} + +} // namespace detail_ + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory. 
+ * + * @note CUDA's runtime API always has a current device; but - + * there is not necessarily a current context; so a primary context + * for a device may be created through this call. + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param context The CUDA device context in which to make the + * allocation. + * @param num_bytes the number of bytes to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array +*/ +inline unique_region make_unique_region(const context_t& context, size_t num_bytes); +inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +inline unique_region make_unique_region(size_t num_bytes); + +} // namespace device + + +/// See @ref `device::make_unique_region(const context_t& context, size_t num_elements)` +inline device::unique_region make_unique_region(const context_t& context, size_t num_elements) +{ + return device::make_unique_region(context, num_elements); +} + +/// See @ref `device::make_unique_region(const device_t& device, size_t num_elements)` +inline device::unique_region make_unique_region(const device_t& device, size_t num_elements) +{ + return device::make_unique_region(device, num_elements); +} + +namespace host { + +using unique_region = memory::unique_region; + +inline unique_region make_unique_region(const context_t& context, size_t num_bytes); +inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +inline unique_region make_unique_region(size_t num_bytes); + +} // namespace host + +namespace managed { + +using unique_region = memory::unique_region; + +namespace detail_ { + +inline unique_region make_unique_region( + const context::handle_t context_handle, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return unique_region { allocate_in_current_context(num_bytes, initial_visibility) }; +} + +} // namespace 
detail_ + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); +inline unique_region make_unique_region( + const device_t& device, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); +inline unique_region make_unique_region( + size_t num_bytes); + +} // namespace managed + +} // namespace memory +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_