diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d4b9cb84..701fdaa4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -56,6 +56,7 @@ link_libraries(cuda-api-wrappers::runtime-and-driver) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin") add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu) +add_executable(vectorAdd_unique_regions modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu) add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu) add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu) add_executable(vectorAdd_nvrtc modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp) diff --git a/examples/by_api_module/unified_addressing.cpp b/examples/by_api_module/unified_addressing.cpp index 1cc545c0..ffd7d67a 100644 --- a/examples/by_api_module/unified_addressing.cpp +++ b/examples/by_api_module/unified_addressing.cpp @@ -28,13 +28,13 @@ void pointer_properties(const cuda::device_t& device) cuda::context::create(device), cuda::context::create(device) }; - cuda::memory::device::unique_ptr regions[2] = { - cuda::memory::make_unique(contexts[0], fixed_size), - cuda::memory::make_unique(contexts[1], fixed_size) + cuda::memory::device::unique_region regions[2] = { + cuda::memory::make_unique_region(contexts[0], fixed_size), + cuda::memory::make_unique_region(contexts[1], fixed_size) }; void* raw_pointers[2] = { - regions[0].get(), - regions[1].get() + regions[0].data(), + regions[1].data() }; cuda::memory::pointer_t pointers[2] = { cuda::memory::pointer::wrap(raw_pointers[0]), diff --git a/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu b/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu new file mode 100644 index 00000000..c72bf32e --- /dev/null +++ b/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu @@ -0,0 +1,85 @@ +/** + * 
Derived from the nVIDIA CUDA 8.0 samples by + * + * Eyal Rozenberg + * + * The derivation is specifically permitted in the nVIDIA CUDA Samples EULA + * and the deriver is the owner of this code according to the EULA. + * + * Use this reasonably. If you want to discuss licensing formalities, please + * contact the author. + */ + +#include "../../common.hpp" + +#include + +#include +#include +#include + +__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) { C[i] = A[i] + B[i]; } +} + +int main() +{ + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << "\n"; + exit(EXIT_FAILURE); + } + + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]\n"; + + // If we could rely on C++14, we would use std::make_unique + auto h_A = std::unique_ptr(new float[numElements]); + auto h_B = std::unique_ptr(new float[numElements]); + auto h_C = std::unique_ptr(new float[numElements]); + + auto generator = []() { return rand() / (float) RAND_MAX; }; + std::generate(h_A.get(), h_A.get() + numElements, generator); + std::generate(h_B.get(), h_B.get() + numElements, generator); + + auto device = cuda::device::current::get(); + + auto d_A = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto d_B = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto d_C = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto sp_A = d_A.as_span(); + auto sp_B = d_B.as_span(); + auto sp_C = d_C.as_span(); + + cuda::memory::copy(sp_A, h_A.get()); + cuda::memory::copy(sp_B, h_B.get()); + + auto launch_config = cuda::launch_config_builder() + .overall_size(numElements) + .block_size(256) + .build(); + + std::cout + << "CUDA kernel launch with " << launch_config.dimensions.grid.x + << " blocks of " << launch_config.dimensions.block.x << " threads each\n"; + + 
cuda::launch( + vectorAdd, launch_config, + sp_A.data(), sp_B.data(), sp_C.data(), numElements + ); + + cuda::memory::copy(h_C.get(), sp_C); + + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) { + if (fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) { + std::cerr << "Result verification failed at element " << i << "\n"; + exit(EXIT_FAILURE); + } + } + + std::cout << "Test PASSED\n"; + std::cout << "SUCCESS\n"; +} + diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index a4456852..21de5415 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -32,6 +32,7 @@ #include "api/memory_pool.hpp" #endif #include "api/unique_ptr.hpp" +#include "api/unique_region.hpp" #include "api/link_options.hpp" #include "api/device.hpp" @@ -68,6 +69,7 @@ #include "api/multi_wrapper_impls/context.hpp" #include "api/multi_wrapper_impls/stream.hpp" #include "api/multi_wrapper_impls/memory.hpp" +#include "api/multi_wrapper_impls/unique_region.hpp" #include "api/multi_wrapper_impls/virtual_memory.hpp" #include "api/multi_wrapper_impls/kernel.hpp" #include "api/multi_wrapper_impls/kernel_launch.hpp" diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index aaf20896..22e22e5c 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -528,6 +528,16 @@ inline void copy(region_t destination, void* source, size_t num_bytes) return copy(destination.start(), source, num_bytes); } +inline void copy(void* destination, const_region_t source, size_t num_bytes) +{ +#ifndef NDEBUG + if (source.size() < num_bytes) { + throw ::std::logic_error("Number of bytes to copy exceeds source size"); + } +#endif + return copy(destination, source.start(), num_bytes); +} + inline void copy(region_t destination, void* source) { return copy(destination, source, destination.size()); diff --git a/src/cuda/api/multi_wrapper_impls/unique_region.hpp b/src/cuda/api/multi_wrapper_impls/unique_region.hpp new file mode 100644 index 00000000..7c40a4da --- /dev/null 
+++ b/src/cuda/api/multi_wrapper_impls/unique_region.hpp @@ -0,0 +1,122 @@ +/** + * @file + * + * @brief Implementations of @ref `make_unique_region()` functions + */ +#pragma once +#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ +#define MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ + +#include "../unique_region.hpp" +#include "../types.hpp" + +namespace cuda { + +namespace memory { + +namespace device { + +inline unique_region make_unique_region(const context_t& context, cuda::size_t num_elements) +{ + return detail_::make_unique_region(context.handle(), num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param device on which to construct the array of elements + * @param num_elements the number of elements to allocate + * @return an ::std::unique_ptr pointing to the constructed T array + */ +inline unique_region make_unique_region(const device_t& device, size_t num_elements) +{ + auto pc = device.primary_context(); + return make_unique_region(pc, num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory on the current device. + * + * @note The allocation will be made in the device's primary context - + * which will be created if it has not yet been. 
+ * + * @tparam T an array type; _not_ the type of individual elements + * + * @param num_elements the number of elements to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array + */ +inline unique_region make_unique_region(size_t num_elements) +{ + auto current_device_id = cuda::device::current::detail_::get_id(); + auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); + return make_unique_region(pc, num_elements); +} + +} // namespace device + +namespace host { + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + allocation_options options) +{ + CAW_SET_SCOPE_CONTEXT(context.handle()); + return unique_region{ allocate(num_bytes, options) }; +} + +inline unique_region make_unique_region(const device_t& device, size_t num_bytes) +{ + auto pc = device.primary_context(); + return make_unique_region(pc, num_bytes); +} + +inline unique_region make_unique_region(size_t num_bytes) +{ + return unique_region { allocate(num_bytes) }; +} + +} // namespace host + +namespace managed { + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + CAW_SET_SCOPE_CONTEXT(context.handle()); + return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; +} + +inline unique_region make_unique_region( + const device_t& device, + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + auto pc = device.primary_context(); + return make_unique_region(pc, num_bytes, initial_visibility); +} + +inline unique_region make_unique_region( + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + auto current_device_id = cuda::device::current::detail_::get_id(); + auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); + return make_unique_region(pc, num_bytes, initial_visibility); +} + +} // namespace managed + +} // namespace memory + +} // 
namespace cuda + +#endif // MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ + diff --git a/src/cuda/api/unique_region.hpp b/src/cuda/api/unique_region.hpp new file mode 100644 index 00000000..c9cbe1ab --- /dev/null +++ b/src/cuda/api/unique_region.hpp @@ -0,0 +1,252 @@ +/** + * @file + * + * @brief A smart pointer for CUDA device- and host-side memory, similar + * to the standard library's ::std::unique_ptr. + * + * @note Unique pointers, like any (wrapped) memory allocations, do _not_ extend the lifetime of + * contexts (primary or otherwise). In particular, they do not increase primary context refcounts. + * + */ +#ifndef CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_ +#define CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_ + +#include "memory.hpp" +#include + +namespace cuda { +namespace memory { + +/** + * A class for holding a @ref `region_t` of memory owned "uniquely" by + * its creator - similar to how `::std::unique_ptr` holds a uniquely- + * owned pointer. + * + * @note The class is not templated on the element type - since that + * is quite immaterial to its management (as well as its copying etc.) + * + * @tparam Deleter Similar to @ref ::std::unique_ptr's Deleter parameter; + * it needs to be default-constructible and have an operator(). + * + * @todo : Should we really expose the region parent class? We could, + * instead, have a `get()` or `region()` method which provides lvalue-ref + * access to it. + * + * @todo: Should this be called a `unique_region`, a-la-`unique_ptr`? Or + * perhaps `unique_ptr` is a misnomer, and should have been called `owned_ptr` + * (as opposed to `shared_ptr`), and regardless, this should be called an + * `owned_region`? + */ +template +class unique_region : public region_t { +public: // types + using parent = region_t; + using region_t::pointer; + using region_t::const_pointer; + using region_t::size_type; + using deleter_type = Deleter; + // and _no_ element_type! 
+ +public: + + /// Default constructor, creates an empty unique_region which owns nothing + constexpr unique_region() noexcept = default; + + /// Act like the default constructor for nullptr_t's + constexpr unique_region(::std::nullptr_t) noexcept : unique_region() { } + + /// Take ownership of an existing region + explicit unique_region(region_t region) noexcept : region_t{region} { } + + // Note: No constructor which also takes a deleter. We do not hold a deleter + // member - unlike unique_ptr's. If we wanted a general-purpose unique region + // that's not just GPU allocation-oriented, we might have had one of those. + + /// Move constructor. + unique_region(unique_region&& other) noexcept : unique_region(other.release()) { } + // Disable copy construction + unique_region(const unique_region&) = delete; + + // Note: No conversion from "another type" like with ::std::unique_ptr, since + // this class is not variant with the element type; and there's not much sense in + // supporting conversion of memory between different deleters (/ allocators). + + ~unique_region() noexcept + { + if (data() != nullptr) { + deleter_type{}(data()); + } + start() = nullptr; +#ifndef NDEBUG + size() = 0; +#endif + } + + /// No copy-assignment - that would break our ownership guarantee + unique_region& operator=(const unique_region&) = delete; + + /// A Move-assignment operator, which takes ownership of the other region + unique_region& operator=(unique_region&& other) noexcept + { + reset(other.release()); + return *this; + } + + // Note: No "assignment from another type" - see the conversion note above + + /// Reset the %unique_region to empty, invoking the deleter if necessary. + unique_region& + operator=(::std::nullptr_t) noexcept + { + reset(); + return *this; + } + + /// No plain dereferencing - as there is no guarantee that any object has been + /// initialized at those locations, nor do we know its type + + /// TODO: Should we support arrow-dereferencing?
+ + operator const_region_t() const noexcept { return *this; } + + /// Return the stored region. + region_t get() const noexcept { return *this; } + + /// Return a deleter of the fixed type (it can't be a reference - + /// we don't keep a deleter object) + deleter_type get_deleter() const noexcept { return Deleter{}; } + + /// Return @c true if the stored pointer is not null. + explicit operator bool() const noexcept { return data() != nullptr; } + + // Modifiers. + + /// Release ownership of any stored pointer. + region_t release() noexcept + { + // TODO: Shouldn't I use move construction for release? + region_t released { *this }; + start() = nullptr; +#ifndef NDEBUG + size() = 0; +#endif + return released; + } + + /** @brief Replace the stored region. + * + * @param region The new region to store. + * + * The deleter will be invoked if a region is already owned. + */ + void reset(region_t region = region_t{}) + { + ::std::swap(*this, region); + if (region.start() != nullptr) { + get_deleter()(region); + } + } + + /// Exchange the region and deleter with another object. + void swap(unique_region& other) noexcept + { + ::std::swap(*this, other); + } +}; // class unique_region + +namespace device { + +using unique_region = memory::unique_region; + +namespace detail_ { + +inline unique_region make_unique_region(const context::handle_t context_handle, size_t num_bytes) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return unique_region{ allocate_in_current_context(num_bytes) }; +} + +} // namespace detail_ + +/** + * @brief Create an owning `unique_region` of memory in + * device-global memory. + * + * @note CUDA's runtime API always has a current device; but - + * there is not necessarily a current context; so a primary context + * for a device may be created through this call. + * + * @note the allocation is sized in bytes; there is no element type + * + * @param context The CUDA device context in which to make the + * allocation.
+ * @param num_elements the number of elements to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array +*/ +inline unique_region make_unique_region(const context_t& context, size_t num_bytes); +inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +inline unique_region make_unique_region(size_t num_bytes); + +} // namespace device + + +/// See @ref `device::make_unique_region(const context_t& context, size_t num_elements)` +inline device::unique_region make_unique_region(const context_t& context, size_t num_elements) +{ + return device::make_unique_region(context, num_elements); +} + +/// See @ref `device::make_unique_region(const device_t& device, size_t num_elements)` +inline device::unique_region make_unique_region(const device_t& device, size_t num_elements) +{ + return device::make_unique_region(device, num_elements); +} + +namespace host { + +using unique_region = memory::unique_region; + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + allocation_options options = allocation_options{}); +inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +inline unique_region make_unique_region(size_t num_bytes); + +} // namespace host + +namespace managed { + +using unique_region = memory::unique_region; + +namespace detail_ { + +inline unique_region make_unique_region( + const context::handle_t context_handle, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return unique_region { allocate_in_current_context(num_bytes, initial_visibility) }; +} + +} // namespace detail_ + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); +inline unique_region make_unique_region( + const device_t& device, + size_t num_bytes, + 
initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); +inline unique_region make_unique_region( + size_t num_bytes); + +} // namespace managed + +} // namespace memory +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_