Regards #291: Added support for untyped, owned, memory regions:
* Added a `unique_region` class - non-templated
* `unique_region`s "decay" into `cuda::memory::region_t`s - so there's no need to reimplement half the world for them
* Added a missing variant of `cuda::memory::copy()`
* Added an example program based on vectorAdd, which uses unique_regions (I would have replaced vectorAdd with it - had it not required two more lines of source than before...)
* `region_t`s now have an `as_span` method, as well as an `operator span()` (see the usage sketch right after this list)
* Removed some commented-out unique-pointer-related code
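
A minimal usage sketch of the new API, assuming only what the bullets above describe (the size and element type here are arbitrary):

auto device = cuda::device::current::get();
// An owned, untyped region of 1024 bytes of device-global memory,
// freed automatically when `region` goes out of scope:
auto region = cuda::memory::device::make_unique_region(device, 1024);
// "Decays" into a non-owning region_t, so region-taking APIs work as-is:
cuda::memory::region_t plain = region;
// Typed, non-owning view of the same bytes:
auto floats = region.as_span<float>();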
eyalroz committed Feb 27, 2024
1 parent cdac862 commit 660b51b
Showing 7 changed files with 477 additions and 5 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -56,6 +56,7 @@ link_libraries(cuda-api-wrappers::runtime-and-driver)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin")
add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu)
+ add_executable(vectorAdd_unique_regions modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu)
add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu)
add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu)
add_executable(vectorAdd_nvrtc modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp)
10 changes: 5 additions & 5 deletions examples/by_api_module/unified_addressing.cpp
@@ -28,13 +28,13 @@ void pointer_properties(const cuda::device_t& device)
cuda::context::create(device),
cuda::context::create(device)
};
- cuda::memory::device::unique_ptr<char[]> regions[2] = {
- cuda::memory::make_unique<char[]>(contexts[0], fixed_size),
- cuda::memory::make_unique<char[]>(contexts[1], fixed_size)
+ cuda::memory::device::unique_region regions[2] = {
+ cuda::memory::make_unique_region(contexts[0], fixed_size),
+ cuda::memory::make_unique_region(contexts[1], fixed_size)
};
void* raw_pointers[2] = {
- regions[0].get(),
- regions[1].get()
+ regions[0].data(),
+ regions[1].data()
};
cuda::memory::pointer_t<void> pointers[2] = {
cuda::memory::pointer::wrap(raw_pointers[0]),
85 changes: 85 additions & 0 deletions examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu
@@ -0,0 +1,85 @@
/**
* Derived from the nVIDIA CUDA 8.0 samples by
*
* Eyal Rozenberg
*
* The derivation is specifically permitted in the nVIDIA CUDA Samples EULA
* and the deriver is the owner of this code according to the EULA.
*
* Use this reasonably. If you want to discuss licensing formalities, please
* contact the author.
*/

#include "../../common.hpp"

#include <cuda/api.hpp>

#include <iostream>
#include <memory>
#include <algorithm>
#include <cmath>   // for fabs
#include <cstdlib> // for rand

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) { C[i] = A[i] + B[i]; }
}

int main()
{
if (cuda::device::count() == 0) {
std::cerr << "No CUDA devices on this system" << "\n";
exit(EXIT_FAILURE);
}

int numElements = 50000;
std::cout << "[Vector addition of " << numElements << " elements]\n";

// If we could rely on C++14, we would use std::make_unique
auto h_A = std::unique_ptr<float[]>(new float[numElements]);
auto h_B = std::unique_ptr<float[]>(new float[numElements]);
auto h_C = std::unique_ptr<float[]>(new float[numElements]);

auto generator = []() { return rand() / (float) RAND_MAX; };
std::generate(h_A.get(), h_A.get() + numElements, generator);
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto device = cuda::device::current::get();

auto d_A = cuda::memory::make_unique_region(device, numElements * sizeof(float));
auto d_B = cuda::memory::make_unique_region(device, numElements * sizeof(float));
auto d_C = cuda::memory::make_unique_region(device, numElements * sizeof(float));
auto sp_A = d_A.as_span<float>();
auto sp_B = d_B.as_span<float>();
auto sp_C = d_C.as_span<float>();

cuda::memory::copy(sp_A, h_A.get());
cuda::memory::copy(sp_B, h_B.get());

auto launch_config = cuda::launch_config_builder()
.overall_size(numElements)
.block_size(256)
.build();

std::cout
<< "CUDA kernel launch with " << launch_config.dimensions.grid.x
<< " blocks of " << launch_config.dimensions.block.x << " threads each\n";

cuda::launch(
vectorAdd, launch_config,
sp_A.data(), sp_B.data(), sp_C.data(), numElements
);

cuda::memory::copy(h_C.get(), sp_C);

// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) {
std::cerr << "Result verification failed at element " << i << "\n";
exit(EXIT_FAILURE);
}
}

std::cout << "Test PASSED\n";
std::cout << "SUCCESS\n";
}

2 changes: 2 additions & 0 deletions src/cuda/api.hpp
@@ -32,6 +32,7 @@
#include "api/memory_pool.hpp"
#endif
#include "api/unique_ptr.hpp"
#include "api/unique_region.hpp"
#include "api/link_options.hpp"

#include "api/device.hpp"
@@ -68,6 +69,7 @@
#include "api/multi_wrapper_impls/context.hpp"
#include "api/multi_wrapper_impls/stream.hpp"
#include "api/multi_wrapper_impls/memory.hpp"
#include "api/multi_wrapper_impls/unique_region.hpp"
#include "api/multi_wrapper_impls/virtual_memory.hpp"
#include "api/multi_wrapper_impls/kernel.hpp"
#include "api/multi_wrapper_impls/kernel_launch.hpp"
10 changes: 10 additions & 0 deletions src/cuda/api/memory.hpp
@@ -528,6 +528,16 @@ inline void copy(region_t destination, void* source, size_t num_bytes)
return copy(destination.start(), source, num_bytes);
}

+ inline void copy(void* destination, const_region_t source, size_t num_bytes)
+ {
+ #ifndef NDEBUG
+ if (source.size() < num_bytes) {
+ throw ::std::logic_error("Number of bytes to copy exceeds source size");
+ }
+ #endif
+ return copy(destination, source.start(), num_bytes);
+ }
+
inline void copy(region_t destination, void* source)
{
return copy(destination, source, destination.size());
}
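
For illustration, a sketch of the new `copy()` overload in use - the names are hypothetical, with `d_region` assumed to be a valid `cuda::memory::const_region_t` referring to device memory:

std::vector<char> host_buffer(d_region.size() / 2); // requires <vector>
// Copy the first half of the device-side region back to the host; in debug
// builds, requesting more than d_region.size() bytes would throw
// ::std::logic_error, per the check above.
cuda::memory::copy(host_buffer.data(), d_region, host_buffer.size());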
122 changes: 122 additions & 0 deletions src/cuda/api/multi_wrapper_impls/unique_region.hpp
@@ -0,0 +1,122 @@
/**
* @file
*
* @brief Implementations of the @ref make_unique_region() functions
*/
#pragma once
#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_
#define MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_

#include "../unique_region.hpp"
#include "../types.hpp"

namespace cuda {

namespace memory {

namespace device {

inline unique_region make_unique_region(const context_t& context, cuda::size_t num_bytes)
{
	return detail_::make_unique_region(context.handle(), num_bytes);
}

/**
 * @brief Allocate an owned, untyped region of device-global memory
 *
 * @param device the device in whose global memory to make the allocation
 * (within its primary context)
 * @param num_bytes size of the region to allocate, in bytes
 * @return a unique_region owning the allocated memory
 */
inline unique_region make_unique_region(const device_t& device, size_t num_bytes)
{
	auto pc = device.primary_context();
	return make_unique_region(pc, num_bytes);
}

/**
 * @brief Allocate an owned, untyped region of device-global memory
 * on the current device.
 *
 * @note The allocation will be made in the device's primary context -
 * which will be created if it has not yet been.
 *
 * @param num_bytes size of the region to allocate, in bytes
 *
 * @return a unique_region owning the allocated memory
 */
inline unique_region make_unique_region(size_t num_bytes)
{
	auto current_device_id = cuda::device::current::detail_::get_id();
	auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
	return make_unique_region(pc, num_bytes);
}

} // namespace device

namespace host {

inline unique_region make_unique_region(
const context_t& context,
size_t num_bytes,
allocation_options options)
{
CAW_SET_SCOPE_CONTEXT(context.handle());
return unique_region{ allocate(num_bytes, options) };
}

inline unique_region make_unique_region(const device_t& device, size_t num_bytes)
{
auto pc = device.primary_context();
return make_unique_region(pc, num_bytes);
}

inline unique_region make_unique_region(size_t num_bytes)
{
return unique_region { allocate(num_bytes) };
}

} // namespace host

namespace managed {

inline unique_region make_unique_region(
const context_t& context,
size_t num_bytes,
initial_visibility_t initial_visibility)
{
CAW_SET_SCOPE_CONTEXT(context.handle());
return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) };
}

inline unique_region make_unique_region(
const device_t& device,
size_t num_bytes,
initial_visibility_t initial_visibility)
{
auto pc = device.primary_context();
return make_unique_region(pc, num_bytes, initial_visibility);
}

inline unique_region make_unique_region(
size_t num_bytes,
initial_visibility_t initial_visibility)
{
auto current_device_id = cuda::device::current::detail_::get_id();
auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
return make_unique_region(pc, num_bytes, initial_visibility);
}

} // namespace managed

} // namespace memory

} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_
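
Taken together, a short sketch of the three allocation flavors implemented above (assuming the `initial_visibility_t` enumerator is spelled as elsewhere in this library):

auto dev = cuda::device::current::get();
// Device-global memory, allocated in dev's primary context:
auto d = cuda::memory::device::make_unique_region(dev, 4096);
// Pinned host memory:
auto h = cuda::memory::host::make_unique_region(4096);
// Managed ("unified") memory, initially visible to all devices:
auto m = cuda::memory::managed::make_unique_region(
	4096, cuda::memory::managed::initial_visibility_t::to_all_devices);
// All three regions release their allocations on destruction.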

252 changes: 252 additions & 0 deletions src/cuda/api/unique_region.hpp
