Skip to content

Commit

Permalink
Regards #291: Added support for untyped, owned, memory regions:
Browse files Browse the repository at this point in the history
* Added a `unique_region` class - non-templated
* `unique_region`'s "decay" into `cuda::memory::region_t`'s - so there's no need to reimplement half the world for them
* Added a missing variant of `cuda::memory::copy()`
* Added an example program based on vectorAdd, which uses unique_regions (I would have replaced vectorAdd with it - had it not requirted to lines of source more than before...)
* Removed some commented-out unique-pointer-related code
  • Loading branch information
eyalroz committed Feb 13, 2024
1 parent d7512ab commit 2b48e22
Show file tree
Hide file tree
Showing 8 changed files with 481 additions and 20 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ link_libraries(cuda-api-wrappers::runtime-and-driver)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin")
add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu)
add_executable(vectorAdd_unique_regions modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu)
add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu)
add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu)
add_executable(vectorAdd_nvrtc modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp)
Expand Down
10 changes: 5 additions & 5 deletions examples/by_api_module/unified_addressing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ void pointer_properties(const cuda::device_t& device)
cuda::context::create(device),
cuda::context::create(device)
};
cuda::memory::device::unique_ptr<char[]> regions[2] = {
cuda::memory::make_unique<char[]>(contexts[0], fixed_size),
cuda::memory::make_unique<char[]>(contexts[1], fixed_size)
cuda::memory::device::unique_region regions[2] = {
cuda::memory::make_unique_region(contexts[0], fixed_size),
cuda::memory::make_unique_region(contexts[1], fixed_size)
};
void* raw_pointers[2] = {
regions[0].get(),
regions[1].get()
regions[0].data(),
regions[1].data()
};
cuda::memory::pointer_t<void> pointers[2] = {
cuda::memory::pointer::wrap(raw_pointers[0]),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/**
* Derived from the nVIDIA CUDA 8.0 samples by
*
* Eyal Rozenberg
*
* The derivation is specifically permitted in the nVIDIA CUDA Samples EULA
* and the deriver is the owner of this code according to the EULA.
*
* Use this reasonably. If you want to discuss licensing formalities, please
* contact the author.
*/

#include <cuda/api.hpp>

#include <iostream>
#include <memory>
#include <algorithm>

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) { C[i] = A[i] + B[i]; }
}

int main()
{
if (cuda::device::count() == 0) {
std::cerr << "No CUDA devices on this system" << "\n";
exit(EXIT_FAILURE);
}

int numElements = 50000;
std::cout << "[Vector addition of " << numElements << " elements]\n";

// If we could rely on C++14, we would use std::make_unique
auto h_A = std::unique_ptr<float[]>(new float[numElements]);
auto h_B = std::unique_ptr<float[]>(new float[numElements]);
auto h_C = std::unique_ptr<float[]>(new float[numElements]);

auto generator = []() { return rand() / (float) RAND_MAX; };
std::generate(h_A.get(), h_A.get() + numElements, generator);
std::generate(h_B.get(), h_B.get() + numElements, generator);

auto device = cuda::device::current::get();
auto d_A = cuda::memory::make_unique_region(device, numElements);
auto d_B = cuda::memory::make_unique_region(device, numElements);
auto d_C = cuda::memory::make_unique_region(device, numElements);
auto sp_A = d_A.as_span<float>();
auto sp_B = d_A.as_span<float>();
auto sp_C = d_A.as_span<float>();

cuda::memory::copy(sp_A, h_A.get());
cuda::memory::copy(sp_B, h_B.get());

auto launch_config = cuda::launch_config_builder()
.overall_size(numElements)
.block_size(256)
.build();

std::cout
<< "CUDA kernel launch with " << launch_config.dimensions.grid.x
<< " blocks of " << launch_config.dimensions.block.x << " threads each\n";

cuda::launch(
vectorAdd, launch_config,
sp_A.data(), sp_B.data(), sp_C.data(), numElements
);

cuda::memory::copy(h_C.get(), d_C);

// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) {
std::cerr << "Result verification failed at element " << i << "\n";
exit(EXIT_FAILURE);
}
}

std::cout << "Test PASSED\n";
std::cout << "SUCCESS\n";
}

2 changes: 2 additions & 0 deletions src/cuda/api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "api/memory_pool.hpp"
#endif
#include "api/unique_ptr.hpp"
#include "api/unique_region.hpp"
#include "api/link_options.hpp"

#include "api/device.hpp"
Expand Down Expand Up @@ -67,6 +68,7 @@
#include "api/multi_wrapper_impls/context.hpp"
#include "api/multi_wrapper_impls/stream.hpp"
#include "api/multi_wrapper_impls/memory.hpp"
#include "api/multi_wrapper_impls/unique_region.hpp"
#include "api/multi_wrapper_impls/virtual_memory.hpp"
#include "api/multi_wrapper_impls/kernel.hpp"
#include "api/multi_wrapper_impls/kernel_launch.hpp"
Expand Down
10 changes: 10 additions & 0 deletions src/cuda/api/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,16 @@ inline void copy(region_t destination, void* source, size_t num_bytes)
return copy(destination.start(), source, num_bytes);
}

inline void copy(void* destination, const_region_t source, size_t num_bytes)
{
#ifndef NDEBUG
if (source.size() < num_bytes) {
throw ::std::logic_error("Number of bytes to copy exceeds source size");
}
#endif
return copy(destination, source.start(), num_bytes);
}

inline void copy(region_t destination, void* source)
{
return copy(destination, source, destination.size());
Expand Down
99 changes: 99 additions & 0 deletions src/cuda/api/multi_wrapper_impls/unique_region.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/**
* @file
*
* @brief Implementations of @ref `make_unique_region()` functions
*/
#pragma once
#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_
#define MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_

#include "../unique_region.hpp"
#include "../types.hpp"

namespace cuda {

namespace memory {

namespace device {

inline unique_region make_unique_region(const context_t& context, cuda::size_t num_elements)
{
return detail_::make_unique_region(context.handle(), num_elements);
}

/**
* @brief Create a variant of ::std::unique_pointer for an array in
* device-global memory
*
* @tparam T an array type; _not_ the type of individual elements
*
* @param device on which to construct the array of elements
* @param num_elements the number of elements to allocate
* @return an ::std::unique_ptr pointing to the constructed T array
*/
inline unique_region make_unique_region(const device_t& device, size_t num_elements)
{
auto pc = device.primary_context();
CAW_SET_SCOPE_CONTEXT(pc.handle());
return make_unique_region(pc, num_elements);
}

/**
* @brief Create a variant of ::std::unique_pointer for an array in
* device-global memory on the current device.
*
* @note The allocation will be made in the device's primary context -
* which will be created if it has not yet been.
*
* @tparam T an array type; _not_ the type of individual elements
*
* @param num_elements the number of elements to allocate
*
* @return an ::std::unique_ptr pointing to the constructed T array
*/
inline unique_region make_unique_region(size_t num_elements)
{
auto current_device_id = cuda::device::current::detail_::get_id();
auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
return make_unique_region(pc, num_elements);
}

} // namespace device

namespace managed {

inline unique_region make_unique_region(
const context_t& context,
size_t num_bytes,
initial_visibility_t initial_visibility)
{
CAW_SET_SCOPE_CONTEXT(context.handle());
return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) };
}

inline unique_region make_unique_region(
const device_t& device,
size_t num_bytes,
initial_visibility_t initial_visibility)
{
auto pc = device.primary_context();
return make_unique_region(pc, num_bytes, initial_visibility);
}

inline unique_region make_unique_region(
size_t num_bytes,
initial_visibility_t initial_visibility)
{
auto current_device_id = cuda::device::current::detail_::get_id();
auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id);
return make_unique_region(pc, num_bytes, initial_visibility);
}

} // namespace managed

} // namespace memory

} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_

48 changes: 33 additions & 15 deletions src/cuda/api/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -637,14 +637,18 @@ namespace detail_ {
// Note: T should be either void or void const, nothing else
template <class T>
class base_region_t {
public:
using pointer = T*;
using const_pointer = const T*;
using size_type = size_t;

private:
T* start_ = nullptr;
size_t size_in_bytes_ = 0;
pointer start_ = nullptr;
size_type size_in_bytes_ = 0;

using char_type = typename ::std::conditional<::std::is_const<T>::value, const char *, char *>::type;
public:
base_region_t() noexcept = default;
base_region_t(T* start, size_t size_in_bytes) noexcept
base_region_t(pointer start, size_t size_in_bytes) noexcept
: start_(start), size_in_bytes_(size_in_bytes) {}
base_region_t(device::address_t start, size_t size_in_bytes) noexcept
: start_(as_pointer(start)), size_in_bytes_(size_in_bytes) {}
Expand All @@ -657,7 +661,7 @@ class base_region_t {
}

template <typename U>
explicit operator span<U>() const
span<U> as_span() const NOEXCEPT_IF_NDEBUG
{
static_assert(
::std::is_const<U>::value or not ::std::is_const<typename ::std::remove_pointer<T>::type>::value,
Expand All @@ -675,13 +679,16 @@ class base_region_t {
}


T*& start() noexcept { return start_; }
size_t& size() noexcept { return size_in_bytes_; }
template <typename U>
operator span<U>() const NOEXCEPT_IF_NDEBUG { return as_span<U>(); }

pointer& start() noexcept { return start_; }
size_type& size() noexcept { return size_in_bytes_; }

size_t size() const noexcept { return size_in_bytes_; }
T* start() const noexcept { return start_; }
T* data() const noexcept { return start(); }
T* get() const noexcept { return start(); }
size_type size() const noexcept { return size_in_bytes_; }
pointer start() const noexcept { return start_; }
pointer data() const noexcept { return start(); }
pointer get() const noexcept { return start(); }

device::address_t device_address() const noexcept
{
Expand All @@ -702,7 +709,10 @@ class base_region_t {
throw ::std::invalid_argument("subregion exceeds original region bounds");
}
#endif
return { static_cast<char_type>(start_) + offset_in_bytes, size_in_bytes };
using char_type = typename ::std::conditional<::std::is_const<T>::value, const char *, char *>::type;
return {
static_cast<char_type>(start_) + offset_in_bytes, size_in_bytes
};
}
};

Expand All @@ -724,7 +734,11 @@ bool operator!=(const base_region_t<T>& lhs, const base_region_t<T>& rhs)

struct region_t : public detail_::base_region_t<void> {
using base_region_t<void>::base_region_t;
region_t subregion(size_t offset_in_bytes, size_t size_in_bytes) const
using base_region_t<void>::pointer;
using base_region_t<void>::const_pointer;
using base_region_t<void>::size_type;

region_t subregion(size_type offset_in_bytes, size_type size_in_bytes) const
{
auto parent_class_subregion = base_region_t<void>::subregion(offset_in_bytes, size_in_bytes);
return { parent_class_subregion.data(), parent_class_subregion.size() };
Expand All @@ -733,8 +747,12 @@ struct region_t : public detail_::base_region_t<void> {

struct const_region_t : public detail_::base_region_t<void const> {
using base_region_t<void const>::base_region_t;
const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {}
const_region_t subregion(size_t offset_in_bytes, size_t size_in_bytes) const
using base_region_t<void const>::pointer;
using base_region_t<void const>::const_pointer;
using base_region_t<void const>::size_type;

const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {}
const_region_t subregion(size_type offset_in_bytes, size_type size_in_bytes) const
{
auto parent_class_subregion = base_region_t<void const>::subregion(offset_in_bytes, size_in_bytes);
return { parent_class_subregion.data(), parent_class_subregion.size() };
Expand Down
Loading

0 comments on commit 2b48e22

Please sign in to comment.