From 2b48e220d954c31fa6b0a226821b4da9b6988e66 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Sun, 11 Feb 2024 23:26:47 +0200 Subject: [PATCH] Regards #291: Added support for untyped, owned, memory regions: * Added a `unique_region` class - non-templated * `unique_region`'s "decay" into `cuda::memory::region_t`'s - so there's no need to reimplement half the world for them * Added a missing variant of `cuda::memory::copy()` * Added an example program based on vectorAdd, which uses unique_regions (I would have replaced vectorAdd with it - had it not required two lines of source more than before...) * Removed some commented-out unique-pointer-related code --- examples/CMakeLists.txt | 1 + examples/by_api_module/unified_addressing.cpp | 10 +- .../vectorAdd_unique_regions.cu | 82 ++++++ src/cuda/api.hpp | 2 + src/cuda/api/memory.hpp | 10 + .../api/multi_wrapper_impls/unique_region.hpp | 99 +++++++ src/cuda/api/types.hpp | 48 ++-- src/cuda/api/unique_region.hpp | 249 ++++++++++++++++++ 8 files changed, 481 insertions(+), 20 deletions(-) create mode 100644 examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu create mode 100644 src/cuda/api/multi_wrapper_impls/unique_region.hpp create mode 100644 src/cuda/api/unique_region.hpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d4b9cb84..701fdaa4 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -56,6 +56,7 @@ link_libraries(cuda-api-wrappers::runtime-and-driver) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "bin") add_executable(vectorAdd modified_cuda_samples/vectorAdd/vectorAdd.cu) +add_executable(vectorAdd_unique_regions modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu) add_executable(vectorAddMapped modified_cuda_samples/vectorAddMapped/vectorAddMapped.cu) add_executable(vectorAddManaged modified_cuda_samples/vectorAddManaged/vectorAddManaged.cu) add_executable(vectorAdd_nvrtc 
modified_cuda_samples/vectorAdd_nvrtc/vectorAdd_nvrtc.cpp) diff --git a/examples/by_api_module/unified_addressing.cpp b/examples/by_api_module/unified_addressing.cpp index 1cc545c0..ffd7d67a 100644 --- a/examples/by_api_module/unified_addressing.cpp +++ b/examples/by_api_module/unified_addressing.cpp @@ -28,13 +28,13 @@ void pointer_properties(const cuda::device_t& device) cuda::context::create(device), cuda::context::create(device) }; - cuda::memory::device::unique_ptr regions[2] = { - cuda::memory::make_unique(contexts[0], fixed_size), - cuda::memory::make_unique(contexts[1], fixed_size) + cuda::memory::device::unique_region regions[2] = { + cuda::memory::make_unique_region(contexts[0], fixed_size), + cuda::memory::make_unique_region(contexts[1], fixed_size) }; void* raw_pointers[2] = { - regions[0].get(), - regions[1].get() + regions[0].data(), + regions[1].data() }; cuda::memory::pointer_t pointers[2] = { cuda::memory::pointer::wrap(raw_pointers[0]), diff --git a/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu b/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu new file mode 100644 index 00000000..5bcc5971 --- /dev/null +++ b/examples/modified_cuda_samples/vectorAdd_unique_regions/vectorAdd_unique_regions.cu @@ -0,0 +1,82 @@ +/** + * Derived from the nVIDIA CUDA 8.0 samples by + * + * Eyal Rozenberg + * + * The derivation is specifically permitted in the nVIDIA CUDA Samples EULA + * and the deriver is the owner of this code according to the EULA. + * + * Use this reasonably. If you want to discuss licensing formalities, please + * contact the author. 
+ */ + +#include <cuda/api.hpp> + +#include <algorithm> +#include <iostream> +#include <memory> + +__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i < numElements) { C[i] = A[i] + B[i]; } +} + +int main() +{ + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << "\n"; + exit(EXIT_FAILURE); + } + + int numElements = 50000; + std::cout << "[Vector addition of " << numElements << " elements]\n"; + + // If we could rely on C++14, we would use std::make_unique + auto h_A = std::unique_ptr<float[]>(new float[numElements]); + auto h_B = std::unique_ptr<float[]>(new float[numElements]); + auto h_C = std::unique_ptr<float[]>(new float[numElements]); + + auto generator = []() { return rand() / (float) RAND_MAX; }; + std::generate(h_A.get(), h_A.get() + numElements, generator); + std::generate(h_B.get(), h_B.get() + numElements, generator); + + auto device = cuda::device::current::get(); + auto d_A = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto d_B = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto d_C = cuda::memory::make_unique_region(device, numElements * sizeof(float)); + auto sp_A = d_A.as_span<float>(); + auto sp_B = d_B.as_span<float>(); + auto sp_C = d_C.as_span<float>(); + + cuda::memory::copy(sp_A, h_A.get()); + cuda::memory::copy(sp_B, h_B.get()); + + auto launch_config = cuda::launch_config_builder() + .overall_size(numElements) + .block_size(256) + .build(); + + std::cout + << "CUDA kernel launch with " << launch_config.dimensions.grid.x + << " blocks of " << launch_config.dimensions.block.x << " threads each\n"; + + cuda::launch( + vectorAdd, launch_config, + sp_A.data(), sp_B.data(), sp_C.data(), numElements + ); + + cuda::memory::copy(h_C.get(), d_C); + + // Verify that the result vector is correct + for (int i = 0; i < numElements; ++i) { + if (fabs(h_A.get()[i] + h_B.get()[i] - h_C.get()[i]) > 1e-5) { + std::cerr << "Result verification failed at element " << i << "\n"; + exit(EXIT_FAILURE); + } + } + + std::cout 
<< "Test PASSED\n"; + std::cout << "SUCCESS\n"; +} + diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index 1f556219..b46c327e 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -31,6 +31,7 @@ #include "api/memory_pool.hpp" #endif #include "api/unique_ptr.hpp" +#include "api/unique_region.hpp" #include "api/link_options.hpp" #include "api/device.hpp" @@ -67,6 +68,7 @@ #include "api/multi_wrapper_impls/context.hpp" #include "api/multi_wrapper_impls/stream.hpp" #include "api/multi_wrapper_impls/memory.hpp" +#include "api/multi_wrapper_impls/unique_region.hpp" #include "api/multi_wrapper_impls/virtual_memory.hpp" #include "api/multi_wrapper_impls/kernel.hpp" #include "api/multi_wrapper_impls/kernel_launch.hpp" diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 16436c8f..bb56ea12 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -503,6 +503,16 @@ inline void copy(region_t destination, void* source, size_t num_bytes) return copy(destination.start(), source, num_bytes); } +inline void copy(void* destination, const_region_t source, size_t num_bytes) +{ +#ifndef NDEBUG + if (source.size() < num_bytes) { + throw ::std::logic_error("Number of bytes to copy exceeds source size"); + } +#endif + return copy(destination, source.start(), num_bytes); +} + inline void copy(region_t destination, void* source) { return copy(destination, source, destination.size()); diff --git a/src/cuda/api/multi_wrapper_impls/unique_region.hpp b/src/cuda/api/multi_wrapper_impls/unique_region.hpp new file mode 100644 index 00000000..3ce64635 --- /dev/null +++ b/src/cuda/api/multi_wrapper_impls/unique_region.hpp @@ -0,0 +1,99 @@ +/** + * @file + * + * @brief Implementations of @ref `make_unique_region()` functions + */ +#pragma once +#ifndef MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ +#define MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ + +#include "../unique_region.hpp" +#include "../types.hpp" + +namespace cuda { + +namespace memory { + +namespace device { + +inline 
unique_region make_unique_region(const context_t& context, cuda::size_t num_elements) +{ + return detail_::make_unique_region(context.handle(), num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param device on which to construct the array of elements + * @param num_elements the number of elements to allocate + * @return an ::std::unique_ptr pointing to the constructed T array + */ +inline unique_region make_unique_region(const device_t& device, size_t num_elements) +{ + auto pc = device.primary_context(); + CAW_SET_SCOPE_CONTEXT(pc.handle()); + return make_unique_region(pc, num_elements); +} + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory on the current device. + * + * @note The allocation will be made in the device's primary context - + * which will be created if it has not yet been. + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param num_elements the number of elements to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array + */ +inline unique_region make_unique_region(size_t num_elements) +{ + auto current_device_id = cuda::device::current::detail_::get_id(); + auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); + return make_unique_region(pc, num_elements); +} + +} // namespace device + +namespace managed { + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + CAW_SET_SCOPE_CONTEXT(context.handle()); + return unique_region { detail_::allocate_in_current_context(num_bytes, initial_visibility) }; +} + +inline unique_region make_unique_region( + const device_t& device, + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + auto pc = device.primary_context(); + return 
make_unique_region(pc, num_bytes, initial_visibility); +} + +inline unique_region make_unique_region( + size_t num_bytes, + initial_visibility_t initial_visibility) +{ + auto current_device_id = cuda::device::current::detail_::get_id(); + auto pc = cuda::device::primary_context::detail_::leaky_get(current_device_id); + return make_unique_region(pc, num_bytes, initial_visibility); +} + +} // namespace managed + +} // namespace memory + +} // namespace cuda + +#endif // MULTI_WRAPPER_IMPLS_UNIQUE_REGION_HPP_ + diff --git a/src/cuda/api/types.hpp b/src/cuda/api/types.hpp index 3c96ba07..07dcfe7d 100644 --- a/src/cuda/api/types.hpp +++ b/src/cuda/api/types.hpp @@ -637,14 +637,18 @@ namespace detail_ { // Note: T should be either void or void const, nothing else template class base_region_t { +public: + using pointer = T*; + using const_pointer = const T*; + using size_type = size_t; + private: - T* start_ = nullptr; - size_t size_in_bytes_ = 0; + pointer start_ = nullptr; + size_type size_in_bytes_ = 0; - using char_type = typename ::std::conditional<::std::is_const::value, const char *, char *>::type; public: base_region_t() noexcept = default; - base_region_t(T* start, size_t size_in_bytes) noexcept + base_region_t(pointer start, size_t size_in_bytes) noexcept : start_(start), size_in_bytes_(size_in_bytes) {} base_region_t(device::address_t start, size_t size_in_bytes) noexcept : start_(as_pointer(start)), size_in_bytes_(size_in_bytes) {} @@ -657,7 +661,7 @@ class base_region_t { } template - explicit operator span() const + span as_span() const NOEXCEPT_IF_NDEBUG { static_assert( ::std::is_const::value or not ::std::is_const::type>::value, @@ -675,13 +679,16 @@ class base_region_t { } - T*& start() noexcept { return start_; } - size_t& size() noexcept { return size_in_bytes_; } + template + operator span() const NOEXCEPT_IF_NDEBUG { return as_span(); } + + pointer& start() noexcept { return start_; } + size_type& size() noexcept { return size_in_bytes_; } - size_t 
size() const noexcept { return size_in_bytes_; } - T* start() const noexcept { return start_; } - T* data() const noexcept { return start(); } - T* get() const noexcept { return start(); } + size_type size() const noexcept { return size_in_bytes_; } + pointer start() const noexcept { return start_; } + pointer data() const noexcept { return start(); } + pointer get() const noexcept { return start(); } device::address_t device_address() const noexcept { @@ -702,7 +709,10 @@ class base_region_t { throw ::std::invalid_argument("subregion exceeds original region bounds"); } #endif - return { static_cast(start_) + offset_in_bytes, size_in_bytes }; + using char_type = typename ::std::conditional<::std::is_const::value, const char *, char *>::type; + return { + static_cast(start_) + offset_in_bytes, size_in_bytes + }; } }; @@ -724,7 +734,11 @@ bool operator!=(const base_region_t& lhs, const base_region_t& rhs) struct region_t : public detail_::base_region_t { using base_region_t::base_region_t; - region_t subregion(size_t offset_in_bytes, size_t size_in_bytes) const + using base_region_t::pointer; + using base_region_t::const_pointer; + using base_region_t::size_type; + + region_t subregion(size_type offset_in_bytes, size_type size_in_bytes) const { auto parent_class_subregion = base_region_t::subregion(offset_in_bytes, size_in_bytes); return { parent_class_subregion.data(), parent_class_subregion.size() }; @@ -733,8 +747,12 @@ struct region_t : public detail_::base_region_t { struct const_region_t : public detail_::base_region_t { using base_region_t::base_region_t; - const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} - const_region_t subregion(size_t offset_in_bytes, size_t size_in_bytes) const + using base_region_t::pointer; + using base_region_t::const_pointer; + using base_region_t::size_type; + + const_region_t(const region_t& r) : base_region_t(r.start(), r.size()) {} + const_region_t subregion(size_type offset_in_bytes, size_type 
size_in_bytes) const { auto parent_class_subregion = base_region_t::subregion(offset_in_bytes, size_in_bytes); return { parent_class_subregion.data(), parent_class_subregion.size() }; diff --git a/src/cuda/api/unique_region.hpp b/src/cuda/api/unique_region.hpp new file mode 100644 index 00000000..98f21b7f --- /dev/null +++ b/src/cuda/api/unique_region.hpp @@ -0,0 +1,249 @@ +/** + * @file + * + * @brief A smart pointer for CUDA device- and host-side memory, similar + * to the standard library's ::std::unique_ptr. + * + * @note Unique pointers, like any (wrapped) memory allocations, do _not_ extend the lifetime of + * contexts (primary or otherwise). In particular, they do not increase primary context refcounts. + * + */ +#ifndef CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_ +#define CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_ + +#include "memory.hpp" +#include + +namespace cuda { +namespace memory { + +/** + * A class for holding a @ref `region_t` of memory owned "uniquely" by + * its creator - similar to how `::std::unique_ptr` holds a uniquely- + * owned pointer. + * + * @note The class is not templated on the element type - since that + * is quite immaterial to its management (as well as its copying etc.) + * + * @tparam Deleter Similar to @ref ::std::unique_ptr's Deleter parameter; + * it needs to be default-constructible and have an operator(). + * + * @todo : Should we really expose the region parent class? We could, + * instead, have a `get()` or `region()` method which provides lvalue-ref + * access to it. + * + * @todo: Should this be called a `unique_region`, a-la-`unique_ptr`? Or + * perhaps `unique_ptr` is a misnomer, and should have been called `owned_ptr` + * (as opposed to `shared_ptr`), and regardless, this should be called an + * `owned_region`? 
+ */ +template <typename Deleter> +class unique_region : public region_t { +public: // types + using parent = region_t; + using region_t::pointer; + using region_t::const_pointer; + using region_t::size_type; + using deleter_type = Deleter; + // and _no_ element_type! + +public: + + /// Default constructor, creates an empty unique_region which owns nothing + constexpr unique_region() noexcept = default; + + /// Act like the default constructor for nullptr_t's + constexpr unique_region(::std::nullptr_t) noexcept : unique_region() { } + + /// Take ownership of an existing region + explicit unique_region(region_t region) noexcept : region_t{region} { } + + // Note: No constructor which also takes a deleter. We do not hold a deleter + // member - unlike unique_ptr's. If we wanted a general-purpose unique region + // that's not just GPU allocation-oriented, we might have had one of those. + + /// Move constructor. + unique_region(unique_region&& other) noexcept : unique_region(other.release()) { } + // Disable copy construction + unique_region(const unique_region&) = delete; + + // Note: No conversion from "another type" like with ::std::unique_pointer, since + // this class is not variant with the element type; and there's not much sense in + // supporting conversion of memory between different deleters (/ allocators). + + ~unique_region() noexcept + { + if (data() != nullptr) { + deleter_type{}(data()); + } + start() = nullptr; +#ifndef NDEBUG + size() = 0; +#endif + } + + /// No copy-assignment - that would break our ownership guarantee + unique_region& operator=(const unique_region&) = delete; + + /// A Move-assignment operator, which takes ownership of the other region + unique_region& operator=(unique_region&& other) noexcept + { + reset(other.release()); + return *this; + } + + // No "assignment from another type", as per the note above. + + /// Reset the %unique_region to empty, invoking the deleter if necessary. 
+ unique_region& + operator=(::std::nullptr_t) noexcept + { + reset(); + return *this; + } + + /// No plain dereferencing - as there is no guarantee that any object has been + /// initialized at those locations, nor do we know its type + + /// TODO: Should we support arrow-dereferencing? + + operator const_region_t() const noexcept { return *this; } + + /// Return the stored pointer. + region_t get() const noexcept { return *this; } + + /// Return a deleter of the fixed type (it can't be a reference - + /// we don't keep a deleter object) + deleter_type get_deleter() const noexcept { return Deleter{}; } + + /// Return @c true if the stored pointer is not null. + explicit operator bool() const noexcept { return data() != nullptr; } + + // Modifiers. + + /// Release ownership of any stored pointer. + region_t release() noexcept + { + // TODO: Shouldn't I use move construction for release? + region_t released { *this }; + start() = nullptr; +#ifndef NDEBUG + size() = 0; +#endif + return released; + } + + /** @brief Replace the stored pointer. + * + * @param ptr The new pointer to store. + * + * The deleter will be invoked if a pointer is already owned. + */ + void reset(region_t region = region_t{}) + { + ::std::swap(*this, region); + if (region.start() != nullptr) { + get_deleter()(region); + } + } + + /// Exchange the pointer and deleter with another object. + void swap(unique_region& other) noexcept + { + ::std::swap(*this, other); + } +}; // class unique_region + +namespace device { + +using unique_region = memory::unique_region; + +namespace detail_ { + +inline unique_region make_unique_region(const context::handle_t context_handle, size_t num_bytes) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return unique_region{ allocate_in_current_context(num_bytes) }; +} + +} // namespace detail_ + +/** + * @brief Create a variant of ::std::unique_pointer for an array in + * device-global memory. 
+ * + * @note CUDA's runtime API always has a current device; but - + * there is not necessarily a current context; so a primary context + * for a device may be created through this call. + * + * @tparam T an array type; _not_ the type of individual elements + * + * @param context The CUDA device context in which to make the + * allocation. + * @param num_bytes the number of bytes to allocate + * + * @return an ::std::unique_ptr pointing to the constructed T array +*/ +inline unique_region make_unique_region(const context_t& context, size_t num_bytes); +inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +inline unique_region make_unique_region(size_t num_bytes); + +} // namespace device + + +/// See @ref `device::make_unique_region(const context_t& context, size_t num_elements)` +inline device::unique_region make_unique_region(const context_t& context, size_t num_elements) +{ + return device::make_unique_region(context, num_elements); +} + +/// See @ref `device::make_unique_region(const device_t& device, size_t num_elements)` +inline device::unique_region make_unique_region(const device_t& device, size_t num_elements) +{ + return device::make_unique_region(device, num_elements); +} + +namespace host { + +using unique_region = memory::unique_region; + +inline unique_region make_unique_region(const context_t& context, size_t num_bytes); +inline unique_region make_unique_region(const device_t& device, size_t num_bytes); +inline unique_region make_unique_region(size_t num_bytes); + +} // namespace host + +namespace managed { + +using unique_region = memory::unique_region; + +namespace detail_ { + +inline unique_region make_unique_region( + const context::handle_t context_handle, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices) +{ + CAW_SET_SCOPE_CONTEXT(context_handle); + return unique_region { allocate_in_current_context(num_bytes, initial_visibility) }; +} + +} // namespace 
detail_ + +inline unique_region make_unique_region( + const context_t& context, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); +inline unique_region make_unique_region( + const device_t& device, + size_t num_bytes, + initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices); +inline unique_region make_unique_region( + size_t num_bytes); + +} // namespace managed + +} // namespace memory +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_UNIQUE_REGION_HPP_