From 3de5fe5f89b60a7e49f5515a7d7612ddfea47f49 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Mon, 6 Feb 2023 12:58:10 +0200 Subject: [PATCH] Fixes #466, fixes #467, fixes #468 : * Added a memory::type_t enum value, `non_cuda`, for memory not allocated by CUDA * `memory::type_of()` will no longer fail for non-CUDA-allocated memory * `copy_parameters_t` is now out of `detail_`. * The `endpoint_t` enum is now also out of `detail_` * Beefed up `copy_parameters_t` with additional convenience methods * `copy_parameters_t` now "recognizes" non-CUDA-allocated memory, treating it like CUDA host memory * Our array-to-raw-mem copy function now determines the destination address' context * Add an array-to-raw-mem copy function which takes the destination context explicitly --- .../unified_addressing.cpp | 1 + src/cuda/api.hpp | 1 + src/cuda/api/context.hpp | 1 + src/cuda/api/copy_parameters.hpp | 561 ++++++++++++++++++ src/cuda/api/memory.hpp | 326 ++++------ src/cuda/api/multi_wrapper_impls/memory.hpp | 17 +- src/cuda/api/pointer.hpp | 17 +- 7 files changed, 695 insertions(+), 229 deletions(-) create mode 100644 src/cuda/api/copy_parameters.hpp diff --git a/examples/by_runtime_api_module/unified_addressing.cpp b/examples/by_runtime_api_module/unified_addressing.cpp index 97fd2867..da9883c5 100644 --- a/examples/by_runtime_api_module/unified_addressing.cpp +++ b/examples/by_runtime_api_module/unified_addressing.cpp @@ -107,6 +107,7 @@ void wrapped_pointers_and_regions(const cuda::device_t& device) switch (cuda::memory::type_of(ptr)) { using namespace cuda::memory; + case non_cuda: die_("Pointer incorrectly reported to point into non-CUDA-allocated memory"); break; case host_: die_("Pointer incorrectly reported to point into host memory"); break; case array: die_("Pointer incorrectly reported to point to array memory"); break; // case unregistered_memory: die_("Pointer incorrectly reported to point to \"unregistered\" memory"); break; diff --git a/src/cuda/api.hpp 
b/src/cuda/api.hpp index 78e187af..a3206384 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 78849e05..be2bb838 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -103,6 +103,7 @@ constexpr flags_t inline make_flags( | (keep_larger_local_mem_after_resize ? CU_CTX_LMEM_RESIZE_TO_MAX : 0) ); } +// consider renaming this: device_id_of inline device::id_t get_device_id(handle_t context_handle) { auto needed_push = current::detail_::push_if_not_on_top(context_handle); diff --git a/src/cuda/api/copy_parameters.hpp b/src/cuda/api/copy_parameters.hpp new file mode 100644 index 00000000..4216b2a3 --- /dev/null +++ b/src/cuda/api/copy_parameters.hpp @@ -0,0 +1,561 @@ +/** + * @file + * + * @brief The @ref copy_parameters class template and related definitions. + */ +#ifndef CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP +#define CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP + +#include +#include +#include +#include + +namespace cuda { + +namespace memory { + +enum class endpoint_t { + source, destination +}; + +template +struct copy_parameters_t; + +namespace detail_ { + +template +struct base_copy_params; + +template<> +struct base_copy_params<2> { + using intra_context_type = CUDA_MEMCPY2D; + using type = intra_context_type; // Why is there no inter-context type, CUDA_MEMCPY2D_PEER ? 
+}; + +template<> +struct base_copy_params<3> { + using type = CUDA_MEMCPY3D_PEER; + using intra_context_type = CUDA_MEMCPY3D; +}; + +// Note these, by default, support inter-context +template +using base_copy_params_t = typename base_copy_params::type; + +template +array::dimensions_t +non_array_endpoint_dimensions(endpoint_t endpoint, const copy_parameters_t& params); + +} //namespace detail_ + + +/** + * @brief A builder-ish subclass template around the basic 2D or 3D copy + * parameters which CUDA's complex copying API actually takes. + * + * {@note This class is not "safe", in the sense that there is currently no + * checks to ensure you've actively set all fields properly before passing + * it on to the CUDA driver.} + * + * {@note this class cannot hold reference units to any contexts or allocated + * memory, so one must ensure every resource relevant to the source and the + * destination remains alive until the copy operation is both scheduled _and + * executed_.} + */ +template +struct copy_parameters_t : detail_::base_copy_params_t { + using parent = detail_::base_copy_params_t; + using this_type = copy_parameters_t; + // TODO: Perhaps use proxies? 
+ + using intra_context_type = typename detail_::base_copy_params::intra_context_type; + + using dimensions_type = array::dimensions_t; + using dimension_type = array::dimension_t; + + bool is_intra_context() const noexcept { return parent::srcContext == parent::dstContext; } + + this_type& set_context(endpoint_t endpoint, const context_t& context) noexcept; + + this_type& set_single_context(endpoint_t endpoint, const context_t& context) noexcept + { + set_context(endpoint_t::source, context); + set_context(endpoint_t::destination, context); + return *this; + } + + // Note: This assumes default pitch + template + this_type& set_endpoint(endpoint_t endpoint, const cuda::array_t &array) noexcept; + + // Note: This assumes default pitch + this_type& set_endpoint_untyped( + endpoint_t endpoint, + context::handle_t context_handle, + void *ptr, + dimensions_type dimensions); + + // Note: This assumes default pitch + template + this_type& set_endpoint(endpoint_t endpoint, T *ptr, dimensions_type dimensions); + + // Note: This assumes default pitch + template + this_type& set_endpoint(endpoint_t endpoint, span span) noexcept + { + return set_endpoint(endpoint, span.data(), dimensions_type(span.size())); + } + + // Note: This assumes default pitch + // TODO: Perhaps we should have an dimensioned offset type? 
+ template + this_type& set_endpoint( + endpoint_t endpoint, + context::handle_t context_handle, + T *ptr, + dimensions_type dimensions) noexcept; + + template + this_type& set_source(const cuda::array_t &array) noexcept + { + return set_endpoint(endpoint_t::source, array); + } + + this_type& set_source_untyped(context::handle_t context_handle, void *ptr, dimensions_type dimensions) + { + return set_endpoint(endpoint_t::source, context_handle, ptr, dimensions); + } + + template + this_type& set_source(T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::source, ptr, dimensions); + } + + template + this_type& set_source(span span) noexcept + { + return set_source(span.data(), dimensions_type{span.size()}); + } + + template + this_type& set_source(context::handle_t context_handle, T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::source, context_handle, ptr, dimensions); + } + + template + this_type& set_destination(const cuda::array_t &array) noexcept + { + return set_endpoint(endpoint_t::destination, array); + } + + void set_destination_untyped( + context::handle_t context_handle, + void *ptr, + dimensions_type dimensions) noexcept + { + set_endpoint(endpoint_t::destination, context_handle, ptr, dimensions); + } + + template + this_type& set_destination(T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::destination, ptr, dimensions); + } + + template + this_type& set_destination(span span) noexcept + { + return set_destination(span.data(), {span.size(), 1, 1}); + } + + + template + this_type& set_destination(context::handle_t context_handle, T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::destination, context_handle, ptr, dimensions); + } + + this_type& set_bytes_offset(endpoint_t endpoint, dimensions_type offset) noexcept; + + // TODO: Perhaps we should have an dimensioned offset type? 
+ template + this_type& set_offset(endpoint_t endpoint, dimensions_type offset) noexcept; + + this_type& clear_offset(endpoint_t endpoint) noexcept + { + return set_bytes_offset(endpoint, dimensions_type::zero()); + } + + this_type& clear_offsets() noexcept + { + clear_offset(endpoint_t::source); + clear_offset(endpoint_t::destination); + return *this; + } + + this_type& set_bytes_pitch(endpoint_t endpoint, dimension_type pitch_in_bytes) noexcept + { + (endpoint == endpoint_t::source ? parent::srcPitch : parent::dstPitch) = pitch_in_bytes; + return *this; + } + + template + this_type& set_pitch(endpoint_t endpoint, dimension_type pitch_in_elements) noexcept + { + return set_bytes_pitch(endpoint, pitch_in_elements * sizeof(T)); + } + + template + this_type& set_pitches(dimension_type uniform_pitch_in_elements) noexcept + { + auto uniform_pitch_in_bytes { uniform_pitch_in_elements * sizeof(T) }; + set_pitch(endpoint_t::source, uniform_pitch_in_bytes); + set_pitch(endpoint_t::destination, uniform_pitch_in_bytes); + return *this; + } + + // Note: Must be called after copy extents have been set + this_type& set_default_pitch(endpoint_t endpoint) noexcept + { + return set_bytes_pitch(endpoint, parent::WidthInBytes); + } + + // Note: Must be called after copy extents have been set + this_type& set_default_pitches() noexcept + { + set_default_pitch(endpoint_t::source); + set_default_pitch(endpoint_t::destination); + return *this; + } + + this_type& set_bytes_extent(dimensions_type extent) noexcept; + + template + this_type& set_extent(dimensions_type extent) noexcept; + // Sets how much is being copies, as opposed to the sizes of the endpoints which may be larger + + dimensions_type bytes_extent() const noexcept; + + template + dimensions_type extent() const noexcept + { + auto extent_ = bytes_extent(); +#ifndef NDEBUG + if (extent_.width % sizeof(T) != 0) { + throw ::std::invalid_argument( + "Attempt to get the copy extent with assumed type of size " + + 
::std::to_string(sizeof(T)) + " while the byte extent's " + + "minor dimension is not a multiple of this size"); + } +#endif + extent_.width /= sizeof(T); + return extent_; + } + + this_type& set_pitches(array::dimension_t uniform_pitch_in_bytes) noexcept + { + set_pitch(endpoint_t::source, uniform_pitch_in_bytes); + set_pitch(endpoint_t::destination, uniform_pitch_in_bytes); + return *this; + } + + this_type& clear_rest() noexcept; + // Clear any dummy fields which are required to be set to 0. Note that important fields, + // which you have not set explicitly, will _not_ be cleared by this method. + +}; + +template<> +inline copy_parameters_t<2>& copy_parameters_t<2>::set_endpoint_untyped( + endpoint_t endpoint, + context::handle_t, + void * ptr, + array::dimensions_t<2> dimensions) +{ + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::array) { + throw ::std::invalid_argument("Attempt to use the non-array endpoint setter with array memory at " + cuda::detail_::ptr_as_hex(ptr)); + } + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) + { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } + else { + // Either memory::type_t::host or memory::type_t::non_cuda + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } + } + set_bytes_pitch(endpoint, dimensions.width); + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) + (memory_type == memory::type_t::non_cuda ? memory::type_t::host_ : memory_type); + // Can't set the endpoint context - the basic data structure doesn't support that! + // (endpoint == endpoint_t::source ? 
srcContext : dstContext) = context_handle; + return *this; +} + +template<> +template +inline copy_parameters_t<2>& copy_parameters_t<2>::set_endpoint( + endpoint_t endpoint, + context::handle_t context_handle, + T *ptr, + array::dimensions_t<2> dimensions) noexcept +{ + array::dimensions_t<2> untyped_dims = {dimensions.width * sizeof(T), dimensions.height}; + return set_endpoint_untyped(endpoint, context_handle, ptr, untyped_dims); +} + +template<> +template +inline copy_parameters_t<2>& copy_parameters_t<2>::set_endpoint( + endpoint_t endpoint, + T *ptr, + array::dimensions_t<2> dimensions) +{ + // We would have _liked_ to say: + // auto context_handle = context::current::detail_::get_handle(); + // ... here, but alas, 2D copy structures don't support contexts, so... + auto context_handle = context::detail_::none; + return set_endpoint(endpoint, context_handle, ptr, dimensions); +} + +template<> +template +copy_parameters_t<2> ©_parameters_t<2>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) noexcept +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = array.device_id(); + // Can't set the endpoint context - the basic data structure doesn't support that! + return *this; +} + +namespace detail_ { + +template<> +inline array::dimensions_t<2> non_array_endpoint_dimensions<2>(endpoint_t endpoint, const copy_parameters_t<2>& params) +{ + using dims_type = copy_parameters_t<2>::dimensions_type; + return (endpoint == endpoint_t::source) ? + dims_type{ params.WidthInBytes, params.Height } : + dims_type{ params.WidthInBytes, params.Height }; +} + +template<> +inline array::dimensions_t<3> non_array_endpoint_dimensions<3>(endpoint_t endpoint, const copy_parameters_t<3>& params) +{ + using dims_type = copy_parameters_t<3>::dimensions_type; + return (endpoint == endpoint_t::source) ? 
+ dims_type{ params.srcPitch, params.Height, params.Depth } : + dims_type{ params.WidthInBytes, params.Height, params.Depth }; +} + +} // + +template<> +template +copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) noexcept +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + (endpoint == endpoint_t::source ? srcContext : dstContext) = array.context_handle(); + return *this; +} + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint_untyped( + endpoint_t endpoint, + context::handle_t context_handle, + void * ptr, + array::dimensions_t<3> dimensions) +{ + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::array) { + throw ::std::invalid_argument("Attempt to use the non-array endpoint setter with array memory at " + cuda::detail_::ptr_as_hex(ptr)); + } + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) + { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } + else { + // Either memory::type_t::host or memory::type_t::non_cuda + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } + } + set_bytes_pitch(endpoint, dimensions.width); + (endpoint == endpoint_t::source ? srcHeight : dstHeight) = dimensions.height; + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) + (memory_type == memory::type_t::non_cuda ? memory::type_t::host_ : memory_type); + (endpoint == endpoint_t::source ? srcContext : dstContext) = context_handle; + return *this; +} + +// 2D copy parameters only have an intra-context variant; should we silently assume the context +// is the same for both ends? 
+template<> +inline copy_parameters_t<2>& copy_parameters_t<2>::set_context(endpoint_t endpoint, const context_t& context) noexcept = delete; + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::set_context(endpoint_t endpoint, const context_t& context) noexcept +{ + (endpoint == endpoint_t::source ? srcContext : dstContext) = context.handle(); + return *this; +} + +template<> +template +inline copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint( + endpoint_t endpoint, + context::handle_t context_handle, + T *ptr, + array::dimensions_t<3> dimensions) noexcept +{ + array::dimensions_t<3> untyped_dims = {dimensions.width * sizeof(T), dimensions.height, dimensions.depth}; + return set_endpoint_untyped(endpoint, context_handle, ptr, untyped_dims); +} + +template<> +template +inline copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint( + endpoint_t endpoint, + T *ptr, + array::dimensions_t<3> dimensions) +{ + return set_endpoint(endpoint, context::current::detail_::get_handle(), ptr, dimensions); +} + +template<> +inline copy_parameters_t<2> ©_parameters_t<2>::clear_rest() noexcept +{ + return *this; +} + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::clear_rest() noexcept +{ + srcLOD = 0; + dstLOD = 0; + return *this; +} + +template<> +template +inline copy_parameters_t<2> ©_parameters_t<2>::set_extent(dimensions_type extent) noexcept +{ + WidthInBytes = extent.width * sizeof(T); + Height = extent.height; + return *this; +} + +template<> +inline copy_parameters_t<2>& copy_parameters_t<2>::set_bytes_extent(dimensions_type extent) noexcept +{ + WidthInBytes = extent.width; + Height = extent.height; + return *this; +} + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::set_bytes_extent(dimensions_type extent) noexcept +{ + WidthInBytes = extent.width; + Height = extent.height; + Depth = extent.depth; + return *this; +} + +template<> +template +copy_parameters_t<3>& copy_parameters_t<3>::set_extent(dimensions_type 
extent) noexcept +{ + dimensions_type extent_in_bytes{extent.width * sizeof(T), extent.height, extent.depth}; + return set_bytes_extent(extent_in_bytes); +} + +template<> +inline copy_parameters_t<3>& +copy_parameters_t<3>::set_bytes_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width; + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; + (endpoint == endpoint_t::source ? srcZ : dstZ) = offset.depth; + return *this; +} + +template<> +inline copy_parameters_t<2> & +copy_parameters_t<2>::set_bytes_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width; + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; + return *this; +} + +template<> +template +copy_parameters_t<3>& copy_parameters_t<3>::set_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + dimensions_type offset_in_bytes{offset.width * sizeof(T), offset.height, offset.depth}; + return set_bytes_offset(endpoint, offset_in_bytes); +} + +template<> +template +copy_parameters_t<2> ©_parameters_t<2>::set_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + dimensions_type offset_in_bytes{offset.width * sizeof(T), offset.height}; + return set_bytes_offset(endpoint, offset_in_bytes); +} + +copy_parameters_t<3>::intra_context_type +inline as_intra_context_parameters(const copy_parameters_t<3>& params) +{ + if (params.srcDevice != params.dstDevice) { + throw ::std::invalid_argument("Attempt to use inter-device copy parameters for an intra-context copy"); + } + if (params.srcContext != params.dstContext) { + throw ::std::invalid_argument("Attempt to use inter-context copy parameters for an intra-context copy"); + } + + // TODO: Use designated initializers in C++20 + copy_parameters_t<3>::intra_context_type result; + + result.srcXInBytes = params.srcXInBytes; + result.srcY = params.srcY; + 
result.srcZ = params.srcZ; + result.srcLOD = params.srcLOD; + result.srcMemoryType = params.srcMemoryType; + result.srcHost = params.srcHost; + result.srcDevice = params.srcDevice; + result.srcArray = params.srcArray; + result.reserved0 = nullptr; // srcContext + result.srcPitch = params.srcPitch; + result.srcHeight = params.srcHeight; + + result.dstXInBytes = params.dstXInBytes; + result.dstY = params.dstY; + result.dstZ = params.dstZ; + result.dstLOD = params.dstLOD; + result.dstMemoryType = params.dstMemoryType; + result.dstHost = params.dstHost; + result.dstDevice = params.dstDevice; + result.dstArray = params.dstArray; + result.reserved1 = nullptr; + result.dstPitch = params.dstPitch; + result.dstHeight = params.dstHeight; + + result.WidthInBytes = params.WidthInBytes; + result.Height = params.Height; + result.Depth = params.Depth; + return result; +} + +} //namespace memory + +} // namespace cuda + + +#endif //CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 9b1dcf14..eb08de0a 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -26,6 +26,7 @@ #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_ #define CUDA_API_WRAPPERS_MEMORY_HPP_ +#include #include #include #include @@ -573,196 +574,28 @@ inline void zero(T* ptr) namespace detail_ { -template -struct base_copy_params; - -template<> -struct base_copy_params<2> { - using intra_context_type = CUDA_MEMCPY2D; - using type = intra_context_type; // Why is there no inter-context type, CUDA_MEMCPY2D_PEER ? -}; - -template<> -struct base_copy_params<3> { - using type = CUDA_MEMCPY3D_PEER; - using intra_context_type = CUDA_MEMCPY3D; -}; - -// Note these, by default, support inter-context -template -using base_copy_params_t = typename base_copy_params::type; - - -enum class endpoint_t { - source, destination -}; - -template -struct copy_parameters_t : base_copy_params_t { - // TODO: Perhaps use proxies? 
- - using intra_context_type = typename base_copy_params::intra_context_type; - - using dimensions_type = array::dimensions_t; - - template - void set_endpoint(endpoint_t endpoint, const cuda::array_t &array); - - template - void set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t dimensions); - - template - void set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, - array::dimensions_t dimensions); - - // TODO: Perhaps we should have an dimensioned offset type? - template - void set_offset(endpoint_t endpoint, dimensions_type offset); - - template - void clear_offset(endpoint_t endpoint) - { set_offset(endpoint, dimensions_type::zero()); } - - template - void set_extent(dimensions_type extent); - // Sets how much is being copies, as opposed to the sizes of the endpoints which may be larger - - void clear_rest(); - // Clear any dummy fields which are required to be set to 0. Note that important fields, - // which you have not set explicitly, will _not_ be cleared by this method. - -}; - -template<> -template -void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) -{ - (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; - (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); - // Can't set the endpoint context - the basic data structure doesn't support that! -} - -template<> -template -void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) -{ - (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; - (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); - (endpoint == endpoint_t::source ? 
srcContext : dstContext) = array.context_handle(); -} - -template<> -template -inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, - array::dimensions_t<2> dimensions) -{ - if (context_handle != context::detail_::none) { - throw cuda::runtime_error( - cuda::status::named_t::not_supported, - "Inter-context copying of 2D arrays is not supported by the CUDA driver"); - } - set_endpoint<2>(endpoint, ptr, dimensions); -} - -template<> -template -inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<2> dimensions) -{ - auto memory_type = memory::type_of(ptr); - if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { - (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); - } else { - if (endpoint == endpoint_t::source) { srcHost = ptr; } - else { dstHost = ptr; } - } - (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); - (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; - // Can't set the endpoint context - the basic data structure doesn't support that! -} - -template<> -template -inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, - array::dimensions_t<3> dimensions) -{ - cuda::memory::pointer_t wrapped{ptr}; - auto memory_type = memory::type_of(ptr); - if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { - (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); - } else { - if (endpoint == endpoint_t::source) { srcHost = ptr; } - else { dstHost = ptr; } - } - (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); - (endpoint == endpoint_t::source ? srcHeight : dstHeight) = dimensions.height; - (endpoint == endpoint_t::source ? 
srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; - (endpoint == endpoint_t::source ? srcContext : dstContext) = context_handle; -} - -template<> -template -inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<3> dimensions) -{ - set_endpoint(endpoint, context::detail_::none, ptr, dimensions); -} - -template<> -inline void copy_parameters_t<2>::clear_rest() -{} - -template<> -inline void copy_parameters_t<3>::clear_rest() -{ - srcLOD = 0; - dstLOD = 0; -} - -template<> -template -inline void copy_parameters_t<2>::set_extent(dimensions_type extent) -{ - WidthInBytes = extent.width * sizeof(T); - Height = extent.height; -} - -template<> -template -void copy_parameters_t<3>::set_extent(dimensions_type extent) -{ - WidthInBytes = extent.width * sizeof(T); - Height = extent.height; - Depth = extent.depth; -} - -template<> -template -void copy_parameters_t<3>::set_offset(endpoint_t endpoint, dimensions_type offset) -{ - (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); - (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; - (endpoint == endpoint_t::source ? srcZ : dstZ) = offset.depth; -} - -template<> -template -void copy_parameters_t<2>::set_offset(endpoint_t endpoint, dimensions_type offset) -{ - (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); - (endpoint == endpoint_t::source ? 
srcY : dstY) = offset.height; -} - -void set_endpoint(endpoint_t endpoint, void *src); - inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<2> params) { + // TODO: Move this logic into the scoped ensurer class + auto context_handle = context::current::detail_::get_handle(); + if (context_handle != context::detail_::none) { + return cuMemcpy2D(¶ms); + } + auto current_device_id = cuda::device::current::detail_::get_id(); + context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id); + context::current::detail_::push(context_handle); // Note this _must_ be an intra-context copy, as inter-context is not supported // and there's no indication of context in the relevant data structures - return cuMemcpy2D(¶ms); + auto status = cuMemcpy2D(¶ms); + context::current::detail_::pop(); + cuda::device::primary_context::detail_::decrease_refcount(current_device_id); + return status; } inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params) { if (params.srcContext == params.dstContext) { + context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext}; auto *intra_context_params = reinterpret_cast::intra_context_type *>(¶ms); return cuMemcpy3D(intra_context_params); } @@ -770,14 +603,31 @@ inline status_t multidim_copy(::std::integral_constant, cop } template -status_t multidim_copy(context::handle_t context_handle, copy_parameters_t params) +status_t multidim_copy(copy_parameters_t params) { - context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{context_handle}; return multidim_copy(::std::integral_constant{}, params); } + } // namespace detail_ +/** + * @brief An almost-generalized-case memory copy, taking a rather complex structure of + * copy parameters - wrapping the CUDA driver's own most-generalized-case copy + * + * @tparam NumDimensions The number of dimensions of the parameter structure. 
+ * @param params A parameter structure with details regarding the copy source + * and destination, including CUDA context specifications, which must have been + * set in advance. This function will _not_ verify its validity, but rather + * merely pass it on to the CUDA driver + */ +template +void copy(copy_parameters_t params) +{ + status_t status = detail_::multidim_copy(params); + throw_if_error_lazy(status, "Copying using a general copy parameters structure"); +} + /** * Synchronously copies data from a CUDA array into non-array memory. * * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values * @@ -789,19 +639,34 @@ status_t multidim_copy(context::handle_t context_handle, copy_parameters_t -void copy(const array_t& destination, const T *source) +void copy(const array_t& destination, const context_t& source_context, const T *source) { - detail_::copy_parameters_t params{}; auto dims = destination.dimensions(); - params.template clear_offset(detail_::endpoint_t::source); - params.template clear_offset(detail_::endpoint_t::destination); + auto params = copy_parameters_t {}; + params.clear_offsets(); params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast(source), dims); + params.set_endpoint(endpoint_t::destination, destination); params.clear_rest(); - params.set_endpoint(detail_::endpoint_t::source, const_cast(source), dims); - params.set_endpoint(detail_::endpoint_t::destination, destination); - auto status = detail_::multidim_copy(destination.context_handle(), params); - throw_if_error(status, "Copying from a regular memory region into a CUDA array"); + copy(params); +} + +/** + * Synchronously copies data into a CUDA array from non-array memory. 
+ * + * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values + * @tparam T array element type + * + * @param destination A {@tparam NumDimensions}-dimensional CUDA array + * @param source A pointer to a region of contiguous memory holding `destination.size()` values + * of type @tparam T. The memory may be located either on a CUDA device or in host memory. + */ +template +void copy(const array_t& destination, const T *source) +{ + copy(destination, context_of(source), source); } + /** * Synchronously copies data into a CUDA array from non-array memory. * * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values * @tparam T array element type * * @param destination A {@tparam NumDimensions}-dimensional CUDA array * @param source A {@tparam NumDimensions}-dimensional CUDA array */ template -void copy(T *destination, const array_t& source) +void copy(const context_t& context, T *destination, const array_t& source) { - detail_::copy_parameters_t params{}; auto dims = source.dimensions(); - params.template clear_offset(detail_::endpoint_t::source); - params.template clear_offset(detail_::endpoint_t::destination); - params.template set_extent(source.dimensions()); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source); + params.template set_endpoint(endpoint_t::destination, context.handle(), destination, dims); + params.set_default_pitches(); params.clear_rest(); - params.set_endpoint(detail_::endpoint_t::source, source); - params.template set_endpoint(detail_::endpoint_t::destination, destination, dims); - params.dstPitch = params.srcPitch = dims.width * sizeof(T); - auto status = detail_::multidim_copy(source.context_handle(), params); - throw_if_error(status, "Copying from a CUDA array into a regular memory region"); + copy(params); +} + +/** + * Synchronously copies data from a CUDA array into non-array memory. 
+ * + * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values + * @tparam T array element type + * + * @param destination A pointer to a region of contiguous memory holding `source.size()` values + * of type @tparam T. The memory may be located either on a CUDA device or in host memory. + * @param source A {@tparam NumDimensions}-dimensional CUDA array + */ +template +void copy(T *destination, const array_t& source) +{ + copy(context_of(destination), destination, source); } template void copy(const array_t& destination, const array_t& source) { - detail_::copy_parameters_t params{}; auto dims = source.dimensions(); - params.template clear_offset(detail_::endpoint_t::source); - params.template clear_offset(detail_::endpoint_t::destination); - params.template set_extent(source.dimensions()); - params.clear_rest(); - params.set_endpoint(detail_::endpoint_t::source, source); - params.set_endpoint(detail_::endpoint_t::destination, destination); - params.dstPitch = params.srcPitch = dims.width * sizeof(T); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source); + params.set_endpoint(endpoint_t::destination, destination); + params.set_default_pitches(); + params.clear_rest(); auto status = //(source.context() == destination.context()) ?
detail_::multidim_copy(source.context_handle(), params); throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region"); @@ -930,7 +810,7 @@ inline void copy(region_t destination, const_region_t source, stream::handle_t s } ///@} -using memory::detail_::copy_parameters_t; +using memory::copy_parameters_t; inline status_t multidim_copy_in_current_context( ::std::integral_constant, @@ -978,17 +858,17 @@ status_t multidim_copy( template void copy(T *destination, const array_t& source, stream::handle_t stream_handle) { - using memory::detail_::endpoint_t; + using memory::endpoint_t; auto dims = source.dimensions(); //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - detail_::copy_parameters_t params{}; - params.template clear_offset(endpoint_t::source); - params.template clear_offset(endpoint_t::destination); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); params.template set_extent(dims); - params.clear_rest(); params.set_endpoint(endpoint_t::source, source); params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); - params.dstPitch = dims.width * sizeof(T); + params.set_default_pitches(); + params.clear_rest(); auto status = multidim_copy_in_current_context(params, stream_handle); throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); } @@ -997,17 +877,17 @@ void copy(T *destination, const array_t& source, stream::handl template void copy(const array_t& destination, const T* source, stream::handle_t stream_handle) { - using memory::detail_::endpoint_t; + using memory::endpoint_t; auto dims = destination.dimensions(); //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - detail_::copy_parameters_t params{}; - params.template clear_offset(endpoint_t::source); - params.template 
clear_offset(endpoint_t::destination); - params.template set_extent(destination.dimensions()); - params.srcPitch = dims.width * sizeof(T); - params.clear_rest(); + auto params = copy_parameters_t{}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); params.set_endpoint(endpoint_t::source, const_cast(source), dims); params.set_endpoint(endpoint_t::destination, destination); + params.set_default_pitches(); + params.clear_rest(); auto status = multidim_copy_in_current_context(params, stream_handle); throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); } diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp index 8daa2363..e60dd12c 100644 --- a/src/cuda/api/multi_wrapper_impls/memory.hpp +++ b/src/cuda/api/multi_wrapper_impls/memory.hpp @@ -497,14 +497,23 @@ namespace pointer { namespace detail_ { template -attribute_value_type_t get_attribute(const void *ptr) +status_and_attribute_value get_attribute_with_status(const void *ptr) { context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context; attribute_value_type_t attribute_value; auto status = cuPointerGetAttribute(&attribute_value, attribute, device::address(ptr)); - throw_if_error_lazy(status, "Obtaining attribute " + ::std::to_string((int) attribute) - + " for pointer " + cuda::detail_::ptr_as_hex(ptr) ); - return attribute_value; + return { status, attribute_value }; +} + + +template +attribute_value_type_t get_attribute(const void *ptr) +{ + auto status_and_attribute_value = get_attribute_with_status(ptr); + throw_if_error_lazy(status_and_attribute_value.status, + "Obtaining attribute " + ::std::to_string((int) attribute) + + " for pointer " + cuda::detail_::ptr_as_hex(ptr) ); + return status_and_attribute_value.value; } // TODO: Consider switching to a span with C++20 diff --git a/src/cuda/api/pointer.hpp 
b/src/cuda/api/pointer.hpp index 1c8cebe6..223dbe4d 100644 --- a/src/cuda/api/pointer.hpp +++ b/src/cuda/api/pointer.hpp @@ -36,7 +36,6 @@ class context_t; namespace memory { - /** * @brief see @ref memory::host, @ref memory::device, @ref memory::managed */ @@ -46,6 +45,7 @@ enum type_t : ::std::underlying_type::type { array = CU_MEMORYTYPE_ARRAY, unified_ = CU_MEMORYTYPE_UNIFIED, managed_ = CU_MEMORYTYPE_UNIFIED, // an alias (more like the runtime API name) + non_cuda = ~(::std::underlying_type::type{0}) }; #if CUDA_VERSION >= 11020 @@ -91,6 +91,15 @@ template <> struct attribute_value template using attribute_value_type_t = typename attribute_value::type; +template +struct status_and_attribute_value { + status_t status; + attribute_value_type_t value; +}; + +template +status_and_attribute_value get_attribute_with_status(const void *ptr); + template attribute_value_type_t get_attribute(const void* ptr); @@ -107,7 +116,11 @@ inline cuda::device::id_t device_id_of(const void* ptr); inline memory::type_t type_of(const void* ptr) { - return pointer::detail_::get_attribute(ptr); + auto result = pointer::detail_::get_attribute_with_status(ptr); + // Note: As of CUDA 12, CUDA treats passing a non-CUDA-allocated pointer to the memory type check + // as an error, though it really should not be + return (result.status == status::named_t::invalid_value) ? + memory::type_t::non_cuda : result.value; } inline context_t context_of(const void* ptr);