From 3de5fe5f89b60a7e49f5515a7d7612ddfea47f49 Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Mon, 6 Feb 2023 12:58:10 +0200 Subject: [PATCH] Fixes #466, fixes #467, fixes #468 : * Added a memory::type_t enum value, `non_cuda`, for memory not allocated by CUDA * `memory::type_of()` will no longer fail for non-CUDA-allocated memory * `copy_parameters_t` is now out of `detail_`. * The `endpoint_t` enum is now also out of `detail_` * Beefed up `copy_parameters_t` with additional convenience methods * `copy_parameters_t` now "recognizes" non-CUDA-allocated memory, treating it like CUDA host memory * Our array-to-raw-mem copy function now determines the destination address' context * Add an array-to-raw-mem copy function which takes the destination context explicitly --- .../unified_addressing.cpp | 1 + src/cuda/api.hpp | 1 + src/cuda/api/context.hpp | 1 + src/cuda/api/copy_parameters.hpp | 561 ++++++++++++++++++ src/cuda/api/memory.hpp | 326 ++++------ src/cuda/api/multi_wrapper_impls/memory.hpp | 17 +- src/cuda/api/pointer.hpp | 17 +- 7 files changed, 695 insertions(+), 229 deletions(-) create mode 100644 src/cuda/api/copy_parameters.hpp diff --git a/examples/by_runtime_api_module/unified_addressing.cpp b/examples/by_runtime_api_module/unified_addressing.cpp index 97fd2867..da9883c5 100644 --- a/examples/by_runtime_api_module/unified_addressing.cpp +++ b/examples/by_runtime_api_module/unified_addressing.cpp @@ -107,6 +107,7 @@ void wrapped_pointers_and_regions(const cuda::device_t& device) switch (cuda::memory::type_of(ptr)) { using namespace cuda::memory; + case non_cuda: die_("Pointer incorrectly reported to point into non-CUDA-allocated memory"); break; case host_: die_("Pointer incorrectly reported to point into host memory"); break; case array: die_("Pointer incorrectly reported to point to array memory"); break; // case unregistered_memory: die_("Pointer incorrectly reported to point to \"unregistered\" memory"); break; diff --git a/src/cuda/api.hpp 
b/src/cuda/api.hpp index 78e187af..a3206384 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/src/cuda/api/context.hpp b/src/cuda/api/context.hpp index 78849e05..be2bb838 100644 --- a/src/cuda/api/context.hpp +++ b/src/cuda/api/context.hpp @@ -103,6 +103,7 @@ constexpr flags_t inline make_flags( | (keep_larger_local_mem_after_resize ? CU_CTX_LMEM_RESIZE_TO_MAX : 0) ); } +// consider renaming this: device_id_of inline device::id_t get_device_id(handle_t context_handle) { auto needed_push = current::detail_::push_if_not_on_top(context_handle); diff --git a/src/cuda/api/copy_parameters.hpp b/src/cuda/api/copy_parameters.hpp new file mode 100644 index 00000000..4216b2a3 --- /dev/null +++ b/src/cuda/api/copy_parameters.hpp @@ -0,0 +1,561 @@ +/** + * @file + * + * @brief The @ref copy_parameters class template and related definitions. + */ +#ifndef CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP +#define CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP + +#include +#include +#include +#include + +namespace cuda { + +namespace memory { + +enum class endpoint_t { + source, destination +}; + +template +struct copy_parameters_t; + +namespace detail_ { + +template +struct base_copy_params; + +template<> +struct base_copy_params<2> { + using intra_context_type = CUDA_MEMCPY2D; + using type = intra_context_type; // Why is there no inter-context type, CUDA_MEMCPY2D_PEER ? 
+}; + +template<> +struct base_copy_params<3> { + using type = CUDA_MEMCPY3D_PEER; + using intra_context_type = CUDA_MEMCPY3D; +}; + +// Note these, by default, support inter-context +template +using base_copy_params_t = typename base_copy_params::type; + +template +array::dimensions_t +non_array_endpoint_dimensions(endpoint_t endpoint, const copy_parameters_t& params); + +} //namespace detail_ + + +/** + * @brief A builder-ish subclass template around the basic 2D or 3D copy + * parameters which CUDA's complex copying API actually takes. + * + * {@note This class is not "safe", in the sense that there is currently no + * checks to ensure you've actively set all fields properly before passing + * it on to the CUDA driver.} + * + * {@note this class cannot hold reference units to any contexts or allocated + * memory, so one must ensure every resource relevant to the source and the + * destination remains alive until the copy operation is both scheduled _and + * executed_.} + */ +template +struct copy_parameters_t : detail_::base_copy_params_t { + using parent = detail_::base_copy_params_t; + using this_type = copy_parameters_t; + // TODO: Perhaps use proxies? 
+ + using intra_context_type = typename detail_::base_copy_params::intra_context_type; + + using dimensions_type = array::dimensions_t; + using dimension_type = array::dimension_t; + + bool is_intra_context() const noexcept { return parent::srcContext == parent::dstContext; } + + this_type& set_context(endpoint_t endpoint, const context_t& context) noexcept; + + this_type& set_single_context(endpoint_t endpoint, const context_t& context) noexcept + { + set_context(endpoint_t::source, context); + set_context(endpoint_t::destination, context); + return *this; + } + + // Note: This assumes default pitch + template + this_type& set_endpoint(endpoint_t endpoint, const cuda::array_t &array) noexcept; + + // Note: This assumes default pitch + this_type& set_endpoint_untyped( + endpoint_t endpoint, + context::handle_t context_handle, + void *ptr, + dimensions_type dimensions); + + // Note: This assumes default pitch + template + this_type& set_endpoint(endpoint_t endpoint, T *ptr, dimensions_type dimensions); + + // Note: This assumes default pitch + template + this_type& set_endpoint(endpoint_t endpoint, span span) noexcept + { + return set_endpoint(endpoint, span.data(), dimensions_type(span.size())); + } + + // Note: This assumes default pitch + // TODO: Perhaps we should have an dimensioned offset type? 
+ template + this_type& set_endpoint( + endpoint_t endpoint, + context::handle_t context_handle, + T *ptr, + dimensions_type dimensions) noexcept; + + template + this_type& set_source(const cuda::array_t &array) noexcept + { + return set_endpoint(endpoint_t::source, array); + } + + this_type& set_source_untyped(context::handle_t context_handle, void *ptr, dimensions_type dimensions) + { + return set_endpoint(endpoint_t::source, context_handle, ptr, dimensions); + } + + template + this_type& set_source(T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::source, ptr, dimensions); + } + + template + this_type& set_source(span span) noexcept + { + return set_source(span.data(), dimensions_type{span.size()}); + } + + template + this_type& set_source(context::handle_t context_handle, T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::source, context_handle, ptr, dimensions); + } + + template + this_type& set_destination(const cuda::array_t &array) noexcept + { + return set_endpoint(endpoint_t::destination, array); + } + + void set_destination_untyped( + context::handle_t context_handle, + void *ptr, + dimensions_type dimensions) noexcept + { + set_endpoint(endpoint_t::destination, context_handle, ptr, dimensions); + } + + template + this_type& set_destination(T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::destination, ptr, dimensions); + } + + template + this_type& set_destination(span span) noexcept + { + return set_destination(span.data(), {span.size(), 1, 1}); + } + + + template + this_type& set_destination(context::handle_t context_handle, T *ptr, dimensions_type dimensions) noexcept + { + return set_endpoint(endpoint_t::destination, context_handle, ptr, dimensions); + } + + this_type& set_bytes_offset(endpoint_t endpoint, dimensions_type offset) noexcept; + + // TODO: Perhaps we should have an dimensioned offset type? 
+ template + this_type& set_offset(endpoint_t endpoint, dimensions_type offset) noexcept; + + this_type& clear_offset(endpoint_t endpoint) noexcept + { + return set_bytes_offset(endpoint, dimensions_type::zero()); + } + + this_type& clear_offsets() noexcept + { + clear_offset(endpoint_t::source); + clear_offset(endpoint_t::destination); + return *this; + } + + this_type& set_bytes_pitch(endpoint_t endpoint, dimension_type pitch_in_bytes) noexcept + { + (endpoint == endpoint_t::source ? parent::srcPitch : parent::dstPitch) = pitch_in_bytes; + return *this; + } + + template + this_type& set_pitch(endpoint_t endpoint, dimension_type pitch_in_elements) noexcept + { + return set_bytes_pitch(endpoint, pitch_in_elements * sizeof(T)); + } + + template + this_type& set_pitches(dimension_type uniform_pitch_in_elements) noexcept + { + auto uniform_pitch_in_bytes { uniform_pitch_in_elements * sizeof(T) }; + set_pitch(endpoint_t::source, uniform_pitch_in_bytes); + set_pitch(endpoint_t::destination, uniform_pitch_in_bytes); + return *this; + } + + // Note: Must be called after copy extents have been set + this_type& set_default_pitch(endpoint_t endpoint) noexcept + { + return set_bytes_pitch(endpoint, parent::WidthInBytes); + } + + // Note: Must be called after copy extents have been set + this_type& set_default_pitches() noexcept + { + set_default_pitch(endpoint_t::source); + set_default_pitch(endpoint_t::destination); + return *this; + } + + this_type& set_bytes_extent(dimensions_type extent) noexcept; + + template + this_type& set_extent(dimensions_type extent) noexcept; + // Sets how much is being copies, as opposed to the sizes of the endpoints which may be larger + + dimensions_type bytes_extent() const noexcept; + + template + dimensions_type extent() const noexcept + { + auto extent_ = bytes_extent(); +#ifndef NDEBUG + if (extent_.width % sizeof(T) != 0) { + throw ::std::invalid_argument( + "Attempt to get the copy extent with assumed type of size " + + 
::std::to_string(sizeof(T)) + " while the byte extent's " + + "minor dimension is not a multiple of this size"); + } +#endif + extent_.width /= sizeof(T); + return extent_; + } + + this_type& set_pitches(array::dimension_t uniform_pitch_in_bytes) noexcept + { + set_pitch(endpoint_t::source, uniform_pitch_in_bytes); + set_pitch(endpoint_t::destination, uniform_pitch_in_bytes); + return *this; + } + + this_type& clear_rest() noexcept; + // Clear any dummy fields which are required to be set to 0. Note that important fields, + // which you have not set explicitly, will _not_ be cleared by this method. + +}; + +template<> +inline copy_parameters_t<2>& copy_parameters_t<2>::set_endpoint_untyped( + endpoint_t endpoint, + context::handle_t, + void * ptr, + array::dimensions_t<2> dimensions) +{ + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::array) { + throw ::std::invalid_argument("Attempt to use the non-array endpoint setter with array memory at " + cuda::detail_::ptr_as_hex(ptr)); + } + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) + { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } + else { + // Either memory::type_t::host or memory::type_t::non_cuda + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } + } + set_bytes_pitch(endpoint, dimensions.width); + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) + (memory_type == memory::type_t::non_cuda ? memory::type_t::host_ : memory_type); + // Can't set the endpoint context - the basic data structure doesn't support that! + // (endpoint == endpoint_t::source ? 
srcContext : dstContext) = context_handle; + return *this; +} + +template<> +template +inline copy_parameters_t<2>& copy_parameters_t<2>::set_endpoint( + endpoint_t endpoint, + context::handle_t context_handle, + T *ptr, + array::dimensions_t<2> dimensions) noexcept +{ + array::dimensions_t<2> untyped_dims = {dimensions.width * sizeof(T), dimensions.height}; + return set_endpoint_untyped(endpoint, context_handle, ptr, untyped_dims); +} + +template<> +template +inline copy_parameters_t<2>& copy_parameters_t<2>::set_endpoint( + endpoint_t endpoint, + T *ptr, + array::dimensions_t<2> dimensions) +{ + // We would have _liked_ to say: + // auto context_handle = context::current::detail_::get_handle(); + // ... here, but alas, 2D copy structures don't support contexts, so... + auto context_handle = context::detail_::none; + return set_endpoint(endpoint, context_handle, ptr, dimensions); +} + +template<> +template +copy_parameters_t<2> ©_parameters_t<2>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) noexcept +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = array.device_id(); + // Can't set the endpoint context - the basic data structure doesn't support that! + return *this; +} + +namespace detail_ { + +template<> +inline array::dimensions_t<2> non_array_endpoint_dimensions<2>(endpoint_t endpoint, const copy_parameters_t<2>& params) +{ + using dims_type = copy_parameters_t<2>::dimensions_type; + return (endpoint == endpoint_t::source) ? + dims_type{ params.WidthInBytes, params.Height } : + dims_type{ params.WidthInBytes, params.Height }; +} + +template<> +inline array::dimensions_t<3> non_array_endpoint_dimensions<3>(endpoint_t endpoint, const copy_parameters_t<3>& params) +{ + using dims_type = copy_parameters_t<3>::dimensions_type; + return (endpoint == endpoint_t::source) ? 
+ dims_type{ params.srcPitch, params.Height, params.Depth } : + dims_type{ params.WidthInBytes, params.Height, params.Depth }; +} + +} // + +template<> +template +copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) noexcept +{ + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; + (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); + (endpoint == endpoint_t::source ? srcContext : dstContext) = array.context_handle(); + return *this; +} + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint_untyped( + endpoint_t endpoint, + context::handle_t context_handle, + void * ptr, + array::dimensions_t<3> dimensions) +{ + auto memory_type = memory::type_of(ptr); + if (memory_type == memory::type_t::array) { + throw ::std::invalid_argument("Attempt to use the non-array endpoint setter with array memory at " + cuda::detail_::ptr_as_hex(ptr)); + } + if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) + { + (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); + } + else { + // Either memory::type_t::host or memory::type_t::non_cuda + if (endpoint == endpoint_t::source) { srcHost = ptr; } + else { dstHost = ptr; } + } + set_bytes_pitch(endpoint, dimensions.width); + (endpoint == endpoint_t::source ? srcHeight : dstHeight) = dimensions.height; + (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) + (memory_type == memory::type_t::non_cuda ? memory::type_t::host_ : memory_type); + (endpoint == endpoint_t::source ? srcContext : dstContext) = context_handle; + return *this; +} + +// 2D copy parameters only have an intra-context variant; should we silently assume the context +// is the same for both ends? 
+template<> +inline copy_parameters_t<2>& copy_parameters_t<2>::set_context(endpoint_t endpoint, const context_t& context) noexcept = delete; + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::set_context(endpoint_t endpoint, const context_t& context) noexcept +{ + (endpoint == endpoint_t::source ? srcContext : dstContext) = context.handle(); + return *this; +} + +template<> +template +inline copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint( + endpoint_t endpoint, + context::handle_t context_handle, + T *ptr, + array::dimensions_t<3> dimensions) noexcept +{ + array::dimensions_t<3> untyped_dims = {dimensions.width * sizeof(T), dimensions.height, dimensions.depth}; + return set_endpoint_untyped(endpoint, context_handle, ptr, untyped_dims); +} + +template<> +template +inline copy_parameters_t<3>& copy_parameters_t<3>::set_endpoint( + endpoint_t endpoint, + T *ptr, + array::dimensions_t<3> dimensions) +{ + return set_endpoint(endpoint, context::current::detail_::get_handle(), ptr, dimensions); +} + +template<> +inline copy_parameters_t<2> ©_parameters_t<2>::clear_rest() noexcept +{ + return *this; +} + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::clear_rest() noexcept +{ + srcLOD = 0; + dstLOD = 0; + return *this; +} + +template<> +template +inline copy_parameters_t<2> ©_parameters_t<2>::set_extent(dimensions_type extent) noexcept +{ + WidthInBytes = extent.width * sizeof(T); + Height = extent.height; + return *this; +} + +template<> +inline copy_parameters_t<2>& copy_parameters_t<2>::set_bytes_extent(dimensions_type extent) noexcept +{ + WidthInBytes = extent.width; + Height = extent.height; + return *this; +} + +template<> +inline copy_parameters_t<3>& copy_parameters_t<3>::set_bytes_extent(dimensions_type extent) noexcept +{ + WidthInBytes = extent.width; + Height = extent.height; + Depth = extent.depth; + return *this; +} + +template<> +template +copy_parameters_t<3>& copy_parameters_t<3>::set_extent(dimensions_type 
extent) noexcept +{ + dimensions_type extent_in_bytes{extent.width * sizeof(T), extent.height, extent.depth}; + return set_bytes_extent(extent_in_bytes); +} + +template<> +inline copy_parameters_t<3>& +copy_parameters_t<3>::set_bytes_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width; + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; + (endpoint == endpoint_t::source ? srcZ : dstZ) = offset.depth; + return *this; +} + +template<> +inline copy_parameters_t<2> & +copy_parameters_t<2>::set_bytes_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width; + (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; + return *this; +} + +template<> +template +copy_parameters_t<3>& copy_parameters_t<3>::set_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + dimensions_type offset_in_bytes{offset.width * sizeof(T), offset.height, offset.depth}; + return set_bytes_offset(endpoint, offset_in_bytes); +} + +template<> +template +copy_parameters_t<2> ©_parameters_t<2>::set_offset(endpoint_t endpoint, dimensions_type offset) noexcept +{ + dimensions_type offset_in_bytes{offset.width * sizeof(T), offset.height}; + return set_bytes_offset(endpoint, offset_in_bytes); +} + +copy_parameters_t<3>::intra_context_type +inline as_intra_context_parameters(const copy_parameters_t<3>& params) +{ + if (params.srcDevice != params.dstDevice) { + throw ::std::invalid_argument("Attempt to use inter-device copy parameters for an intra-context copy"); + } + if (params.srcContext != params.dstContext) { + throw ::std::invalid_argument("Attempt to use inter-context copy parameters for an intra-context copy"); + } + + // TODO: Use designated initializers in C++20 + copy_parameters_t<3>::intra_context_type result; + + result.srcXInBytes = params.srcXInBytes; + result.srcY = params.srcY; + 
result.srcZ = params.srcZ; + result.srcLOD = params.srcLOD; + result.srcMemoryType = params.srcMemoryType; + result.srcHost = params.srcHost; + result.srcDevice = params.srcDevice; + result.srcArray = params.srcArray; + result.reserved0 = nullptr; // srcContext + result.srcPitch = params.srcPitch; + result.srcHeight = params.srcHeight; + + result.dstXInBytes = params.dstXInBytes; + result.dstY = params.dstY; + result.dstZ = params.dstZ; + result.dstLOD = params.dstLOD; + result.dstMemoryType = params.dstMemoryType; + result.dstHost = params.dstHost; + result.dstDevice = params.dstDevice; + result.dstArray = params.dstArray; + result.reserved1 = nullptr; + result.dstPitch = params.dstPitch; + result.dstHeight = params.dstHeight; + + result.WidthInBytes = params.WidthInBytes; + result.Height = params.Height; + result.Depth = params.Depth; + return result; +} + +} //namespace memory + +} // namespace cuda + + +#endif //CUDA_API_WRAPPERS_COPY_PARAMETERS_HPP diff --git a/src/cuda/api/memory.hpp b/src/cuda/api/memory.hpp index 9b1dcf14..eb08de0a 100644 --- a/src/cuda/api/memory.hpp +++ b/src/cuda/api/memory.hpp @@ -26,6 +26,7 @@ #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_ #define CUDA_API_WRAPPERS_MEMORY_HPP_ +#include #include #include #include @@ -573,196 +574,28 @@ inline void zero(T* ptr) namespace detail_ { -template -struct base_copy_params; - -template<> -struct base_copy_params<2> { - using intra_context_type = CUDA_MEMCPY2D; - using type = intra_context_type; // Why is there no inter-context type, CUDA_MEMCPY2D_PEER ? -}; - -template<> -struct base_copy_params<3> { - using type = CUDA_MEMCPY3D_PEER; - using intra_context_type = CUDA_MEMCPY3D; -}; - -// Note these, by default, support inter-context -template -using base_copy_params_t = typename base_copy_params::type; - - -enum class endpoint_t { - source, destination -}; - -template -struct copy_parameters_t : base_copy_params_t { - // TODO: Perhaps use proxies? 
- - using intra_context_type = typename base_copy_params::intra_context_type; - - using dimensions_type = array::dimensions_t; - - template - void set_endpoint(endpoint_t endpoint, const cuda::array_t &array); - - template - void set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t dimensions); - - template - void set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, - array::dimensions_t dimensions); - - // TODO: Perhaps we should have an dimensioned offset type? - template - void set_offset(endpoint_t endpoint, dimensions_type offset); - - template - void clear_offset(endpoint_t endpoint) - { set_offset(endpoint, dimensions_type::zero()); } - - template - void set_extent(dimensions_type extent); - // Sets how much is being copies, as opposed to the sizes of the endpoints which may be larger - - void clear_rest(); - // Clear any dummy fields which are required to be set to 0. Note that important fields, - // which you have not set explicitly, will _not_ be cleared by this method. - -}; - -template<> -template -void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) -{ - (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; - (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); - // Can't set the endpoint context - the basic data structure doesn't support that! -} - -template<> -template -void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, const cuda::array_t &array) -{ - (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY; - (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get(); - (endpoint == endpoint_t::source ? 
srcContext : dstContext) = array.context_handle(); -} - -template<> -template -inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, - array::dimensions_t<2> dimensions) -{ - if (context_handle != context::detail_::none) { - throw cuda::runtime_error( - cuda::status::named_t::not_supported, - "Inter-context copying of 2D arrays is not supported by the CUDA driver"); - } - set_endpoint<2>(endpoint, ptr, dimensions); -} - -template<> -template -inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<2> dimensions) -{ - auto memory_type = memory::type_of(ptr); - if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { - (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); - } else { - if (endpoint == endpoint_t::source) { srcHost = ptr; } - else { dstHost = ptr; } - } - (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); - (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; - // Can't set the endpoint context - the basic data structure doesn't support that! -} - -template<> -template -inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr, - array::dimensions_t<3> dimensions) -{ - cuda::memory::pointer_t wrapped{ptr}; - auto memory_type = memory::type_of(ptr); - if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) { - (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr); - } else { - if (endpoint == endpoint_t::source) { srcHost = ptr; } - else { dstHost = ptr; } - } - (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T); - (endpoint == endpoint_t::source ? srcHeight : dstHeight) = dimensions.height; - (endpoint == endpoint_t::source ? 
srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type; - (endpoint == endpoint_t::source ? srcContext : dstContext) = context_handle; -} - -template<> -template -inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<3> dimensions) -{ - set_endpoint(endpoint, context::detail_::none, ptr, dimensions); -} - -template<> -inline void copy_parameters_t<2>::clear_rest() -{} - -template<> -inline void copy_parameters_t<3>::clear_rest() -{ - srcLOD = 0; - dstLOD = 0; -} - -template<> -template -inline void copy_parameters_t<2>::set_extent(dimensions_type extent) -{ - WidthInBytes = extent.width * sizeof(T); - Height = extent.height; -} - -template<> -template -void copy_parameters_t<3>::set_extent(dimensions_type extent) -{ - WidthInBytes = extent.width * sizeof(T); - Height = extent.height; - Depth = extent.depth; -} - -template<> -template -void copy_parameters_t<3>::set_offset(endpoint_t endpoint, dimensions_type offset) -{ - (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); - (endpoint == endpoint_t::source ? srcY : dstY) = offset.height; - (endpoint == endpoint_t::source ? srcZ : dstZ) = offset.depth; -} - -template<> -template -void copy_parameters_t<2>::set_offset(endpoint_t endpoint, dimensions_type offset) -{ - (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T); - (endpoint == endpoint_t::source ? 
srcY : dstY) = offset.height; -} - -void set_endpoint(endpoint_t endpoint, void *src); - inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<2> params) { + // TODO: Move this logic into the scoped ensurer class + auto context_handle = context::current::detail_::get_handle(); + if (context_handle != context::detail_::none) { + return cuMemcpy2D(¶ms); + } + auto current_device_id = cuda::device::current::detail_::get_id(); + context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id); + context::current::detail_::push(context_handle); // Note this _must_ be an intra-context copy, as inter-context is not supported // and there's no indication of context in the relevant data structures - return cuMemcpy2D(¶ms); + auto status = cuMemcpy2D(¶ms); + context::current::detail_::pop(); + cuda::device::primary_context::detail_::decrease_refcount(current_device_id); + return status; } inline status_t multidim_copy(::std::integral_constant, copy_parameters_t<3> params) { if (params.srcContext == params.dstContext) { + context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext}; auto *intra_context_params = reinterpret_cast::intra_context_type *>(¶ms); return cuMemcpy3D(intra_context_params); } @@ -770,14 +603,31 @@ inline status_t multidim_copy(::std::integral_constant, cop } template -status_t multidim_copy(context::handle_t context_handle, copy_parameters_t params) +status_t multidim_copy(copy_parameters_t params) { - context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{context_handle}; return multidim_copy(::std::integral_constant{}, params); } + } // namespace detail_ +/** + * @brief An almost-generalized-case memory copy, taking a rather complex structure of + * copy parameters - wrapping the CUDA driver's own most-generalized-case copy + * + * @tparam NumDimensions The number of dimensions of the parameter structure. 
+ * @param params A parameter structure with details regarding the copy source + * and destination, including CUDA context specifications, which must have been + * set in advance. This function will _not_ verify its validity, but rather + * merely pass it on to the CUDA driver + */ +template +void copy(copy_parameters_t params) +{ + status_t status = detail_::multidim_copy(params); + throw_if_error_lazy(status, "Copying using a general copy parameters structure"); +} + /** * Synchronously copies data from a CUDA array into non-array memory. * * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values * @@ -789,19 +639,34 @@ status_t multidim_copy(context::handle_t context_handle, copy_parameters_t -void copy(const array_t& destination, const T *source) +void copy(const array_t& destination, const context_t& source_context, const T *source) { - detail_::copy_parameters_t params{}; auto dims = destination.dimensions(); - params.template clear_offset(detail_::endpoint_t::source); - params.template clear_offset(detail_::endpoint_t::destination); + auto params = copy_parameters_t {}; + params.clear_offsets(); params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast(source), dims); + params.set_endpoint(endpoint_t::destination, destination); params.clear_rest(); - params.set_endpoint(detail_::endpoint_t::source, const_cast(source), dims); - params.set_endpoint(detail_::endpoint_t::destination, destination); - auto status = detail_::multidim_copy(destination.context_handle(), params); - throw_if_error(status, "Copying from a regular memory region into a CUDA array"); + copy(params); +} + +/** + * Synchronously copies data into a CUDA array from non-array memory. 
+ * + * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values + * @tparam T array element type + * + * @param destination A {@tparam NumDimensions}-dimensional CUDA array + * @param source A pointer to a region of contiguous memory holding `destination.size()` values + * of type @tparam T. The memory may be located either on a CUDA device or in host memory. + */ +template +void copy(const array_t& destination, const T *source) +{ + copy(destination, context_of(source), source); } + /** * Synchronously copies data into a CUDA array from non-array memory. * * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values * @tparam T array element type * * @param destination A {@tparam NumDimensions}-dimensional CUDA array * @param source A {@tparam NumDimensions}-dimensional CUDA array */ template -void copy(T *destination, const array_t& source) +void copy(const context_t& context, T *destination, const array_t& source) { - detail_::copy_parameters_t params{}; auto dims = source.dimensions(); - params.template clear_offset(detail_::endpoint_t::source); - params.template clear_offset(detail_::endpoint_t::destination); - params.template set_extent(source.dimensions()); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source); + params.template set_endpoint(endpoint_t::destination, context.handle(), destination, dims); + params.set_default_pitches(); params.clear_rest(); - params.set_endpoint(detail_::endpoint_t::source, source); - params.template set_endpoint(detail_::endpoint_t::destination, destination, dims); - params.dstPitch = params.srcPitch = dims.width * sizeof(T); - auto status = detail_::multidim_copy(source.context_handle(), params); - throw_if_error(status, "Copying from a CUDA array into a regular memory region"); + copy(params); +} + +/** + * Synchronously copies data from a CUDA array into non-array memory. 
+ * + * @tparam NumDimensions the number of array dimensions; only 2 and 3 are supported values + * @tparam T array element type + * + * @param destination A pointer to a region of contiguous memory holding `source.size()` values + * of type @tparam T. The memory may be located either on a CUDA device or in host memory. + * @param source A {@tparam NumDimensions}-dimensional CUDA array + */ +template +void copy(T *destination, const array_t& source) +{ + copy(context_of(destination), destination, source); } template void copy(const array_t& destination, const array_t& source) { - detail_::copy_parameters_t params{}; auto dims = source.dimensions(); - params.template clear_offset(detail_::endpoint_t::source); - params.template clear_offset(detail_::endpoint_t::destination); - params.template set_extent(source.dimensions()); - params.clear_rest(); - params.set_endpoint(detail_::endpoint_t::source, source); - params.set_endpoint(detail_::endpoint_t::destination, destination); - params.dstPitch = params.srcPitch = dims.width * sizeof(T); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); + params.set_endpoint(endpoint_t::source, source); + params.set_endpoint(endpoint_t::destination, destination); + params.set_default_pitches(); + params.clear_rest(); auto status = //(source.context() == destination.context()) ?
detail_::multidim_copy(source.context_handle(), params); throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region"); @@ -930,7 +810,7 @@ inline void copy(region_t destination, const_region_t source, stream::handle_t s } ///@} -using memory::detail_::copy_parameters_t; +using memory::copy_parameters_t; inline status_t multidim_copy_in_current_context( ::std::integral_constant, @@ -978,17 +858,17 @@ status_t multidim_copy( template void copy(T *destination, const array_t& source, stream::handle_t stream_handle) { - using memory::detail_::endpoint_t; + using memory::endpoint_t; auto dims = source.dimensions(); //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - detail_::copy_parameters_t params{}; - params.template clear_offset(endpoint_t::source); - params.template clear_offset(endpoint_t::destination); + auto params = copy_parameters_t {}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); params.template set_extent(dims); - params.clear_rest(); params.set_endpoint(endpoint_t::source, source); params.set_endpoint(endpoint_t::destination, const_cast(destination), dims); - params.dstPitch = dims.width * sizeof(T); + params.set_default_pitches(); + params.clear_rest(); auto status = multidim_copy_in_current_context(params, stream_handle); throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region"); } @@ -997,17 +877,17 @@ void copy(T *destination, const array_t& source, stream::handl template void copy(const array_t& destination, const T* source, stream::handle_t stream_handle) { - using memory::detail_::endpoint_t; + using memory::endpoint_t; auto dims = destination.dimensions(); //auto params = make_multidim_copy_params(destination, const_cast(source), destination.dimensions()); - detail_::copy_parameters_t params{}; - params.template clear_offset(endpoint_t::source); - params.template 
clear_offset(endpoint_t::destination); - params.template set_extent(destination.dimensions()); - params.srcPitch = dims.width * sizeof(T); - params.clear_rest(); + auto params = copy_parameters_t{}; + params.clear_offset(endpoint_t::source); + params.clear_offset(endpoint_t::destination); + params.template set_extent(dims); params.set_endpoint(endpoint_t::source, const_cast(source), dims); params.set_endpoint(endpoint_t::destination, destination); + params.set_default_pitches(); + params.clear_rest(); auto status = multidim_copy_in_current_context(params, stream_handle); throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array"); } diff --git a/src/cuda/api/multi_wrapper_impls/memory.hpp b/src/cuda/api/multi_wrapper_impls/memory.hpp index 8daa2363..e60dd12c 100644 --- a/src/cuda/api/multi_wrapper_impls/memory.hpp +++ b/src/cuda/api/multi_wrapper_impls/memory.hpp @@ -497,14 +497,23 @@ namespace pointer { namespace detail_ { template -attribute_value_type_t get_attribute(const void *ptr) +status_and_attribute_value get_attribute_with_status(const void *ptr) { context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context; attribute_value_type_t attribute_value; auto status = cuPointerGetAttribute(&attribute_value, attribute, device::address(ptr)); - throw_if_error_lazy(status, "Obtaining attribute " + ::std::to_string((int) attribute) - + " for pointer " + cuda::detail_::ptr_as_hex(ptr) ); - return attribute_value; + return { status, attribute_value }; +} + + +template +attribute_value_type_t get_attribute(const void *ptr) +{ + auto status_and_attribute_value = get_attribute_with_status(ptr); + throw_if_error_lazy(status_and_attribute_value.status, + "Obtaining attribute " + ::std::to_string((int) attribute) + + " for pointer " + cuda::detail_::ptr_as_hex(ptr) ); + return status_and_attribute_value.value; } // TODO: Consider switching to a span with C++20 diff --git a/src/cuda/api/pointer.hpp 
b/src/cuda/api/pointer.hpp index 1c8cebe6..223dbe4d 100644 --- a/src/cuda/api/pointer.hpp +++ b/src/cuda/api/pointer.hpp @@ -36,7 +36,6 @@ class context_t; namespace memory { - /** * @brief see @ref memory::host, @ref memory::device, @ref memory::managed */ @@ -46,6 +45,7 @@ enum type_t : ::std::underlying_type::type { array = CU_MEMORYTYPE_ARRAY, unified_ = CU_MEMORYTYPE_UNIFIED, managed_ = CU_MEMORYTYPE_UNIFIED, // an alias (more like the runtime API name) + non_cuda = ~(::std::underlying_type::type{0}) }; #if CUDA_VERSION >= 11020 @@ -91,6 +91,15 @@ template <> struct attribute_value template using attribute_value_type_t = typename attribute_value::type; +template +struct status_and_attribute_value { + status_t status; + attribute_value_type_t value; +}; + +template +status_and_attribute_value get_attribute_with_status(const void *ptr); + template attribute_value_type_t get_attribute(const void* ptr); @@ -107,7 +116,11 @@ inline cuda::device::id_t device_id_of(const void* ptr); inline memory::type_t type_of(const void* ptr) { - return pointer::detail_::get_attribute(ptr); + auto result = pointer::detail_::get_attribute_with_status(ptr); + // Note: As of CUDA 12, CUDA treats passing a non-CUDA-allocated pointer to the memory type check + // as an error, though it really should not be + return (result.status == status::named_t::invalid_value) ? + memory::type_t::non_cuda : result.value; } inline context_t context_of(const void* ptr);