From a272fe799a6d7aacb661f64c691a6ad628393ef7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 26 Sep 2024 16:23:21 -0700 Subject: [PATCH 1/6] initial impl --- .../cudf/detail/utilities/cuda_memcpy.hpp | 39 +++++++++++++++++++ cpp/src/io/utilities/hostdevice_vector.hpp | 8 ++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 632d5a732ec..93bcf34cae3 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -53,5 +54,43 @@ void cuda_memcpy_async( void cuda_memcpy( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void copy_to_device_async(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + auto const is_pinned = src.is_device_accessible(); + cuda_memcpy_async(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} + +template +void copy_to_device(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + copy_to_device_async(dst, src, stream); + stream.synchronize(); +} + +template +void copy_from_device_async(host_span dst, + device_span src, + rmm::cuda_stream_view stream) +{ + auto const is_pinned = dst.is_device_accessible(); + cuda_memcpy_async(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} + +template +void copy_from_device(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + copy_from_device_async(dst, src, stream); + stream.synchronize(); +} + } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..53ed60bb812 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,22 +125,22 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + copy_to_device_async(d_data, h_data, stream); } void host_to_device_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + copy_to_device(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + copy_from_device_async(h_data, d_data, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + copy_from_device(h_data, d_data, stream); } /** From c0a2e71507c03dc089b554facca285d46c74ec29 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 26 Sep 2024 22:37:59 -0700 Subject: [PATCH 2/6] rework API --- .../cudf/detail/utilities/cuda_memcpy.hpp | 56 ++++++++----------- .../detail/utilities/vector_factories.hpp | 16 +----- cpp/src/io/json/host_tree_algorithms.cu | 13 +---- cpp/src/io/utilities/hostdevice_vector.hpp | 14 ++--- 4 files changed, 33 insertions(+), 66 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 93bcf34cae3..d5e467d6475 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -24,6 +24,8 @@ namespace CUDF_EXPORT cudf { namespace detail { +namespace impl { + enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; /** @@ -40,55 +42,43 @@ enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; void cuda_memcpy_async( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); -/** - * @brief Synchronously copies data between the host and device. - * - * Implementation may use different strategies depending on the size and type of host data. - * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory - * @param stream CUDA stream used for the copy - */ -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +} // namespace impl template -void copy_to_device_async(device_span dst, host_span src, rmm::cuda_stream_view stream) +void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) { auto const is_pinned = src.is_device_accessible(); - cuda_memcpy_async(dst.data(), - src.data(), - src.size_bytes(), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + impl::cuda_memcpy_async( + dst.data(), + src.data(), + std::min(dst.size_bytes(), src.size_bytes()), + is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE, + stream); } template -void copy_to_device(device_span dst, host_span src, rmm::cuda_stream_view stream) +void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) { - copy_to_device_async(dst, src, stream); - stream.synchronize(); + auto const is_pinned = dst.is_device_accessible(); + impl::cuda_memcpy_async( + dst.data(), + src.data(), + std::min(dst.size_bytes(), src.size_bytes()), + is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE, + stream); } template -void copy_from_device_async(host_span dst, - device_span src, - rmm::cuda_stream_view stream) +void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_view stream) { - auto const is_pinned = dst.is_device_accessible(); - cuda_memcpy_async(dst.data(), - src.data(), - src.size_bytes(), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); } template -void copy_from_device(host_span dst, device_span src, rmm::cuda_stream_view stream) +void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) { - copy_from_device_async(dst, src, stream); + cuda_memcpy_async(dst, src, stream); stream.synchronize(); } diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 953ae5b9308..1f1e7a2db77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -101,12 +101,7 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - auto const is_pinned = source_data.is_device_accessible(); - cuda_memcpy_async(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(ret, source_data, stream); return ret; } @@ -405,13 +400,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = make_host_vector(v.size(), stream); - auto const is_pinned = result.get_allocator().is_device_accessible(); - cuda_memcpy_async(result.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 5855f1b5a5f..f7e8134b68d 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -634,11 +634,8 @@ std::pair, hashmap_of_device_columns> build_tree is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async( + d_column_tree.node_categories, column_categories, stream); } // ignore all children of columns forced as string @@ -653,11 +650,7 @@ std::pair, hashmap_of_device_columns> build_tree forced_as_string_column[this_col_id]) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, column_categories, stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 53ed60bb812..634e6d78ebc 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,23 +125,17 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - copy_to_device_async(d_data, h_data, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - copy_to_device(d_data, h_data, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - copy_from_device_async(h_data, d_data, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - copy_from_device(h_data, d_data, stream); - } + void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. From db97c3da51b9402896a1f55aa0a717ebbaef2edb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 27 Sep 2024 08:14:14 -0700 Subject: [PATCH 3/6] impl fix --- cpp/src/utilities/cuda_memcpy.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..45299fb2e35 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -25,7 +25,7 @@ #include -namespace cudf::detail { +namespace cudf::detail::impl { namespace { @@ -80,4 +80,4 @@ void cuda_memcpy( stream.synchronize(); } -} // namespace cudf::detail +} // namespace cudf::detail::impl From 80047ebd7f6884ede50e0b44fbf8941a3be4579d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 27 Sep 2024 09:47:32 -0700 Subject: [PATCH 4/6] docs --- .../cudf/detail/utilities/cuda_memcpy.hpp | 45 ++++++++++++++----- cpp/src/utilities/cuda_memcpy.cu | 7 --- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index d5e467d6475..46d3f9aa463 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -28,22 +28,20 @@ namespace impl { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; +void cuda_memcpy_async( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + +} // namespace impl + /** - * @brief Asynchronously copies data between the host and device. + * @brief Asynchronously copies data from host to device memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination device memory + * @param src Source host memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy_async( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); - -} // namespace impl - template void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) { @@ -56,6 +54,15 @@ void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_str stream); } +/** + * @brief Asynchronously copies data from device to host memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ template void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) { @@ -68,6 +75,15 @@ void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_str stream); } +/** + * @brief Synchronously copies data from host to device memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination device memory + * @param src Source host memory + * @param stream CUDA stream used for the copy + */ template void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_view stream) { @@ -75,6 +91,15 @@ void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_vi stream.synchronize(); } +/** + * @brief Synchronously copies data from device to host memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ template void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) { diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 45299fb2e35..e8193243229 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail::impl From 6cf40b364614209d41b34d9e37468fb5e941a640 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 27 Sep 2024 13:06:43 -0700 Subject: [PATCH 5/6] throw when mismatched sizes --- cpp/include/cudf/detail/utilities/cuda_memcpy.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 46d3f9aa463..33f4c33d4e0 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -45,11 +45,12 @@ void cuda_memcpy_async( template void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); auto const is_pinned = src.is_device_accessible(); impl::cuda_memcpy_async( dst.data(), src.data(), - std::min(dst.size_bytes(), src.size_bytes()), + src.size_bytes(), is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE, stream); } @@ -66,11 +67,12 @@ void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_str template void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); auto const is_pinned = dst.is_device_accessible(); impl::cuda_memcpy_async( dst.data(), src.data(), - std::min(dst.size_bytes(), src.size_bytes()), + src.size_bytes(), is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE, stream); } From f6a9266cc6787e2ca2d48bb8757594dea421612f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 1 Oct 2024 11:04:44 -0700 Subject: [PATCH 6/6] remove impl namespace --- .../cudf/detail/utilities/cuda_memcpy.hpp | 28 ++++++++----------- cpp/src/utilities/cuda_memcpy.cu | 8 +++--- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 33f4c33d4e0..4f0c52c5954 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -24,15 +24,11 @@ namespace CUDF_EXPORT cudf { namespace detail { -namespace impl { - enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); -} // namespace impl - /** * @brief Asynchronously copies data from host to device memory. * @@ -47,12 +43,11 @@ void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_str { CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); auto const is_pinned = src.is_device_accessible(); - impl::cuda_memcpy_async( - dst.data(), - src.data(), - src.size_bytes(), - is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); } /** @@ -69,12 +64,11 @@ void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_str { CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); auto const is_pinned = dst.is_device_accessible(); - impl::cuda_memcpy_async( - dst.data(), - src.data(), - src.size_bytes(), - is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); } /** diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index e8193243229..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -25,12 +25,12 @@ #include -namespace cudf::detail::impl { +namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,4 +73,4 @@ void cuda_memcpy_async( } } -} // namespace cudf::detail::impl +} // namespace cudf::detail