Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor the cuda_memcpy functions to make them more usable #16945

Open
wants to merge 9 commits into
base: branch-24.12
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 64 additions & 14 deletions cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <cudf/utilities/export.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

Expand All @@ -25,33 +26,82 @@ namespace detail {

enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };

void cuda_memcpy_async_impl(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);

/**
* @brief Asynchronously copies data between the host and device.
* @brief Asynchronously copies data from host to device memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param dst Destination device memory
* @param src Source host memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy_async(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
template <typename T>
void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
{
  // Both spans must describe the same number of elements before copying.
  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
  // A device-accessible host span indicates pinned allocation; the impl picks
  // the copy strategy (memcpy vs. kernel) based on this and the size.
  auto const kind =
    src.is_device_accessible() ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE;
  cuda_memcpy_async_impl(dst.data(), src.data(), src.size_bytes(), kind, stream);
}

/**
* @brief Synchronously copies data between the host and device.
* @brief Asynchronously copies data from device to host memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param dst Destination host memory
* @param src Source device memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
template <typename T>
void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
{
  // Element counts must agree between source and destination.
  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
  // Destination host memory that is device-accessible is pinned; this selects
  // the transfer strategy inside the implementation.
  auto const kind =
    dst.is_device_accessible() ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE;
  cuda_memcpy_async_impl(dst.data(), src.data(), src.size_bytes(), kind, stream);
}

/**
* @brief Synchronously copies data from host to device memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination device memory
* @param src Source host memory
* @param stream CUDA stream used for the copy
*/
template <typename T>
void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
{
  // Issue the asynchronous copy, then block until the stream has drained it.
  cuda_memcpy_async<T>(dst, src, stream);
  stream.synchronize();
}

/**
* @brief Synchronously copies data from device to host memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination host memory
* @param src Source device memory
* @param stream CUDA stream used for the copy
*/
template <typename T>
void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
{
  // Enqueue the async device-to-host copy and wait for stream completion so
  // the host buffer is safe to read on return.
  cuda_memcpy_async<T>(dst, src, stream);
  stream.synchronize();
}

} // namespace detail
} // namespace CUDF_EXPORT cudf
16 changes: 3 additions & 13 deletions cpp/include/cudf/detail/utilities/vector_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,7 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
rmm::device_async_resource_ref mr)
{
rmm::device_uvector<T> ret(source_data.size(), stream, mr);
auto const is_pinned = source_data.is_device_accessible();
cuda_memcpy_async(ret.data(),
source_data.data(),
source_data.size() * sizeof(T),
is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
stream);
cuda_memcpy_async<T>(ret, source_data, stream);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, yes. This is much better.

return ret;
}

Expand Down Expand Up @@ -405,13 +400,8 @@ host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str
template <typename T>
host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
{
auto result = make_host_vector<T>(v.size(), stream);
auto const is_pinned = result.get_allocator().is_device_accessible();
cuda_memcpy_async(result.data(),
v.data(),
v.size() * sizeof(T),
is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
stream);
auto result = make_host_vector<T>(v.size(), stream);
cuda_memcpy_async<T>(result, v, stream);
return result;
}

Expand Down
13 changes: 3 additions & 10 deletions cpp/src/io/json/host_tree_algorithms.cu
Original file line number Diff line number Diff line change
Expand Up @@ -634,11 +634,8 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
is_mixed_type_column[this_col_id] == 1)
column_categories[this_col_id] = NC_STR;
}
cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
column_categories.data(),
column_categories.size() * sizeof(column_categories[0]),
cudf::detail::host_memory_kind::PAGEABLE,
stream);
cudf::detail::cuda_memcpy_async<NodeT>(
d_column_tree.node_categories, column_categories, stream);
}

// ignore all children of columns forced as string
Expand All @@ -653,11 +650,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
forced_as_string_column[this_col_id])
column_categories[this_col_id] = NC_STR;
}
cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
column_categories.data(),
column_categories.size() * sizeof(column_categories[0]),
cudf::detail::host_memory_kind::PAGEABLE,
stream);
cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, column_categories, stream);

// restore unique_col_ids order
std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
Expand Down
14 changes: 4 additions & 10 deletions cpp/src/io/utilities/hostdevice_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,23 +125,17 @@ class hostdevice_vector {

void host_to_device_async(rmm::cuda_stream_view stream)
{
cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
cuda_memcpy_async<T>(d_data, h_data, stream);
}

void host_to_device_sync(rmm::cuda_stream_view stream)
{
cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}
void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(d_data, h_data, stream); }

void device_to_host_async(rmm::cuda_stream_view stream)
{
cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
cuda_memcpy_async<T>(h_data, d_data, stream);
}

void device_to_host_sync(rmm::cuda_stream_view stream)
{
cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}
void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(h_data, d_data, stream); }

/**
* @brief Converts a hostdevice_vector into a hostdevice_span.
Expand Down
11 changes: 2 additions & 9 deletions cpp/src/utilities/cuda_memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace cudf::detail {
namespace {

// Simple kernel to copy between device buffers
CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n)
CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n)
{
auto const idx = cudf::detail::grid_1d::global_thread_id();
if (idx < n) { dst[idx] = src[idx]; }
Expand Down Expand Up @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea

}; // namespace

void cuda_memcpy_async(
void cuda_memcpy_async_impl(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
{
if (kind == host_memory_kind::PINNED) {
Expand All @@ -73,11 +73,4 @@ void cuda_memcpy_async(
}
}

void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, size, kind, stream);
stream.synchronize();
}

} // namespace cudf::detail
Loading