From a272fe799a6d7aacb661f64c691a6ad628393ef7 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 26 Sep 2024 16:23:21 -0700
Subject: [PATCH 1/6] Add span-based host/device copy helpers

---
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 39 +++++++++++++++++++
 cpp/src/io/utilities/hostdevice_vector.hpp    |  8 ++--
 2 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index 632d5a732ec..93bcf34cae3 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -53,5 +54,43 @@ void cuda_memcpy_async(
 void cuda_memcpy(
   void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
 
+template <typename T>
+void copy_to_device_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  auto const is_pinned = src.is_device_accessible();
+  cuda_memcpy_async(dst.data(),
+                    src.data(),
+                    src.size_bytes(),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
+}
+
+template <typename T>
+void copy_to_device(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+{
+  copy_to_device_async(dst, src, stream);
+  stream.synchronize();
+}
+
+template <typename T>
+void copy_from_device_async(host_span<T> dst,
+                            device_span<T const> src,
+                            rmm::cuda_stream_view stream)
+{
+  auto const is_pinned = dst.is_device_accessible();
+  cuda_memcpy_async(dst.data(),
+                    src.data(),
+                    src.size_bytes(),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
+}
+
+template <typename T>
+void copy_from_device(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+{
+  copy_from_device_async(dst, src, stream);
+  stream.synchronize();
+}
+
 }  // namespace detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index aed745c42dd..53ed60bb812 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -125,22 +125,22 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    copy_to_device_async<T>(d_data, h_data, stream);
   }
 
   void host_to_device_sync(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    copy_to_device<T>(d_data, h_data, stream);
   }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    copy_from_device_async<T>(h_data, d_data, stream);
   }
 
   void device_to_host_sync(rmm::cuda_stream_view stream)
   {
-    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
+    copy_from_device<T>(h_data, d_data, stream);
   }
 
   /**

From c0a2e71507c03dc089b554facca285d46c74ec29 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 26 Sep 2024 22:37:59 -0700
Subject: [PATCH 2/6] rework API

---
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 56 ++++++++-----------
 .../detail/utilities/vector_factories.hpp     | 16 +-----
 cpp/src/io/json/host_tree_algorithms.cu       | 13 +----
 cpp/src/io/utilities/hostdevice_vector.hpp    | 14 ++---
 4 files changed, 33 insertions(+), 66 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index 93bcf34cae3..d5e467d6475 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -24,6 +24,8 @@
 namespace CUDF_EXPORT cudf {
 namespace detail {
 
+namespace impl {
+
 enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
 
 /**
@@ -40,55 +42,43 @@ enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
 void cuda_memcpy_async(
   void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
 
-/**
- * @brief Synchronously copies data between the host and device.
- *
- * Implementation may use different strategies depending on the size and type of host data.
- *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
- * @param stream CUDA stream used for the copy
- */
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+}  // namespace impl
 
 template <typename T>
-void copy_to_device_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
 {
   auto const is_pinned = src.is_device_accessible();
-  cuda_memcpy_async(dst.data(),
-                    src.data(),
-                    src.size_bytes(),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  impl::cuda_memcpy_async(
+    dst.data(),
+    src.data(),
+    std::min(dst.size_bytes(), src.size_bytes()),
+    is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
+    stream);
 }
 
 template <typename T>
-void copy_to_device(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
+void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
 {
-  copy_to_device_async(dst, src, stream);
-  stream.synchronize();
+  auto const is_pinned = dst.is_device_accessible();
+  impl::cuda_memcpy_async(
+    dst.data(),
+    src.data(),
+    std::min(dst.size_bytes(), src.size_bytes()),
+    is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
+    stream);
 }
 
 template <typename T>
-void copy_from_device_async(host_span<T> dst,
-                            device_span<T const> src,
-                            rmm::cuda_stream_view stream)
+void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
 {
-  auto const is_pinned = dst.is_device_accessible();
-  cuda_memcpy_async(dst.data(),
-                    src.data(),
-                    src.size_bytes(),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  cuda_memcpy_async(dst, src, stream);
+  stream.synchronize();
 }
 
 template <typename T>
-void copy_from_device(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
+void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
 {
-  copy_from_device_async(dst, src, stream);
+  cuda_memcpy_async(dst, src, stream);
   stream.synchronize();
 }
 
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 953ae5b9308..1f1e7a2db77 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -101,12 +101,7 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  auto const is_pinned = source_data.is_device_accessible();
-  cuda_memcpy_async(ret.data(),
-                    source_data.data(),
-                    source_data.size() * sizeof(T),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  cuda_memcpy_async<T>(ret, source_data, stream);
   return ret;
 }
 
@@ -405,13 +400,8 @@ host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str
 template <typename T>
 host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  auto result          = make_host_vector<T>(v.size(), stream);
-  auto const is_pinned = result.get_allocator().is_device_accessible();
-  cuda_memcpy_async(result.data(),
-                    v.data(),
-                    v.size() * sizeof(T),
-                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
-                    stream);
+  auto result = make_host_vector<T>(v.size(), stream);
+  cuda_memcpy_async<T>(result, v, stream);
   return result;
 }
 
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index 5855f1b5a5f..f7e8134b68d 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -634,11 +634,8 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
           is_mixed_type_column[this_col_id] == 1)
         column_categories[this_col_id] = NC_STR;
     }
-    cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
-                                    column_categories.data(),
-                                    column_categories.size() * sizeof(column_categories[0]),
-                                    cudf::detail::host_memory_kind::PAGEABLE,
-                                    stream);
+    cudf::detail::cuda_memcpy_async<NodeT>(
+      d_column_tree.node_categories, column_categories, stream);
   }
 
   // ignore all children of columns forced as string
@@ -653,11 +650,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
         forced_as_string_column[this_col_id])
       column_categories[this_col_id] = NC_STR;
   }
-  cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
-                                  column_categories.data(),
-                                  column_categories.size() * sizeof(column_categories[0]),
-                                  cudf::detail::host_memory_kind::PAGEABLE,
-                                  stream);
+  cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, column_categories, stream);
 
   // restore unique_col_ids order
   std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 53ed60bb812..634e6d78ebc 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -125,23 +125,17 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    copy_to_device_async<T>(d_data, h_data, stream);
+    cuda_memcpy_async<T>(d_data, h_data, stream);
   }
 
-  void host_to_device_sync(rmm::cuda_stream_view stream)
-  {
-    copy_to_device<T>(d_data, h_data, stream);
-  }
+  void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(d_data, h_data, stream); }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    copy_from_device_async<T>(h_data, d_data, stream);
+    cuda_memcpy_async<T>(h_data, d_data, stream);
   }
 
-  void device_to_host_sync(rmm::cuda_stream_view stream)
-  {
-    copy_from_device<T>(h_data, d_data, stream);
-  }
+  void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(h_data, d_data, stream); }
 
   /**
    * @brief Converts a hostdevice_vector into a hostdevice_span.

From db97c3da51b9402896a1f55aa0a717ebbaef2edb Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 27 Sep 2024 08:14:14 -0700
Subject: [PATCH 3/6] Move cuda_memcpy definitions into impl namespace

---
 cpp/src/utilities/cuda_memcpy.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
index 0efb881eb3e..45299fb2e35 100644
--- a/cpp/src/utilities/cuda_memcpy.cu
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -25,7 +25,7 @@
 
 #include <thrust/copy.h>
 
-namespace cudf::detail {
+namespace cudf::detail::impl {
 
 namespace {
 
@@ -80,4 +80,4 @@ void cuda_memcpy(
   stream.synchronize();
 }
 
-}  // namespace cudf::detail
+}  // namespace cudf::detail::impl

From 80047ebd7f6884ede50e0b44fbf8941a3be4579d Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 27 Sep 2024 09:47:32 -0700
Subject: [PATCH 4/6] Document span-based cuda_memcpy overloads; drop raw-pointer cuda_memcpy

---
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 45 ++++++++++++++-----
 cpp/src/utilities/cuda_memcpy.cu              |  7 ---
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index d5e467d6475..46d3f9aa463 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -28,22 +28,20 @@ namespace impl {
 
 enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
 
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+}  // namespace impl
+
 /**
- * @brief Asynchronously copies data between the host and device.
+ * @brief Asynchronously copies data from host to device memory.
  *
  * Implementation may use different strategies depending on the size and type of host data.
  *
- * @param dst Destination memory address
- * @param src Source memory address
- * @param size Number of bytes to copy
- * @param kind Type of host memory
+ * @param dst Destination device memory; must have the same size as `src`
+ * @param src Source host memory
  * @param stream CUDA stream used for the copy
  */
-void cuda_memcpy_async(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
-
-}  // namespace impl
-
 template <typename T>
 void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
 {
@@ -56,6 +54,15 @@ void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_str
     stream);
 }
 
+/**
+ * @brief Asynchronously copies data from device to host memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination host memory; must have the same size as `src`
+ * @param src Source device memory
+ * @param stream CUDA stream used for the copy
+ */
 template <typename T>
 void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
 {
@@ -68,6 +75,15 @@ void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_str
     stream);
 }
 
+/**
+ * @brief Synchronously copies data from host to device memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination device memory; must have the same size as `src`
+ * @param src Source host memory
+ * @param stream CUDA stream used for the copy
+ */
 template <typename T>
 void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
 {
@@ -75,6 +91,15 @@ void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_vi
   stream.synchronize();
 }
 
+/**
+ * @brief Synchronously copies data from device to host memory.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination host memory; must have the same size as `src`
+ * @param src Source device memory
+ * @param stream CUDA stream used for the copy
+ */
 template <typename T>
 void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
index 45299fb2e35..e8193243229 100644
--- a/cpp/src/utilities/cuda_memcpy.cu
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -73,11 +73,4 @@ void cuda_memcpy_async(
   }
 }
 
-void cuda_memcpy(
-  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
-{
-  cuda_memcpy_async(dst, src, size, kind, stream);
-  stream.synchronize();
-}
-
 }  // namespace cudf::detail::impl

From 6cf40b364614209d41b34d9e37468fb5e941a640 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 27 Sep 2024 13:06:43 -0700
Subject: [PATCH 5/6] throw when mismatched sizes

---
 cpp/include/cudf/detail/utilities/cuda_memcpy.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index 46d3f9aa463..33f4c33d4e0 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -45,11 +45,12 @@ void cuda_memcpy_async(
 template <typename T>
 void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
 {
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
   auto const is_pinned = src.is_device_accessible();
   impl::cuda_memcpy_async(
     dst.data(),
     src.data(),
-    std::min(dst.size_bytes(), src.size_bytes()),
+    src.size_bytes(),
     is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
     stream);
 }
@@ -66,11 +67,12 @@ void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_str
 template <typename T>
 void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
 {
+  CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
   auto const is_pinned = dst.is_device_accessible();
   impl::cuda_memcpy_async(
     dst.data(),
     src.data(),
-    std::min(dst.size_bytes(), src.size_bytes()),
+    src.size_bytes(),
     is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
     stream);
 }

From f6a9266cc6787e2ca2d48bb8757594dea421612f Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 1 Oct 2024 11:04:44 -0700
Subject: [PATCH 6/6] remove impl namespace

---
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 28 ++++++++-----------
 cpp/src/utilities/cuda_memcpy.cu              |  8 +++---
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index 33f4c33d4e0..4f0c52c5954 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -24,15 +24,11 @@
 namespace CUDF_EXPORT cudf {
 namespace detail {
 
-namespace impl {
-
 enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
 
-void cuda_memcpy_async(
+void cuda_memcpy_async_impl(
   void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
 
-}  // namespace impl
-
 /**
  * @brief Asynchronously copies data from host to device memory.
  *
@@ -47,12 +43,11 @@ void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_str
 {
   CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
   auto const is_pinned = src.is_device_accessible();
-  impl::cuda_memcpy_async(
-    dst.data(),
-    src.data(),
-    src.size_bytes(),
-    is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
-    stream);
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
 }
 
 /**
@@ -69,12 +64,11 @@ void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_str
 {
   CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async");
   auto const is_pinned = dst.is_device_accessible();
-  impl::cuda_memcpy_async(
-    dst.data(),
-    src.data(),
-    src.size_bytes(),
-    is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
-    stream);
+  cuda_memcpy_async_impl(dst.data(),
+                         src.data(),
+                         src.size_bytes(),
+                         is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                         stream);
 }
 
 /**
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
index e8193243229..c0af27a1748 100644
--- a/cpp/src/utilities/cuda_memcpy.cu
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -25,12 +25,12 @@
 
 #include <thrust/copy.h>
 
-namespace cudf::detail::impl {
+namespace cudf::detail {
 
 namespace {
 
 // Simple kernel to copy between device buffers
-CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n)
+CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n)
 {
   auto const idx = cudf::detail::grid_1d::global_thread_id();
   if (idx < n) { dst[idx] = src[idx]; }
@@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea
 
 };  // namespace
 
-void cuda_memcpy_async(
+void cuda_memcpy_async_impl(
   void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
 {
   if (kind == host_memory_kind::PINNED) {
@@ -73,4 +73,4 @@ void cuda_memcpy_async(
   }
 }
 
-}  // namespace cudf::detail::impl
+}  // namespace cudf::detail