
Refactor the cuda_memcpy functions to make them more usable #16945

Open · wants to merge 9 commits into branch-24.12
Changes from 5 commits
82 changes: 68 additions & 14 deletions cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -17,41 +17,95 @@
#pragma once

#include <cudf/utilities/export.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace CUDF_EXPORT cudf {
namespace detail {

namespace impl {

enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };

void cuda_memcpy_async(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);

} // namespace impl

/**
* @brief Asynchronously copies data between the host and device.
* @brief Asynchronously copies data from host to device memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param dst Destination device memory
* @param src Source host memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy_async(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
template <typename T>
void cuda_memcpy_async(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
{
auto const is_pinned = src.is_device_accessible();
impl::cuda_memcpy_async(
dst.data(),
src.data(),
std::min(dst.size_bytes(), src.size_bytes()),
Contributor:
This might cause surprises to users if they unintentionally pass spans of different sizes. We could instead throw in this case.

Yes, this should be a runtime check and it should throw. If the caller wants to copy subspans, the caller can create subspans. Spans, as view types, are meant to make this easy.

Contributor Author:

Sure.
Just one thing to keep in mind for this use case:
host_span{my_hv.data(), subsize} is not the same as host_span{my_hv}.subspan(0, subsize) because the first one will not know if it's pointing to pinned memory.
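To illustrate (a minimal sketch, not part of the diff; my_hv, subsize, and stream are placeholders, and my_hv is assumed to be backed by a pinned allocator):

auto my_hv = cudf::detail::make_host_vector<int>(1024, stream);

// Raw pointer + size: the span cannot see the allocator, so it reports
// pageable memory even when the buffer is actually pinned.
auto lossy = cudf::host_span<int const>{my_hv.data(), subsize};
// lossy.is_device_accessible() == false

// Built from the vector, then narrowed: pinnedness survives the subspan.
auto kept = cudf::host_span<int const>{my_hv}.subspan(0, subsize);
// kept.is_device_accessible() == true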

Contributor Author:

done!
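The requested check presumably replaces the std::min reconciliation above; a sketch of its likely shape (the actual change landed in a later commit not shown in this 5-commit view, and the exact message is an assumption):

// Require equal sizes instead of silently copying the smaller extent:
CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy");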

is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
stream);
}

/**
* @brief Synchronously copies data between the host and device.
* @brief Asynchronously copies data from device to host memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination memory address
* @param src Source memory address
* @param size Number of bytes to copy
* @param kind Type of host memory
* @param dst Destination host memory
* @param src Source device memory
* @param stream CUDA stream used for the copy
*/
void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
template <typename T>
void cuda_memcpy_async(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
{
auto const is_pinned = dst.is_device_accessible();
impl::cuda_memcpy_async(
dst.data(),
src.data(),
std::min(dst.size_bytes(), src.size_bytes()),
is_pinned ? impl::host_memory_kind::PINNED : impl::host_memory_kind::PAGEABLE,
stream);
}

/**
* @brief Synchronously copies data from host to device memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination device memory
* @param src Source host memory
* @param stream CUDA stream used for the copy
*/
template <typename T>
void cuda_memcpy(device_span<T> dst, host_span<T const> src, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, stream);
stream.synchronize();
}

/**
* @brief Synchronously copies data from device to host memory.
*
* Implementation may use different strategies depending on the size and type of host data.
*
* @param dst Destination host memory
* @param src Source device memory
* @param stream CUDA stream used for the copy
*/
template <typename T>
void cuda_memcpy(host_span<T> dst, device_span<T const> src, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, stream);
stream.synchronize();
}

} // namespace detail
} // namespace CUDF_EXPORT cudf
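With the new header, call sites shrink to a single typed overload per direction. A minimal usage sketch (n, stream, and the variable names are placeholders; the explicit <int> mirrors the call sites below, since the span conversions do not participate in template argument deduction):

rmm::device_uvector<int> d_vals(n, stream);
auto h_vals = cudf::detail::make_host_vector<int>(n, stream);

// Host-to-device copy: pinned vs. pageable is detected from the span itself,
// so no host_memory_kind argument is needed at the call site.
cudf::detail::cuda_memcpy_async<int>(d_vals, h_vals, stream);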
16 changes: 3 additions & 13 deletions cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -101,12 +101,7 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
rmm::device_async_resource_ref mr)
{
rmm::device_uvector<T> ret(source_data.size(), stream, mr);
auto const is_pinned = source_data.is_device_accessible();
cuda_memcpy_async(ret.data(),
source_data.data(),
source_data.size() * sizeof(T),
is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
stream);
cuda_memcpy_async<T>(ret, source_data, stream);
Contributor:
Oh, yes. This is much better.

return ret;
}

@@ -405,13 +400,8 @@ host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str
template <typename T>
host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
{
auto result = make_host_vector<T>(v.size(), stream);
auto const is_pinned = result.get_allocator().is_device_accessible();
cuda_memcpy_async(result.data(),
v.data(),
v.size() * sizeof(T),
is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
stream);
auto result = make_host_vector<T>(v.size(), stream);
cuda_memcpy_async<T>(result, v, stream);
return result;
}

13 changes: 3 additions & 10 deletions cpp/src/io/json/host_tree_algorithms.cu
@@ -634,11 +634,8 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
is_mixed_type_column[this_col_id] == 1)
column_categories[this_col_id] = NC_STR;
}
cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
column_categories.data(),
column_categories.size() * sizeof(column_categories[0]),
cudf::detail::host_memory_kind::PAGEABLE,
stream);
cudf::detail::cuda_memcpy_async<NodeT>(
d_column_tree.node_categories, column_categories, stream);
}

// ignore all children of columns forced as string
@@ -653,11 +650,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
forced_as_string_column[this_col_id])
column_categories[this_col_id] = NC_STR;
}
cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(),
column_categories.data(),
column_categories.size() * sizeof(column_categories[0]),
cudf::detail::host_memory_kind::PAGEABLE,
stream);
cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, column_categories, stream);

// restore unique_col_ids order
std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
14 changes: 4 additions & 10 deletions cpp/src/io/utilities/hostdevice_vector.hpp
@@ -125,23 +125,17 @@

void host_to_device_async(rmm::cuda_stream_view stream)
{
cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
cuda_memcpy_async<T>(d_data, h_data, stream);
}

void host_to_device_sync(rmm::cuda_stream_view stream)
{
cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}
void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(d_data, h_data, stream); }

void device_to_host_async(rmm::cuda_stream_view stream)
{
cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
cuda_memcpy_async<T>(h_data, d_data, stream);
}

void device_to_host_sync(rmm::cuda_stream_view stream)
{
cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
}
void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy<T>(h_data, d_data, stream); }

/**
* @brief Converts a hostdevice_vector into a hostdevice_span.
11 changes: 2 additions & 9 deletions cpp/src/utilities/cuda_memcpy.cu
@@ -25,7 +25,7 @@

#include <thrust/copy.h>

namespace cudf::detail {
namespace cudf::detail::impl {

namespace {

@@ -73,11 +73,4 @@ void cuda_memcpy_async(
}
}

void cuda_memcpy(
void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
{
cuda_memcpy_async(dst, src, size, kind, stream);
stream.synchronize();
}

} // namespace cudf::detail
} // namespace cudf::detail::impl